Coverage for python/lsst/pipe/base/graphBuilder.py: 16%
460 statements
coverage.py v6.5.0, created at 2022-11-11 02:40 -0800
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
38from lsst.daf.butler import (
39 CollectionType,
40 DataCoordinate,
41 DatasetIdGenEnum,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 Quantum,
50 Registry,
51)
52from lsst.daf.butler.registry import MissingDatasetTypeError
53from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
54from lsst.daf.butler.registry.wildcards import CollectionWildcard
55from lsst.utils import doImportType
57from ._datasetQueryConstraints import DatasetQueryConstraintVariant
58from ._status import NoWorkFound
60# -----------------------------
61# Imports for other modules --
62# -----------------------------
63from .connections import AdjustQuantumHelper, iterConnections
64from .graph import QuantumGraph
65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
67# ----------------------------------
68# Local non-exported definitions --
69# ----------------------------------
71_LOG = logging.getLogger(__name__)
74class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
75 """A custom dictionary that maps `DatasetType` to a nested dictionary of
76 the known `DatasetRef` instances of that type.
78 Parameters
79 ----------
80 args
81 Positional arguments are forwarded to the `dict` constructor.
82 universe : `DimensionUniverse`
83 Universe of all possible dimensions.
84 """
86 def __init__(self, *args: Any, universe: DimensionUniverse):
87 super().__init__(*args)
88 self.universe = universe
90 @classmethod
91 def fromDatasetTypes(
92 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
93 ) -> _DatasetDict:
94 """Construct a dictionary from a flat iterable of `DatasetType` keys.
96 Parameters
97 ----------
98 datasetTypes : `iterable` of `DatasetType`
99 DatasetTypes to use as keys for the dict. Values will be empty
100 dictionaries.
101 universe : `DimensionUniverse`
102 Universe of all possible dimensions.
104 Returns
105 -------
106 dictionary : `_DatasetDict`
107 A new `_DatasetDict` instance.
108 """
109 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
111 @classmethod
112 def fromSubset(
113 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
114 ) -> _DatasetDict:
115 """Return a new dictionary by extracting items corresponding to the
116 given keys from one or more existing dictionaries.
118 Parameters
119 ----------
120 datasetTypes : `iterable` of `DatasetType`
121 DatasetTypes to use as keys for the dict. Values will be obtained
122 by lookups against ``first`` and ``rest``.
123 first : `_DatasetDict`
124 Another dictionary from which to extract values.
125 rest
126 Additional dictionaries from which to extract values.
128 Returns
129 -------
130 dictionary : `_DatasetDict`
131 A new dictionary instance.
132 """
133 combined = ChainMap(first, *rest)
135 # Dataset types known to match immediately can be processed
136 # without checks.
137 matches = combined.keys() & set(datasetTypes)
138 _dict = {k: combined[k] for k in matches}
140 if len(_dict) < len(datasetTypes):
141 # Work out which ones are missing.
142 missing_datasetTypes = set(datasetTypes) - _dict.keys()
144 # Get the known names for comparison.
145 combined_by_name = {k.name: k for k in combined}
147 missing = set()
148 incompatible = {}
149 for datasetType in missing_datasetTypes:
150 # The dataset type is not found. It may not be listed
151 # or it may be that it is there with the same name
152 # but different definition.
153 if datasetType.name in combined_by_name:
154 # This implies some inconsistency in definitions
155 # for connections. If there is support for storage
156 # class conversion we can let it slide.
157 # At this point we do not know
158 # where the inconsistency is but trust that down
159 # stream code will be more explicit about input
160 # vs output incompatibilities.
161 existing = combined_by_name[datasetType.name]
162 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
163 _LOG.warning(
164 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
165 datasetType,
166 existing,
167 )
168 _dict[datasetType] = combined[existing]
169 else:
170 incompatible[datasetType] = existing
171 else:
172 missing.add(datasetType)
174 if missing or incompatible:
175 reasons = []
176 if missing:
177 reasons.append(
178 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
179 + ", ".join(d.name for d in combined)
180 )
181 if incompatible:
182 for x, y in incompatible.items():
183 reasons.append(f"{x} incompatible with {y}")
184 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
186 return cls(_dict, universe=first.universe)
188 @property
189 def dimensions(self) -> DimensionGraph:
190 """The union of all dimensions used by all dataset types in this
191 dictionary, including implied dependencies (`DimensionGraph`).
192 """
193 base = self.universe.empty
194 if len(self) == 0:
195 return base
196 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
198 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
199 """Unpack nested single-element `DatasetRef` dicts into a new
200 mapping with `DatasetType` keys and `DatasetRef` values.
202 This method assumes that each nested dictionary contains exactly one item, as is the
203 case for all "init" datasets.
205 Returns
206 -------
207 dictionary : `NamedKeyDict`
208 Dictionary mapping `DatasetType` to `DatasetRef`, with both
209 `DatasetType` instances and string names usable as keys.
210 """
212 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
213 (ref,) = refs.values()
214 return ref
216 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
218 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
219 """Unpack nested multi-element `DatasetRef` dicts into a new
220 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
222 Returns
223 -------
224 dictionary : `NamedKeyDict`
225 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
226 both `DatasetType` instances and string names usable as keys.
227 """
228 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
230 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
231 """Iterate over the contained `DatasetRef` instances that match the
232 given `DatasetType` and data IDs.
234 Parameters
235 ----------
236 datasetType : `DatasetType`
237 Dataset type to match.
238 dataIds : `Iterable` [ `DataCoordinate` ]
239 Data IDs to match.
241 Returns
242 -------
243 refs : `Iterator` [ `DatasetRef` ]
244 DatasetRef instances for which ``ref.datasetType == datasetType``
245 and ``ref.dataId`` is in ``dataIds``.
246 """
247 refs = self[datasetType]
248 return (refs[dataId] for dataId in dataIds)
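# Illustrative sketch, not part of the public API: how the _DatasetDict helpers
# above fit together. The helper name is illustrative only, and it assumes every
# ref's dataset type appears in ``dataset_types``; refs and the dimension
# universe are supplied by the caller.
def _example_dataset_dict_usage(
    dataset_types: Iterable[DatasetType],
    refs: Iterable[DatasetRef],
    universe: DimensionUniverse,
) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
    # Start with empty nested dictionaries keyed by dataset type.
    holder = _DatasetDict.fromDatasetTypes(dataset_types, universe=universe)
    for ref in refs:
        # Nested dictionaries are keyed by data ID, exactly as
        # _PipelineScaffolding.connectDataIds populates them below.
        holder[ref.datasetType][ref.dataId] = ref
    # Flatten to the mapping shape that Quantum construction expects.
    return holder.unpackMultiRefs()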
251class _QuantumScaffolding:
252 """Helper class aggregating information about a `Quantum`, used when
253 constructing a `QuantumGraph`.
255 See `_PipelineScaffolding` for a top-down description of the full
256 scaffolding data structure.
258 Parameters
259 ----------
260 task : _TaskScaffolding
261 Back-reference to the helper object for the `PipelineTask` this quantum
262 represents an execution of.
263 dataId : `DataCoordinate`
264 Data ID for this quantum.
265 """
267 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
268 self.task = task
269 self.dataId = dataId
270 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
271 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
272 self.prerequisites = _DatasetDict.fromDatasetTypes(
273 task.prerequisites.keys(), universe=dataId.universe
274 )
276 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
278 def __repr__(self) -> str:
279 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
281 task: _TaskScaffolding
282 """Back-reference to the helper object for the `PipelineTask` this quantum
283 represents an execution of.
284 """
286 dataId: DataCoordinate
287 """Data ID for this quantum.
288 """
290 inputs: _DatasetDict
291 """Nested dictionary containing `DatasetRef` inputs to this quantum.
293 This is initialized to map each `DatasetType` to an empty dictionary at
294 construction. Those nested dictionaries are populated (with data IDs as
295 keys) with unresolved `DatasetRef` instances in
296 `_PipelineScaffolding.connectDataIds`.
297 """
299 outputs: _DatasetDict
300 """Nested dictionary containing `DatasetRef` outputs this quantum.
301 """
303 prerequisites: _DatasetDict
304 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
305 quantum.
306 """
308 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
309 """Transform the scaffolding object into a true `Quantum` instance.
311 Parameters
312 ----------
313 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
314 If not `None` then fill datastore records in each generated Quantum
315 using the records from this structure.
317 Returns
318 -------
319 quantum : `Quantum`
320 An actual `Quantum` instance.
321 """
322 allInputs = self.inputs.unpackMultiRefs()
323 allInputs.update(self.prerequisites.unpackMultiRefs())
324 # Give the task's Connections class an opportunity to remove some
325 # inputs, or complain if they are unacceptable.
326 # This will raise if one of the check conditions is not met, which is
327 # the intended behavior.
328 # If it raises NoWorkFound, there is a bug in the QG algorithm
329 # or the adjustQuantum is incorrectly trying to make a prerequisite
330 # input behave like a regular input; adjustQuantum should only raise
331 # NoWorkFound if a regular input is missing, and it shouldn't be
332 # possible for us to have generated ``self`` if that's true.
333 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
334 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
335 initInputs = self.task.initInputs.unpackSingleRefs()
336 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
337 if datastore_records is not None:
338 quantum_records = {}
339 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
340 input_refs += list(initInputs.values())
341 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
342 for datastore_name, records in datastore_records.items():
343 matching_records = records.subset(input_ids)
344 if matching_records is not None:
345 quantum_records[datastore_name] = matching_records
346 return Quantum(
347 taskName=self.task.taskDef.taskName,
348 taskClass=self.task.taskDef.taskClass,
349 dataId=self.dataId,
350 initInputs=initInputs,
351 inputs=helper.inputs,
352 outputs=helper.outputs,
353 datastore_records=quantum_records,
354 )
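# Illustrative sketch of the record-narrowing step used in makeQuantum above:
# given per-datastore record data and the set of dataset IDs that are inputs to
# one quantum, keep only the matching records. The helper name is illustrative
# only; argument types follow the usage above.
def _example_subset_datastore_records(
    datastore_records: Mapping[str, DatastoreRecordData],
    input_ids: Set[Any],
) -> Dict[str, DatastoreRecordData]:
    quantum_records: Dict[str, DatastoreRecordData] = {}
    for datastore_name, records in datastore_records.items():
        matching_records = records.subset(input_ids)
        # subset() may return None (e.g. nothing matches), in which case this
        # datastore is skipped entirely.
        if matching_records is not None:
            quantum_records[datastore_name] = matching_records
    return quantum_records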
357@dataclass
358class _TaskScaffolding:
359 """Helper class aggregating information about a `PipelineTask`, used when
360 constructing a `QuantumGraph`.
362 See `_PipelineScaffolding` for a top-down description of the full
363 scaffolding data structure.
365 Parameters
366 ----------
367 taskDef : `TaskDef`
368 Data structure that identifies the task class and its config.
369 parent : `_PipelineScaffolding`
370 The parent data structure that will hold the instance being
371 constructed.
372 datasetTypes : `TaskDatasetTypes`
373 Data structure that categorizes the dataset types used by this task.
374 """
376 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
377 universe = parent.dimensions.universe
378 self.taskDef = taskDef
379 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
380 assert self.dimensions.issubset(parent.dimensions)
381 # Initialize _DatasetDicts as subsets of the one or two
382 # corresponding dicts in the parent _PipelineScaffolding.
383 self.initInputs = _DatasetDict.fromSubset(
384 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
385 )
386 self.initOutputs = _DatasetDict.fromSubset(
387 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
388 )
389 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
390 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
391 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
392 self.dataIds: Set[DataCoordinate] = set()
393 self.quanta = {}
395 def __repr__(self) -> str:
396 # Default dataclass-injected __repr__ gets caught in an infinite loop
397 # because of back-references.
398 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
400 taskDef: TaskDef
401 """Data structure that identifies the task class and its config
402 (`TaskDef`).
403 """
405 dimensions: DimensionGraph
406 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
407 """
409 initInputs: _DatasetDict
410 """Dictionary containing information about datasets used to construct this
411 task (`_DatasetDict`).
412 """
414 initOutputs: _DatasetDict
415 """Dictionary containing information about datasets produced as a
416 side-effect of constructing this task (`_DatasetDict`).
417 """
419 inputs: _DatasetDict
420 """Dictionary containing information about datasets used as regular,
421 graph-constraining inputs to this task (`_DatasetDict`).
422 """
424 outputs: _DatasetDict
425 """Dictionary containing information about datasets produced by this task
426 (`_DatasetDict`).
427 """
429 prerequisites: _DatasetDict
430 """Dictionary containing information about input datasets that must be
431 present in the repository before any Pipeline containing this task is run
432 (`_DatasetDict`).
433 """
435 quanta: Dict[DataCoordinate, _QuantumScaffolding]
436 """Dictionary mapping data ID to a scaffolding object for the Quantum of
437 this task with that data ID.
438 """
440 def makeQuantumSet(
441 self,
442 unresolvedRefs: Optional[Set[DatasetRef]] = None,
443 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
444 ) -> Set[Quantum]:
445 """Create a `set` of `Quantum` from the information in ``self``.
447 Parameters
448 ----------
449 unresolvedRefs : `set` [ `DatasetRef` ], optional
450 Input dataset refs that have not been found.
451 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
 If not `None` then fill datastore records in each generated Quantum
 using the records from this structure.
454 Returns
455 -------
456 nodes : `set` of `Quantum`
457 The `Quantum` elements corresponding to this task.
458 """
459 if unresolvedRefs is None:
460 unresolvedRefs = set()
461 outputs = set()
462 for q in self.quanta.values():
463 try:
464 tmpQuanta = q.makeQuantum(datastore_records)
465 outputs.add(tmpQuanta)
466 except (NoWorkFound, FileNotFoundError) as exc:
467 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
468 if unresolvedRefs.intersection(refs):
469 # This means the node is known to be pruned later and
470 # should be left in even though some follow-up queries
471 # fail. This allows the pruning to start from this
472 # quantum with known issues and propagate to the other
473 # nodes it touches.
474 inputs = q.inputs.unpackMultiRefs()
475 inputs.update(q.prerequisites.unpackMultiRefs())
476 tmpQuantum = Quantum(
477 taskName=q.task.taskDef.taskName,
478 taskClass=q.task.taskDef.taskClass,
479 dataId=q.dataId,
480 initInputs=q.task.initInputs.unpackSingleRefs(),
481 inputs=inputs,
482 outputs=q.outputs.unpackMultiRefs(),
483 )
484 outputs.add(tmpQuantum)
485 else:
486 raise exc
487 return outputs
490class _DatasetIdMaker:
491 """Helper class which generates random dataset UUIDs for unresolved
492 datasets.
493 """
495 def __init__(self, registry: Registry, run: str):
496 self.datasetIdFactory = registry.datasetIdFactory
497 self.run = run
498 # Dataset IDs generated so far
499 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
501 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
502 if ref.id is not None:
503 return ref
504 key = ref.datasetType, ref.dataId
505 if (resolved := self.resolved.get(key)) is None:
506 datasetId = self.datasetIdFactory.makeDatasetId(
507 self.run, ref.datasetType, ref.dataId, DatasetIdGenEnum.UNIQUE
508 )
509 resolved = ref.resolved(datasetId, self.run)
510 self.resolved[key] = resolved
511 return resolved
513 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
514 """Resolve all unresolved references in the provided dictionary."""
515 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
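# Illustrative sketch of the caching behaviour of _DatasetIdMaker above:
# resolving the same ref twice yields the same dataset ID because the
# (dataset type, data ID) key is cached. The helper name is illustrative only;
# ``registry``, ``run`` and ``ref`` are assumed to be supplied by the caller.
def _example_stable_resolution(registry: Registry, run: str, ref: DatasetRef) -> bool:
    maker = _DatasetIdMaker(registry, run)
    first = maker.resolveRef(ref)
    # The second call hits the cache entry created by the first call.
    second = maker.resolveRef(ref)
    return first.id == second.id  # always True for the same ref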
518@dataclass
519class _PipelineScaffolding:
520 """A helper data structure that organizes the information involved in
521 constructing a `QuantumGraph` for a `Pipeline`.
523 Parameters
524 ----------
525 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
526 Sequence of tasks from which a graph is to be constructed. Must
527 have nested task classes already imported.
528 universe : `DimensionUniverse`
529 Universe of all possible dimensions.
531 Notes
532 -----
533 The scaffolding data structure contains nested data structures for both
534 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
535 data structures are shared between the pipeline-level structure (which
536 aggregates all datasets and categorizes them from the perspective of the
537 complete pipeline) and the individual tasks that use them as inputs and
538 outputs.
540 `QuantumGraph` construction proceeds in four steps, with each corresponding
541 to a different `_PipelineScaffolding` method:
543 1. When `_PipelineScaffolding` is constructed, we extract and categorize
544 the DatasetTypes used by the pipeline (delegating to
545 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
546 nested `_TaskScaffolding` and `_DatasetDict` objects.
548 2. In `connectDataIds`, we construct and run the "Big Join Query", which
549 returns related tuples of all dimensions used to identify any regular
550 input, output, and intermediate datasets (not prerequisites). We then
551 iterate over these tuples of related dimensions, identifying the subsets
552 that correspond to distinct data IDs for each task and dataset type,
553 and then create `_QuantumScaffolding` objects.
555 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
556 dataset data IDs previously identified, transforming unresolved
557 DatasetRefs into resolved DatasetRefs where appropriate. We then look
558 up prerequisite datasets for all quanta.
560 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
561 per-task `_QuantumScaffolding` objects.
562 """
564 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
565 _LOG.debug("Initializing data structures for QuantumGraph generation.")
566 self.tasks = []
567 # Aggregate and categorize the DatasetTypes in the Pipeline.
568 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
569 # Construct dictionaries that map those DatasetTypes to structures
570 # that will (later) hold additional information about them.
571 for attr in (
572 "initInputs",
573 "initIntermediates",
574 "initOutputs",
575 "inputs",
576 "intermediates",
577 "outputs",
578 "prerequisites",
579 ):
580 setattr(
581 self,
582 attr,
583 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
584 )
585 # Aggregate all dimensions for all non-init, non-prerequisite
586 # DatasetTypes. These are the ones we'll include in the big join
587 # query.
588 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
589 # Construct scaffolding nodes for each Task, and add backreferences
590 # to the Task from each DatasetScaffolding node.
591 # Note that there's only one scaffolding node for each DatasetType,
592 # shared by _PipelineScaffolding and all _TaskScaffoldings that
593 # reference it.
594 if isinstance(pipeline, Pipeline):
595 pipeline = pipeline.toExpandedPipeline()
596 self.tasks = [
597 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
598 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
599 ]
601 def __repr__(self) -> str:
602 # Default dataclass-injected __repr__ gets caught in an infinite loop
603 # because of back-references.
604 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
606 tasks: List[_TaskScaffolding]
607 """Scaffolding data structures for each task in the pipeline
608 (`list` of `_TaskScaffolding`).
609 """
611 initInputs: _DatasetDict
612 """Datasets consumed but not produced when constructing the tasks in this
613 pipeline (`_DatasetDict`).
614 """
616 initIntermediates: _DatasetDict
617 """Datasets that are both consumed and produced when constructing the tasks
618 in this pipeline (`_DatasetDict`).
619 """
621 initOutputs: _DatasetDict
622 """Datasets produced but not consumed when constructing the tasks in this
623 pipeline (`_DatasetDict`).
624 """
626 inputs: _DatasetDict
627 """Datasets that are consumed but not produced when running this pipeline
628 (`_DatasetDict`).
629 """
631 intermediates: _DatasetDict
632 """Datasets that are both produced and consumed when running this pipeline
633 (`_DatasetDict`).
634 """
636 outputs: _DatasetDict
637 """Datasets produced but not consumed when when running this pipeline
638 (`_DatasetDict`).
639 """
641 prerequisites: _DatasetDict
642 """Datasets that are consumed when running this pipeline and looked up
643 per-Quantum when generating the graph (`_DatasetDict`).
644 """
646 dimensions: DimensionGraph
647 """All dimensions used by any regular input, intermediate, or output
648 (not prerequisite) dataset; the set of dimensions used in the "Big Join
649 Query" (`DimensionGraph`).
651 This is required to be a superset of all task quantum dimensions.
652 """
654 @contextmanager
655 def connectDataIds(
656 self,
657 registry: Registry,
658 collections: Any,
659 userQuery: Optional[str],
660 externalDataId: DataCoordinate,
661 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
662 bind: Optional[Mapping[str, Any]] = None,
663 ) -> Iterator[DataCoordinateQueryResults]:
664 """Query for the data IDs that connect nodes in the `QuantumGraph`.
666 This method populates `_TaskScaffolding.dataIds` and
667 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
669 Parameters
670 ----------
671 registry : `lsst.daf.butler.Registry`
672 Registry for the data repository; used for all data ID queries.
673 collections
674 Expressions representing the collections to search for input
675 datasets. See :ref:`daf_butler_ordered_collection_searches`.
676 userQuery : `str` or `None`
677 User-provided expression to limit the data IDs processed.
678 externalDataId : `DataCoordinate`
679 Externally-provided data ID that should be used to restrict the
680 results, just as if these constraints had been included via ``AND``
681 in ``userQuery``. This includes (at least) any instrument named
682 in the pipeline definition.
683 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
684 The query constraint variant that should be used to constrain the
685 query based on dataset existence, defaults to
686 `DatasetQueryConstraintVariant.ALL`.
687 bind : `Mapping`, optional
688 Mapping containing literal values that should be injected into the
689 ``userQuery`` expression, keyed by the identifiers they replace.
691 Returns
692 -------
693 commonDataIds : \
694 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
695 An interface to a database temporary table containing all data IDs
696 that will appear in this `QuantumGraph`. Returned inside a
697 context manager, which will drop the temporary table at the end of
698 the `with` block in which this method is called.
699 """
700 _LOG.debug("Building query for data IDs.")
701 # Initialization datasets always have empty data IDs.
702 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
703 for datasetType, refs in itertools.chain(
704 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
705 ):
706 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
707 # Run one big query for the data IDs for task dimensions and regular
708 # inputs and outputs. We limit the query to only dimensions that are
709 # associated with the input dataset types, but don't (yet) try to
710 # obtain the dataset_ids for those inputs.
711 _LOG.debug("Submitting data ID query and materializing results.")
712 queryArgs: Dict[str, Any] = {
713 "dimensions": self.dimensions,
714 "where": userQuery,
715 "dataId": externalDataId,
716 "bind": bind,
717 }
718 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
719 _LOG.debug("Constraining graph query using all datasets in pipeline.")
720 queryArgs["datasets"] = list(self.inputs)
721 queryArgs["collections"] = collections
722 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
723 _LOG.debug("Not using dataset existence to constrain query.")
724 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
725 constraint = set(datasetQueryConstraint)
726 inputs = {k.name: k for k in self.inputs.keys()}
727 if remainder := constraint.difference(inputs.keys()):
728 raise ValueError(
729 f"{remainder} dataset type(s) specified as a graph constraint, but"
730 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
731 )
732 _LOG.debug(f"Constraining graph query using {constraint}")
733 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
734 queryArgs["collections"] = collections
735 else:
736 raise ValueError(
737 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
738 )
740 if "datasets" in queryArgs:
741 for i, dataset_type in enumerate(queryArgs["datasets"]):
742 if dataset_type.isComponent():
743 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
745 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
746 _LOG.debug("Expanding data IDs.")
747 commonDataIds = commonDataIds.expanded()
748 _LOG.debug("Iterating over query results to associate quanta with datasets.")
749 # Iterate over query results, populating data IDs for datasets and
750 # quanta and then connecting them to each other.
751 n = -1
752 for n, commonDataId in enumerate(commonDataIds):
753 _LOG.debug("Next DataID = %s", commonDataId)
754 # Create DatasetRefs for all DatasetTypes from this result row,
755 # noting that we might have created some already.
756 # We remember both those that already existed and those that we
757 # create now.
758 refsForRow = {}
759 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
760 for datasetType, refs in itertools.chain(
761 self.inputs.items(), self.intermediates.items(), self.outputs.items()
762 ):
763 datasetDataId: Optional[DataCoordinate]
764 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
765 datasetDataId = commonDataId.subset(datasetType.dimensions)
766 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
767 ref = refs.get(datasetDataId)
768 if ref is None:
769 ref = DatasetRef(datasetType, datasetDataId)
770 _LOG.debug("Made new ref = %s", ref)
771 refs[datasetDataId] = ref
772 refsForRow[datasetType.name] = ref
773 # Create _QuantumScaffolding objects for all tasks from this
774 # result row, noting that we might have created some already.
775 for task in self.tasks:
776 quantumDataId = commonDataId.subset(task.dimensions)
777 quantum = task.quanta.get(quantumDataId)
778 if quantum is None:
779 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
780 task.quanta[quantumDataId] = quantum
781 # Whether this is a new quantum or an existing one, we can
782 # now associate the DatasetRefs for this row with it. The
783 # fact that a Quantum data ID and a dataset data ID both
784 # came from the same result row is what tells us they
785 # should be associated.
786 # Many of these associations will be duplicates (because
787 # another query row that differed from this one only in
788 # irrelevant dimensions already added them), and we use
789 # sets to skip them.
790 for datasetType in task.inputs:
791 ref = refsForRow[datasetType.name]
792 quantum.inputs[datasetType.name][ref.dataId] = ref
793 for datasetType in task.outputs:
794 ref = refsForRow[datasetType.name]
795 quantum.outputs[datasetType.name][ref.dataId] = ref
796 if n < 0:
797 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
798 emptiness_explained = False
799 for message in commonDataIds.explain_no_results():
800 _LOG.critical(message)
801 emptiness_explained = True
802 if not emptiness_explained:
803 _LOG.critical(
804 "To reproduce this query for debugging purposes, run "
805 "Registry.queryDataIds with these arguments:"
806 )
807 # We could just repr() the queryArgs dict to get something
808 # the user could make sense of, but it's friendlier to
809 # put these args in an easier-to-construct equivalent form
810 # so they can read it more easily and copy and paste into
811 # a Python terminal.
812 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
813 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
814 if queryArgs["where"]:
815 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
816 if "datasets" in queryArgs:
817 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
818 if "collections" in queryArgs:
819 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
820 _LOG.debug("Finished processing %d rows from data ID query.", n)
821 yield commonDataIds
823 def resolveDatasetRefs(
824 self,
825 registry: Registry,
826 collections: Any,
827 run: Optional[str],
828 commonDataIds: DataCoordinateQueryResults,
829 *,
830 skipExistingIn: Any = None,
831 clobberOutputs: bool = True,
832 constrainedByAllDatasets: bool = True,
833 resolveRefs: bool = False,
834 ) -> None:
835 """Perform follow up queries for each dataset data ID produced in
836 `fillDataIds`.
838 This method populates `_DatasetScaffolding.refs` (except for those in
839 `prerequisites`).
841 Parameters
842 ----------
843 registry : `lsst.daf.butler.Registry`
844 Registry for the data repository; used for all data ID queries.
845 collections
846 Expressions representing the collections to search for input
847 datasets. See :ref:`daf_butler_ordered_collection_searches`.
848 run : `str`, optional
849 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
850 output datasets, if it already exists.
851 commonDataIds : \
852 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
853 Result of a previous call to `connectDataIds`.
854 skipExistingIn
855 Expressions representing the collections to search for existing
856 output datasets that should be skipped. See
857 :ref:`daf_butler_ordered_collection_searches` for allowed types.
858 `None` or empty string/sequence disables skipping.
859 clobberOutputs : `bool`, optional
860 If `True` (default), allow quanta to be created even if outputs exist;
861 this requires the same behavior to be enabled when
862 executing. If ``skipExistingIn`` is not `None`, completed quanta
863 (those with metadata, or all outputs if there is no metadata
864 dataset configured) will be skipped rather than clobbered.
865 constrainedByAllDatasets : `bool`, optional
866 Indicates if the commonDataIds were generated with a constraint on
867 all dataset types.
868 resolveRefs : `bool`, optional
869 If `True` then resolve all input references and generate random
870 dataset IDs for all output and intermediate datasets. True value
871 requires ``run`` collection to be specified.
873 Raises
874 ------
875 OutputExistsError
876 Raised if an output dataset already exists in the output run
877 and ``skipExistingIn`` does not include output run, or if only
878 some outputs are present and ``clobberOutputs`` is `False`.
879 """
880 skip_collections_wildcard: CollectionWildcard | None = None
881 skipExistingInRun = False
882 if skipExistingIn:
883 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
884 if run:
885 # As an optimization, check the explicit list of names first.
886 skipExistingInRun = run in skip_collections_wildcard.strings
887 if not skipExistingInRun:
888 # need to flatten it and check again
889 skipExistingInRun = run in registry.queryCollections(
890 skipExistingIn,
891 collectionTypes=CollectionType.RUN,
892 )
894 idMaker: Optional[_DatasetIdMaker] = None
895 if resolveRefs:
896 assert run is not None, "run cannot be None when resolveRefs is True"
897 idMaker = _DatasetIdMaker(registry, run)
899 resolvedRefQueryResults: Iterable[DatasetRef]
901 # Look up [init] intermediate and output datasets in the output
902 # collection, if there is an output collection.
903 if run is not None or skip_collections_wildcard is not None:
904 for datasetType, refs in itertools.chain(
905 self.initIntermediates.items(),
906 self.initOutputs.items(),
907 self.intermediates.items(),
908 self.outputs.items(),
909 ):
910 _LOG.debug(
911 "Resolving %d datasets for intermediate and/or output dataset %s.",
912 len(refs),
913 datasetType.name,
914 )
915 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
916 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
917 # TODO: this assert incorrectly bans component inputs;
918 # investigate on DM-33027.
919 # assert not datasetType.isComponent(), \
920 # "Output datasets cannot be components."
921 #
922 # Instead we have to handle them manually to avoid a
923 # deprecation warning, but it is at least confusing and
924 # possibly a bug for components to appear here at all.
925 if datasetType.isComponent():
926 parent_dataset_type = datasetType.makeCompositeDatasetType()
927 component = datasetType.component()
928 else:
929 parent_dataset_type = datasetType
930 component = None
932 # look at RUN collection first
933 if run is not None:
934 try:
935 resolvedRefQueryResults = subset.findDatasets(
936 parent_dataset_type, collections=run, findFirst=True
937 )
938 except MissingDatasetTypeError:
939 resolvedRefQueryResults = []
940 for resolvedRef in resolvedRefQueryResults:
941 # TODO: we could easily support per-DatasetType
942 # skipExisting and I could imagine that being useful -
943 # it's probably required in order to support writing
944 # initOutputs before QuantumGraph generation.
945 assert resolvedRef.dataId in refs
946 if not (skipExistingInRun or isInit or clobberOutputs):
947 raise OutputExistsError(
948 f"Output dataset {datasetType.name} already exists in "
949 f"output RUN collection '{run}' with data ID"
950 f" {resolvedRef.dataId}."
951 )
952 # If we are going to resolve all outputs then we have
953 # to remember existing ones to avoid generating new
954 # dataset IDs for them.
955 if resolveRefs:
956 refs[resolvedRef.dataId] = (
957 resolvedRef.makeComponentRef(component)
958 if component is not None
959 else resolvedRef
960 )
962 # And check skipExistingIn too; if the RUN collection is
963 # in it, that case was handled above.
964 if skip_collections_wildcard is not None:
965 try:
966 resolvedRefQueryResults = subset.findDatasets(
967 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True
968 )
969 except MissingDatasetTypeError:
970 resolvedRefQueryResults = []
971 for resolvedRef in resolvedRefQueryResults:
972 assert resolvedRef.dataId in refs
973 refs[resolvedRef.dataId] = (
974 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
975 )
977 # Look up input and initInput datasets in the input collection(s).
978 # Container to accumulate unfound refs, in case the common data IDs
979 # were not constrained on dataset type existence.
980 self.unfoundRefs = set()
981 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
982 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
983 if datasetType.isComponent():
984 parent_dataset_type = datasetType.makeCompositeDatasetType()
985 component = datasetType.component()
986 else:
987 parent_dataset_type = datasetType
988 component = None
989 try:
990 resolvedRefQueryResults = commonDataIds.subset(
991 datasetType.dimensions, unique=True
992 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
993 except MissingDatasetTypeError:
994 resolvedRefQueryResults = []
995 dataIdsNotFoundYet = set(refs.keys())
996 for resolvedRef in resolvedRefQueryResults:
997 dataIdsNotFoundYet.discard(resolvedRef.dataId)
998 refs[resolvedRef.dataId] = (
999 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1000 )
1001 if dataIdsNotFoundYet:
1002 if constrainedByAllDatasets:
1003 raise RuntimeError(
1004 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1005 f"'{datasetType.name}' was/were present in a previous "
1006 f"query, but could not be found now."
1007 f"This is either a logic bug in QuantumGraph generation "
1008 f"or the input collections have been modified since "
1009 f"QuantumGraph generation began."
1010 )
1011 else:
1012 # If the common data IDs were not constrained using all the
1013 # input dataset types, it is possible that some data IDs
1014 # found don't correspond to existing dataset types and they
1015 # will be left unresolved. Mark these for later pruning from
1016 # the quantum graph.
1017 for k in dataIdsNotFoundYet:
1018 self.unfoundRefs.add(refs[k])
1020 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1021 # replacing the unresolved refs there, and then look up prerequisites.
1022 for task in self.tasks:
1023 _LOG.debug(
1024 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1025 len(task.quanta),
1026 task.taskDef.label,
1027 )
1028 # The way iterConnections is designed makes it impossible to
1029 # annotate precisely enough to satisfy MyPy here.
1030 lookupFunctions = {
1031 c.name: c.lookupFunction # type: ignore
1032 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1033 if c.lookupFunction is not None # type: ignore
1034 }
1035 dataIdsFailed = []
1036 dataIdsSucceeded = []
1037 for quantum in task.quanta.values():
1038 # Process output datasets only if skipExistingIn is not None
1039 # or there is a run to look for outputs in and clobberOutputs
1040 # is True. Note that if skipExistingIn is None, any output
1041 # datasets that already exist would have already caused an
1042 # exception to be raised. We never update the DatasetRefs in
1043 # the quantum because those should never be resolved.
1044 if skip_collections_wildcard is not None or (run is not None and clobberOutputs):
1045 resolvedRefs = []
1046 unresolvedRefs = []
1047 haveMetadata = False
1048 for datasetType, originalRefs in quantum.outputs.items():
1049 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1050 if ref.id is not None:
1051 resolvedRefs.append(ref)
1052 if datasetType.name == task.taskDef.metadataDatasetName:
1053 haveMetadata = True
1054 else:
1055 unresolvedRefs.append(ref)
1056 if resolvedRefs:
1057 if haveMetadata or not unresolvedRefs:
1058 dataIdsSucceeded.append(quantum.dataId)
1059 if skip_collections_wildcard is not None:
1060 continue
1061 else:
1062 dataIdsFailed.append(quantum.dataId)
1063 if not clobberOutputs:
1064 raise OutputExistsError(
1065 f"Quantum {quantum.dataId} of task with label "
1066 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1067 f"({resolvedRefs}) "
1068 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1069 "and clobbering outputs was not enabled."
1070 )
1071 # Update the input DatasetRefs to the resolved ones we already
1072 # searched for.
1073 for datasetType, input_refs in quantum.inputs.items():
1074 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1075 input_refs[ref.dataId] = ref
1076 # Look up prerequisite datasets in the input collection(s).
1077 # These may have dimensions that extend beyond those we queried
1078 # for originally, because we want to permit those data ID
1079 # values to differ across quanta and dataset types.
1080 for datasetType in task.prerequisites:
1081 if datasetType.isComponent():
1082 parent_dataset_type = datasetType.makeCompositeDatasetType()
1083 component = datasetType.component()
1084 else:
1085 parent_dataset_type = datasetType
1086 component = None
1087 lookupFunction = lookupFunctions.get(datasetType.name)
1088 if lookupFunction is not None:
1089 # PipelineTask has provided its own function to do the
1090 # lookup. This always takes precedence.
1091 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1092 elif (
1093 datasetType.isCalibration()
1094 and datasetType.dimensions <= quantum.dataId.graph
1095 and quantum.dataId.graph.temporal
1096 ):
1097 # This is a master calibration lookup, which we have to
1098 # handle specially because the query system can't do a
1099 # temporal join on a non-dimension-based timespan yet.
1100 timespan = quantum.dataId.timespan
1101 try:
1102 prereq_ref = registry.findDataset(
1103 parent_dataset_type,
1104 quantum.dataId,
1105 collections=collections,
1106 timespan=timespan,
1107 )
1108 if prereq_ref is not None:
1109 if component is not None:
1110 prereq_ref = prereq_ref.makeComponentRef(component)
1111 prereq_refs = [prereq_ref]
1112 else:
1113 prereq_refs = []
1114 except (KeyError, MissingDatasetTypeError):
1115 # This dataset type is not present in the registry,
1116 # which just means there are no datasets here.
1117 prereq_refs = []
1118 else:
1119 # Most general case.
1120 prereq_refs = [
1121 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1122 for prereq_ref in registry.queryDatasets(
1123 parent_dataset_type,
1124 collections=collections,
1125 dataId=quantum.dataId,
1126 findFirst=True,
1127 ).expanded()
1128 ]
1129 quantum.prerequisites[datasetType].update(
1130 {ref.dataId: ref for ref in prereq_refs if ref is not None}
1131 )
1133 # Resolve all quantum inputs and outputs.
1134 if idMaker:
1135 for datasetDict in (quantum.inputs, quantum.outputs):
1136 for refDict in datasetDict.values():
1137 refDict.update(idMaker.resolveDict(refDict))
1139 # Resolve task initInputs and initOutputs.
1140 if idMaker:
1141 for datasetDict in (task.initInputs, task.initOutputs):
1142 for refDict in datasetDict.values():
1143 refDict.update(idMaker.resolveDict(refDict))
1145 # Actually remove any quanta that we decided to skip above.
1146 if dataIdsSucceeded:
1147 if skip_collections_wildcard is not None:
1148 _LOG.debug(
1149 "Pruning successful %d quanta for task with label '%s' because all of their "
1150 "outputs exist or metadata was written successfully.",
1151 len(dataIdsSucceeded),
1152 task.taskDef.label,
1153 )
1154 for dataId in dataIdsSucceeded:
1155 del task.quanta[dataId]
1156 elif clobberOutputs:
1157 _LOG.info(
1158 "Found %d successful quanta for task with label '%s' "
1159 "that will need to be clobbered during execution.",
1160 len(dataIdsSucceeded),
1161 task.taskDef.label,
1162 )
1163 else:
1164 raise AssertionError("OutputExistsError should have already been raised.")
1165 if dataIdsFailed:
1166 if clobberOutputs:
1167 _LOG.info(
1168 "Found %d failed/incomplete quanta for task with label '%s' "
1169 "that will need to be clobbered during execution.",
1170 len(dataIdsFailed),
1171 task.taskDef.label,
1172 )
1173 else:
1174 raise AssertionError("OutputExistsError should have already been raised.")
1176 def makeQuantumGraph(
1177 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None
1178 ) -> QuantumGraph:
1179 """Create a `QuantumGraph` from the quanta already present in
1180 the scaffolding data structure.
1182 Parameters
1183 ----------
1184 metadata : Optional Mapping of `str` to primitives
1185 This is an optional parameter of extra data to carry with the
1186 graph. Entries in this mapping should be able to be serialized in
1187 JSON.
1188 datastore : `Datastore`, optional
1189 If not `None` then fill datastore records in each generated
1190 Quantum.
1192 Returns
1193 -------
1194 graph : `QuantumGraph`
1195 The full `QuantumGraph`.
1196 """
1198 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1199 """Extract all DatasetRefs from the dictionaries"""
1200 for ref_dict in dataset_dict.values():
1201 yield from ref_dict.values()
1203 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1204 if datastore is not None:
1205 datastore_records = datastore.export_records(
1206 itertools.chain(
1207 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1208 )
1209 )
1211 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1212 for task in self.tasks:
1213 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1214 graphInput[task.taskDef] = qset
1216 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1217 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1219 graph = QuantumGraph(
1220 graphInput,
1221 metadata=metadata,
1222 pruneRefs=self.unfoundRefs,
1223 universe=self.dimensions.universe,
1224 initInputs=taskInitInputs,
1225 initOutputs=taskInitOutputs,
1226 )
1227 return graph
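# Condensed sketch of the four-step flow described in the _PipelineScaffolding
# docstring, mirroring what GraphBuilder.makeGraph does below. The helper name
# is illustrative only; the registry, collections, run, and query values are
# assumed to be supplied by the caller.
def _example_scaffolding_flow(
    pipeline: Pipeline,
    registry: Registry,
    collections: Any,
    run: str,
    userQuery: Optional[str],
) -> QuantumGraph:
    # Step 1: categorize dataset types and build the nested scaffolding.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    # Step 2: run the "Big Join Query" connecting quanta and datasets.
    emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
    with scaffolding.connectDataIds(registry, collections, userQuery, emptyDataId) as commonDataIds:
        # Step 3: follow-up queries to resolve input/output DatasetRefs.
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        # Step 4: assemble the QuantumGraph from the per-task quanta.
        return scaffolding.makeQuantumGraph()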
1230# ------------------------
1231# Exported definitions --
1232# ------------------------
1235class GraphBuilderError(Exception):
1236 """Base class for exceptions generated by graph builder."""
1238 pass
1241class OutputExistsError(GraphBuilderError):
1242 """Exception generated when output datasets already exist."""
1244 pass
1247class PrerequisiteMissingError(GraphBuilderError):
1248 """Exception generated when a prerequisite dataset does not exist."""
1250 pass
1253class GraphBuilder:
1254 """GraphBuilder class is responsible for building task execution graph from
1255 a Pipeline.
1257 Parameters
1258 ----------
1259 registry : `~lsst.daf.butler.Registry`
1260 Data butler instance.
1261 skipExistingIn
1262 Expressions representing the collections to search for existing
1263 output datasets that should be skipped. See
1264 :ref:`daf_butler_ordered_collection_searches`.
1265 clobberOutputs : `bool`, optional
1266 If `True` (default), allow quanta to be created even if partial outputs
1267 exist; this requires the same behavior to be enabled when
1268 executing.
1269 datastore : `Datastore`, optional
1270 If not `None` then fill datastore records in each generated Quantum.
1271 """
1273 def __init__(
1274 self,
1275 registry: Registry,
1276 skipExistingIn: Any = None,
1277 clobberOutputs: bool = True,
1278 datastore: Optional[Datastore] = None,
1279 ):
1280 self.registry = registry
1281 self.dimensions = registry.dimensions
1282 self.skipExistingIn = skipExistingIn
1283 self.clobberOutputs = clobberOutputs
1284 self.datastore = datastore
1286 def makeGraph(
1287 self,
1288 pipeline: Union[Pipeline, Iterable[TaskDef]],
1289 collections: Any,
1290 run: Optional[str],
1291 userQuery: Optional[str],
1292 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1293 metadata: Optional[Mapping[str, Any]] = None,
1294 resolveRefs: bool = False,
1295 bind: Optional[Mapping[str, Any]] = None,
1296 ) -> QuantumGraph:
1297 """Create execution graph for a pipeline.
1299 Parameters
1300 ----------
1301 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1302 Pipeline definition, task names/classes and their configs.
1303 collections
1304 Expressions representing the collections to search for input
1305 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1306 run : `str`, optional
1307 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1308 output datasets, if it already exists.
1309 userQuery : `str`
1310 String which defines user-defined selection for registry, should be
1311 empty or `None` if there are no restrictions on data selection.
1312 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1313 The query constraint variant that should be used to constrain the
1314 query based on dataset existence, defaults to
1315 `DatasetQueryConstraintVariant.ALL`.
1316 metadata : Optional Mapping of `str` to primitives
1317 This is an optional parameter of extra data to carry with the
1318 graph. Entries in this mapping should be able to be serialized in
1319 JSON.
1320 resolveRefs : `bool`, optional
1321 If `True` then resolve all input references and generate random
1322 dataset IDs for all output and intermediate datasets. True value
1323 requires ``run`` collection to be specified.
1324 bind : `Mapping`, optional
1325 Mapping containing literal values that should be injected into the
1326 ``userQuery`` expression, keyed by the identifiers they replace.
1328 Returns
1329 -------
1330 graph : `QuantumGraph`
 The full `QuantumGraph`.
1332 Raises
1333 ------
1334 UserExpressionError
1335 Raised when user expression cannot be parsed.
1336 OutputExistsError
1337 Raised when output datasets already exist.
1338 Exception
1339 Other exceptions types may be raised by underlying registry
1340 classes.
1341 """
1342 if resolveRefs and run is None:
1343 raise ValueError("`resolveRefs` requires `run` parameter.")
1344 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1345 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1346 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1347 instrument_class: Optional[Any] = None
1348 if isinstance(pipeline, Pipeline):
1349 instrument_class_name = pipeline.getInstrument()
1350 if instrument_class_name is not None:
1351 instrument_class = doImportType(instrument_class_name)
1352 pipeline = list(pipeline.toExpandedPipeline())
1353 if instrument_class is not None:
1354 dataId = DataCoordinate.standardize(
1355 instrument=instrument_class.getName(), universe=self.registry.dimensions
1356 )
1357 else:
1358 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1359 with scaffolding.connectDataIds(
1360 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1361 ) as commonDataIds:
1362 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1363 scaffolding.resolveDatasetRefs(
1364 self.registry,
1365 collections,
1366 run,
1367 commonDataIds,
1368 skipExistingIn=self.skipExistingIn,
1369 clobberOutputs=self.clobberOutputs,
1370 constrainedByAllDatasets=condition,
1371 resolveRefs=resolveRefs,
1372 )
1373 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)
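# End-to-end usage sketch. The helper name, repository path, pipeline file, and
# collection names are assumptions for illustration; ``Butler`` and
# ``Pipeline.fromFile`` are assumed entry points defined elsewhere in the stack.
def _example_build_graph(
    repo: str, pipeline_file: str, collections: Any, run: str
) -> QuantumGraph:
    from lsst.daf.butler import Butler

    butler = Butler(repo)
    pipeline = Pipeline.fromFile(pipeline_file)
    builder = GraphBuilder(butler.registry, clobberOutputs=True)
    return builder.makeGraph(
        pipeline,
        collections=collections,
        run=run,
        userQuery=None,
    )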