Coverage for python/lsst/pipe/base/graphBuilder.py: 15%
490 statements
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
38from lsst.daf.butler import (
39 CollectionType,
40 DataCoordinate,
41 DatasetIdGenEnum,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 Quantum,
50 Registry,
51)
52from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
53from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
54from lsst.daf.butler.registry.wildcards import CollectionWildcard
55from lsst.utils import doImportType
57from ._datasetQueryConstraints import DatasetQueryConstraintVariant
58from ._status import NoWorkFound
60# -----------------------------
61# Imports for other modules --
62# -----------------------------
63from .connections import AdjustQuantumHelper, iterConnections
64from .graph import QuantumGraph
65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
67# ----------------------------------
68# Local non-exported definitions --
69# ----------------------------------
71_LOG = logging.getLogger(__name__)
74class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
75 """A custom dictionary that maps `DatasetType` to a nested dictionary of
76 the known `DatasetRef` instances of that type.
78 Parameters
79 ----------
80 args
81 Positional arguments are forwarded to the `dict` constructor.
82 universe : `DimensionUniverse`
83 Universe of all possible dimensions.
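
Examples
--------
A minimal usage sketch (``dataset_types`` is assumed to be a list of
`DatasetType`, ``data_id`` a matching `DataCoordinate`, and ``universe`` a
`DimensionUniverse`; the names are illustrative only)::

    refs = _DatasetDict.fromDatasetTypes(dataset_types, universe=universe)
    refs[dataset_types[0]][data_id] = DatasetRef(dataset_types[0], data_id)
    by_type = refs.unpackMultiRefs()  # DatasetType -> list of DatasetRef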
84 """
86 def __init__(self, *args: Any, universe: DimensionUniverse):
87 super().__init__(*args)
88 self.universe = universe
90 @classmethod
91 def fromDatasetTypes(
92 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
93 ) -> _DatasetDict:
94 """Construct a dictionary from a flat iterable of `DatasetType` keys.
96 Parameters
97 ----------
98 datasetTypes : `iterable` of `DatasetType`
99 DatasetTypes to use as keys for the dict. Values will be empty
100 dictionaries.
101 universe : `DimensionUniverse`
102 Universe of all possible dimensions.
104 Returns
105 -------
106 dictionary : `_DatasetDict`
107 A new `_DatasetDict` instance.
108 """
109 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
111 @classmethod
112 def fromSubset(
113 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
114 ) -> _DatasetDict:
115 """Return a new dictionary by extracting items corresponding to the
116 given keys from one or more existing dictionaries.
118 Parameters
119 ----------
120 datasetTypes : `iterable` of `DatasetType`
121 DatasetTypes to use as keys for the dict. Values will be obtained
122 by lookups against ``first`` and ``rest``.
123 first : `_DatasetDict`
124 Another dictionary from which to extract values.
125 rest
126 Additional dictionaries from which to extract values.
128 Returns
129 -------
130 dictionary : `_DatasetDict`
131 A new dictionary instance.
132 """
133 combined = ChainMap(first, *rest)
135 # Dataset types known to match immediately can be processed
136 # without checks.
137 matches = combined.keys() & set(datasetTypes)
138 _dict = {k: combined[k] for k in matches}
140 if len(_dict) < len(datasetTypes):
141 # Work out which ones are missing.
142 missing_datasetTypes = set(datasetTypes) - _dict.keys()
144 # Get the known names for comparison.
145 combined_by_name = {k.name: k for k in combined}
147 missing = set()
148 incompatible = {}
149 for datasetType in missing_datasetTypes:
150 # The dataset type is not found. It may not be listed,
151 # or it may be present with the same name
152 # but a different definition.
153 if datasetType.name in combined_by_name:
154 # This implies some inconsistency in the connection
155 # definitions. If storage class conversion is
156 # supported we can let it slide.
157 # At this point we do not know
158 # where the inconsistency is, but trust that
159 # downstream code will be more explicit about input
160 # vs. output incompatibilities.
161 existing = combined_by_name[datasetType.name]
162 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
163 _LOG.warning(
164 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
165 datasetType,
166 existing,
167 )
168 _dict[datasetType] = combined[existing]
169 else:
170 incompatible[datasetType] = existing
171 else:
172 missing.add(datasetType)
174 if missing or incompatible:
175 reasons = []
176 if missing:
177 reasons.append(
178 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
179 + ", ".join(d.name for d in combined)
180 )
181 if incompatible:
182 for x, y in incompatible.items():
183 reasons.append(f"{x} incompatible with {y}")
184 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
186 return cls(_dict, universe=first.universe)
188 @property
189 def dimensions(self) -> DimensionGraph:
190 """The union of all dimensions used by all dataset types in this
191 dictionary, including implied dependencies (`DimensionGraph`).
192 """
193 base = self.universe.empty
194 if len(self) == 0:
195 return base
196 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
198 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
199 """Unpack nested single-element `DatasetRef` dicts into a new
200 mapping with `DatasetType` keys and `DatasetRef` values.
202 This method assumes that each nested dictionary contains exactly one item, as is the
203 case for all "init" datasets.
205 Returns
206 -------
207 dictionary : `NamedKeyDict`
208 Dictionary mapping `DatasetType` to `DatasetRef`, with both
209 `DatasetType` instances and string names usable as keys.
210 """
212 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
213 (ref,) = refs.values()
214 return ref
216 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
218 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
219 """Unpack nested multi-element `DatasetRef` dicts into a new
220 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
222 Returns
223 -------
224 dictionary : `NamedKeyDict`
225 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
226 both `DatasetType` instances and string names usable as keys.
227 """
228 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
230 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
231 """Iterate over the contained `DatasetRef` instances that match the
232 given `DatasetType` and data IDs.
234 Parameters
235 ----------
236 datasetType : `DatasetType`
237 Dataset type to match.
238 dataIds : `Iterable` [ `DataCoordinate` ]
239 Data IDs to match.
241 Returns
242 -------
243 refs : `Iterator` [ `DatasetRef` ]
244 DatasetRef instances for which ``ref.datasetType == datasetType``
245 and ``ref.dataId`` is in ``dataIds``.
246 """
247 refs = self[datasetType]
248 return (refs[dataId] for dataId in dataIds)
251class _QuantumScaffolding:
252 """Helper class aggregating information about a `Quantum`, used when
253 constructing a `QuantumGraph`.
255 See `_PipelineScaffolding` for a top-down description of the full
256 scaffolding data structure.
258 Parameters
259 ----------
260 task : _TaskScaffolding
261 Back-reference to the helper object for the `PipelineTask` this quantum
262 represents an execution of.
263 dataId : `DataCoordinate`
264 Data ID for this quantum.
265 """
267 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
268 self.task = task
269 self.dataId = dataId
270 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
271 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
272 self.prerequisites = _DatasetDict.fromDatasetTypes(
273 task.prerequisites.keys(), universe=dataId.universe
274 )
276 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
278 def __repr__(self) -> str:
279 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
281 task: _TaskScaffolding
282 """Back-reference to the helper object for the `PipelineTask` this quantum
283 represents an execution of.
284 """
286 dataId: DataCoordinate
287 """Data ID for this quantum.
288 """
290 inputs: _DatasetDict
291 """Nested dictionary containing `DatasetRef` inputs to this quantum.
293 This is initialized to map each `DatasetType` to an empty dictionary at
294 construction. Those nested dictionaries are populated (with data IDs as
295 keys) with unresolved `DatasetRef` instances in
296 `_PipelineScaffolding.connectDataIds`.
297 """
299 outputs: _DatasetDict
300 """Nested dictionary containing `DatasetRef` outputs this quantum.
301 """
303 prerequisites: _DatasetDict
304 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
305 quantum.
306 """
308 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
309 """Transform the scaffolding object into a true `Quantum` instance.
311 Parameters
312 ----------
313 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
314 If not `None` then fill datastore records in each generated Quantum
315 using the records from this structure.
317 Returns
318 -------
319 quantum : `Quantum`
320 An actual `Quantum` instance.
321 """
322 allInputs = self.inputs.unpackMultiRefs()
323 allInputs.update(self.prerequisites.unpackMultiRefs())
324 # Give the task's Connections class an opportunity to remove some
325 # inputs, or complain if they are unacceptable.
326 # This will raise if one of the check conditions is not met, which is
327 # the intended behavior.
328 # If it raises NoWorkFound, there is a bug in the QG algorithm
329 # or the adjustQuantum is incorrectly trying to make a prerequisite
330 # input behave like a regular input; adjustQuantum should only raise
331 # NoWorkFound if a regular input is missing, and it shouldn't be
332 # possible for us to have generated ``self`` if that's true.
333 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
334 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
335 initInputs = self.task.initInputs.unpackSingleRefs()
336 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
337 if datastore_records is not None:
338 quantum_records = {}
339 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
340 input_refs += list(initInputs.values())
341 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
342 for datastore_name, records in datastore_records.items():
343 matching_records = records.subset(input_ids)
344 if matching_records is not None:
345 quantum_records[datastore_name] = matching_records
346 return Quantum(
347 taskName=self.task.taskDef.taskName,
348 taskClass=self.task.taskDef.taskClass,
349 dataId=self.dataId,
350 initInputs=initInputs,
351 inputs=helper.inputs,
352 outputs=helper.outputs,
353 datastore_records=quantum_records,
354 )
357@dataclass
358class _TaskScaffolding:
359 """Helper class aggregating information about a `PipelineTask`, used when
360 constructing a `QuantumGraph`.
362 See `_PipelineScaffolding` for a top-down description of the full
363 scaffolding data structure.
365 Parameters
366 ----------
367 taskDef : `TaskDef`
368 Data structure that identifies the task class and its config.
369 parent : `_PipelineScaffolding`
370 The parent data structure that will hold the instance being
371 constructed.
372 datasetTypes : `TaskDatasetTypes`
373 Data structure that categorizes the dataset types used by this task.
374 """
376 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
377 universe = parent.dimensions.universe
378 self.taskDef = taskDef
379 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
380 assert self.dimensions.issubset(parent.dimensions)
381 # Initialize _DatasetDicts as subsets of the one or two
382 # corresponding dicts in the parent _PipelineScaffolding.
383 self.initInputs = _DatasetDict.fromSubset(
384 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
385 )
386 self.initOutputs = _DatasetDict.fromSubset(
387 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
388 )
389 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
390 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
391 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
392 self.dataIds: Set[DataCoordinate] = set()
393 self.quanta = {}
395 def __repr__(self) -> str:
396 # Default dataclass-injected __repr__ gets caught in an infinite loop
397 # because of back-references.
398 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
400 taskDef: TaskDef
401 """Data structure that identifies the task class and its config
402 (`TaskDef`).
403 """
405 dimensions: DimensionGraph
406 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
407 """
409 initInputs: _DatasetDict
410 """Dictionary containing information about datasets used to construct this
411 task (`_DatasetDict`).
412 """
414 initOutputs: _DatasetDict
415 """Dictionary containing information about datasets produced as a
416 side-effect of constructing this task (`_DatasetDict`).
417 """
419 inputs: _DatasetDict
420 """Dictionary containing information about datasets used as regular,
421 graph-constraining inputs to this task (`_DatasetDict`).
422 """
424 outputs: _DatasetDict
425 """Dictionary containing information about datasets produced by this task
426 (`_DatasetDict`).
427 """
429 prerequisites: _DatasetDict
430 """Dictionary containing information about input datasets that must be
431 present in the repository before any Pipeline containing this task is run
432 (`_DatasetDict`).
433 """
435 quanta: Dict[DataCoordinate, _QuantumScaffolding]
436 """Dictionary mapping data ID to a scaffolding object for the Quantum of
437 this task with that data ID.
438 """
440 def makeQuantumSet(
441 self,
442 unresolvedRefs: Optional[Set[DatasetRef]] = None,
443 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
444 ) -> Set[Quantum]:
445 """Create a `set` of `Quantum` from the information in ``self``.
447 Parameters
448 ----------
449 unresolvedRefs : `set` [ `DatasetRef` ], optional
450 Input dataset refs that have not been found.
451 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
452 If not `None` then fill datastore records in each generated
453 Quantum using the records from this structure.
454 Returns
455 -------
456 nodes : `set` of `Quantum`
457 The `Quantum` elements corresponding to this task.
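
Examples
--------
A hedged sketch of the call made from `_PipelineScaffolding.makeQuantumGraph`
(``task_scaffolding``, ``scaffolding``, and ``datastore_records`` are
illustrative names; ``datastore_records`` may be `None`)::

    quanta = task_scaffolding.makeQuantumSet(
        unresolvedRefs=scaffolding.unfoundRefs, datastore_records=datastore_records
    )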
458 """
459 if unresolvedRefs is None:
460 unresolvedRefs = set()
461 outputs = set()
462 for q in self.quanta.values():
463 try:
464 tmpQuanta = q.makeQuantum(datastore_records)
465 outputs.add(tmpQuanta)
466 except (NoWorkFound, FileNotFoundError) as exc:
467 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
468 if unresolvedRefs.intersection(refs):
469 # This means it is a node that is known to be pruned
470 # later and should be left in even though some follow-up
471 # queries fail. This allows the pruning to start from this
472 # quantum with known issues, and prune other nodes it
473 # touches.
474 inputs = q.inputs.unpackMultiRefs()
475 inputs.update(q.prerequisites.unpackMultiRefs())
476 tmpQuantum = Quantum(
477 taskName=q.task.taskDef.taskName,
478 taskClass=q.task.taskDef.taskClass,
479 dataId=q.dataId,
480 initInputs=q.task.initInputs.unpackSingleRefs(),
481 inputs=inputs,
482 outputs=q.outputs.unpackMultiRefs(),
483 )
484 outputs.add(tmpQuantum)
485 else:
486 raise exc
487 return outputs
490class _DatasetIdMaker:
491 """Helper class which generates random dataset UUIDs for unresolved
492 datasets.
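
Examples
--------
A minimal sketch (``registry`` and the output ``run`` name are assumed to be
supplied by the caller; ``refs`` is one of the nested data-ID-to-ref
dictionaries stored in a `_DatasetDict`)::

    id_maker = _DatasetIdMaker(registry, run)
    refs.update(id_maker.resolveDict(refs))  # every ref now carries a dataset ID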
493 """
495 def __init__(self, registry: Registry, run: str):
496 self.datasetIdFactory = registry.datasetIdFactory
497 self.run = run
498 # Dataset IDs generated so far
499 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
501 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
502 if ref.id is not None:
503 return ref
505 # For components we need their parent dataset ID.
506 if ref.isComponent():
507 parent_ref = ref.makeCompositeRef()
508 # Basic sanity check: the parent should be resolved if this is an
509 # existing input, or it should be in the cache already if it is
510 # an intermediate.
511 if parent_ref.id is None:
512 key = parent_ref.datasetType, parent_ref.dataId
513 if key not in self.resolved:
514 raise ValueError(f"Composite dataset is missing from cache: {parent_ref}")
515 parent_ref = self.resolved[key]
516 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved"
517 return ref.resolved(parent_ref.id, parent_ref.run)
519 key = ref.datasetType, ref.dataId
520 if (resolved := self.resolved.get(key)) is None:
521 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE)
522 self.resolved[key] = resolved
523 return resolved
525 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
526 """Resolve all unresolved references in the provided dictionary."""
527 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
530@dataclass
531class _PipelineScaffolding:
532 """A helper data structure that organizes the information involved in
533 constructing a `QuantumGraph` for a `Pipeline`.
535 Parameters
536 ----------
537 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
538 Sequence of tasks from which a graph is to be constructed. Must
539 have nested task classes already imported.
540 universe : `DimensionUniverse`
541 Universe of all possible dimensions.
543 Notes
544 -----
545 The scaffolding data structure contains nested data structures for both
546 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
547 data structures are shared between the pipeline-level structure (which
548 aggregates all datasets and categorizes them from the perspective of the
549 complete pipeline) and the individual tasks that use them as inputs and
550 outputs.
552 `QuantumGraph` construction proceeds in four steps, with each corresponding
553 to a different `_PipelineScaffolding` method:
555 1. When `_PipelineScaffolding` is constructed, we extract and categorize
556 the DatasetTypes used by the pipeline (delegating to
557 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
558 nested `_TaskScaffolding` and `_DatasetDict` objects.
560 2. In `connectDataIds`, we construct and run the "Big Join Query", which
561 returns related tuples of all dimensions used to identify any regular
562 input, output, and intermediate datasets (not prerequisites). We then
563 iterate over these tuples of related dimensions, identifying the subsets
564 that correspond to distinct data IDs for each task and dataset type,
565 and then create `_QuantumScaffolding` objects.
567 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
568 dataset data IDs previously identified, transforming unresolved
569 DatasetRefs into resolved DatasetRefs where appropriate. We then look
570 up prerequisite datasets for all quanta.
572 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
573 per-task `_QuantumScaffolding` objects.
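
Examples
--------
A condensed sketch of the four steps above, mirroring what
`GraphBuilder.makeGraph` does (``pipeline``, ``registry``, ``collections``,
``run``, and ``userQuery`` are assumed to be supplied by the caller)::

    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
    with scaffolding.connectDataIds(
        registry, collections, userQuery, empty_data_id
    ) as commonDataIds:
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        qgraph = scaffolding.makeQuantumGraph()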
574 """
576 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
577 _LOG.debug("Initializing data structures for QuantumGraph generation.")
578 self.tasks = []
579 # Aggregate and categorize the DatasetTypes in the Pipeline.
580 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
581 # Construct dictionaries that map those DatasetTypes to structures
582 # that will (later) hold additional information about them.
583 for attr in (
584 "initInputs",
585 "initIntermediates",
586 "initOutputs",
587 "inputs",
588 "intermediates",
589 "outputs",
590 "prerequisites",
591 ):
592 setattr(
593 self,
594 attr,
595 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
596 )
597 # Aggregate all dimensions for all non-init, non-prerequisite
598 # DatasetTypes. These are the ones we'll include in the big join
599 # query.
600 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
601 # Construct scaffolding nodes for each Task, and add backreferences
602 # to the Task from each DatasetScaffolding node.
603 # Note that there's only one scaffolding node for each DatasetType,
604 # shared by _PipelineScaffolding and all _TaskScaffoldings that
605 # reference it.
606 if isinstance(pipeline, Pipeline):
607 pipeline = pipeline.toExpandedPipeline()
608 self.tasks = [
609 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
610 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
611 ]
613 def __repr__(self) -> str:
614 # Default dataclass-injected __repr__ gets caught in an infinite loop
615 # because of back-references.
616 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
618 tasks: List[_TaskScaffolding]
619 """Scaffolding data structures for each task in the pipeline
620 (`list` of `_TaskScaffolding`).
621 """
623 initInputs: _DatasetDict
624 """Datasets consumed but not produced when constructing the tasks in this
625 pipeline (`_DatasetDict`).
626 """
628 initIntermediates: _DatasetDict
629 """Datasets that are both consumed and produced when constructing the tasks
630 in this pipeline (`_DatasetDict`).
631 """
633 initOutputs: _DatasetDict
634 """Datasets produced but not consumed when constructing the tasks in this
635 pipeline (`_DatasetDict`).
636 """
638 inputs: _DatasetDict
639 """Datasets that are consumed but not produced when running this pipeline
640 (`_DatasetDict`).
641 """
643 intermediates: _DatasetDict
644 """Datasets that are both produced and consumed when running this pipeline
645 (`_DatasetDict`).
646 """
648 outputs: _DatasetDict
649 """Datasets produced but not consumed when when running this pipeline
650 (`_DatasetDict`).
651 """
653 prerequisites: _DatasetDict
654 """Datasets that are consumed when running this pipeline and looked up
655 per-Quantum when generating the graph (`_DatasetDict`).
656 """
658 dimensions: DimensionGraph
659 """All dimensions used by any regular input, intermediate, or output
660 (not prerequisite) dataset; the set of dimensions used in the "Big Join
661 Query" (`DimensionGraph`).
663 This is required to be a superset of all task quantum dimensions.
664 """
666 globalInitOutputs: _DatasetDict | None = None
667 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
668 """
670 @contextmanager
671 def connectDataIds(
672 self,
673 registry: Registry,
674 collections: Any,
675 userQuery: Optional[str],
676 externalDataId: DataCoordinate,
677 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
678 bind: Optional[Mapping[str, Any]] = None,
679 ) -> Iterator[DataCoordinateQueryResults]:
680 """Query for the data IDs that connect nodes in the `QuantumGraph`.
682 This method populates `_TaskScaffolding.dataIds` and
683 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
685 Parameters
686 ----------
687 registry : `lsst.daf.butler.Registry`
688 Registry for the data repository; used for all data ID queries.
689 collections
690 Expressions representing the collections to search for input
691 datasets. See :ref:`daf_butler_ordered_collection_searches`.
692 userQuery : `str` or `None`
693 User-provided expression to limit the data IDs processed.
694 externalDataId : `DataCoordinate`
695 Externally-provided data ID that should be used to restrict the
696 results, just as if these constraints had been included via ``AND``
697 in ``userQuery``. This includes (at least) any instrument named
698 in the pipeline definition.
699 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
700 The query constraint variant that should be used to constrain the
701 query based on dataset existence, defaults to
702 `DatasetQueryConstraintVariant.ALL`.
703 bind : `Mapping`, optional
704 Mapping containing literal values that should be injected into the
705 ``userQuery`` expression, keyed by the identifiers they replace.
707 Returns
708 -------
709 commonDataIds : \
710 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
711 An interface to a database temporary table containing all data IDs
712 that will appear in this `QuantumGraph`. Returned inside a
713 context manager, which will drop the temporary table at the end of
714 the `with` block in which this method is called.
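
Examples
--------
A minimal sketch of the intended call pattern, here disabling the
dataset-existence constraint on the initial query (the other arguments are
assumed to be supplied by the caller)::

    data_id = DataCoordinate.makeEmpty(registry.dimensions)
    with scaffolding.connectDataIds(
        registry,
        collections,
        userQuery,
        data_id,
        datasetQueryConstraint=DatasetQueryConstraintVariant.OFF,
    ) as commonDataIds:
        # resolveDatasetRefs must run inside this block, while the
        # temporary table behind commonDataIds still exists.
        ...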
715 """
716 _LOG.debug("Building query for data IDs.")
717 # Initialization datasets always have empty data IDs.
718 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
719 for datasetType, refs in itertools.chain(
720 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
721 ):
722 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
723 # Run one big query for the data IDs for task dimensions and regular
724 # inputs and outputs. We limit the query to only dimensions that are
725 # associated with the input dataset types, but don't (yet) try to
726 # obtain the dataset_ids for those inputs.
727 _LOG.debug("Submitting data ID query and materializing results.")
728 queryArgs: Dict[str, Any] = {
729 "dimensions": self.dimensions,
730 "where": userQuery,
731 "dataId": externalDataId,
732 "bind": bind,
733 }
734 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
735 _LOG.debug("Constraining graph query using all datasets in pipeline.")
736 queryArgs["datasets"] = list(self.inputs)
737 queryArgs["collections"] = collections
738 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
739 _LOG.debug("Not using dataset existence to constrain query.")
740 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
741 constraint = set(datasetQueryConstraint)
742 inputs = {k.name: k for k in self.inputs.keys()}
743 if remainder := constraint.difference(inputs.keys()):
744 raise ValueError(
745 f"{remainder} dataset type(s) specified as a graph constraint, but"
746 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
747 )
748 _LOG.debug(f"Constraining graph query using {constraint}")
749 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
750 queryArgs["collections"] = collections
751 else:
752 raise ValueError(
753 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
754 )
756 if "datasets" in queryArgs:
757 for i, dataset_type in enumerate(queryArgs["datasets"]):
758 if dataset_type.isComponent():
759 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
761 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
762 _LOG.debug("Expanding data IDs.")
763 commonDataIds = commonDataIds.expanded()
764 _LOG.debug("Iterating over query results to associate quanta with datasets.")
765 # Iterate over query results, populating data IDs for datasets and
766 # quanta and then connecting them to each other.
767 n = -1
768 for n, commonDataId in enumerate(commonDataIds):
769 _LOG.debug("Next DataID = %s", commonDataId)
770 # Create DatasetRefs for all DatasetTypes from this result row,
771 # noting that we might have created some already.
772 # We remember both those that already existed and those that we
773 # create now.
774 refsForRow = {}
775 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
776 for datasetType, refs in itertools.chain(
777 self.inputs.items(), self.intermediates.items(), self.outputs.items()
778 ):
779 datasetDataId: Optional[DataCoordinate]
780 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
781 datasetDataId = commonDataId.subset(datasetType.dimensions)
782 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
783 ref = refs.get(datasetDataId)
784 if ref is None:
785 ref = DatasetRef(datasetType, datasetDataId)
786 _LOG.debug("Made new ref = %s", ref)
787 refs[datasetDataId] = ref
788 refsForRow[datasetType.name] = ref
789 # Create _QuantumScaffolding objects for all tasks from this
790 # result row, noting that we might have created some already.
791 for task in self.tasks:
792 quantumDataId = commonDataId.subset(task.dimensions)
793 quantum = task.quanta.get(quantumDataId)
794 if quantum is None:
795 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
796 task.quanta[quantumDataId] = quantum
797 # Whether this is a new quantum or an existing one, we can
798 # now associate the DatasetRefs for this row with it. The
799 # fact that a Quantum data ID and a dataset data ID both
800 # came from the same result row is what tells us they
801 # should be associated.
802 # Many of these associations will be duplicates (because
803 # another query row that differed from this one only in
804 # irrelevant dimensions already added them), and we use
805 # sets to skip.
806 for datasetType in task.inputs:
807 ref = refsForRow[datasetType.name]
808 quantum.inputs[datasetType.name][ref.dataId] = ref
809 for datasetType in task.outputs:
810 ref = refsForRow[datasetType.name]
811 quantum.outputs[datasetType.name][ref.dataId] = ref
812 if n < 0:
813 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
814 emptiness_explained = False
815 for message in commonDataIds.explain_no_results():
816 _LOG.critical(message)
817 emptiness_explained = True
818 if not emptiness_explained:
819 _LOG.critical(
820 "To reproduce this query for debugging purposes, run "
821 "Registry.queryDataIds with these arguments:"
822 )
823 # We could just repr() the queryArgs dict to get something
824 # the user could make sense of, but it's friendlier to
825 # put these args in an easier-to-construct equivalent form
826 # so they can read it more easily and copy and paste into
827 # a Python terminal.
828 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
829 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
830 if queryArgs["where"]:
831 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
832 if "datasets" in queryArgs:
833 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
834 if "collections" in queryArgs:
835 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
836 _LOG.debug("Finished processing %d rows from data ID query.", n)
837 yield commonDataIds
839 def resolveDatasetRefs(
840 self,
841 registry: Registry,
842 collections: Any,
843 run: Optional[str],
844 commonDataIds: DataCoordinateQueryResults,
845 *,
846 skipExistingIn: Any = None,
847 clobberOutputs: bool = True,
848 constrainedByAllDatasets: bool = True,
849 resolveRefs: bool = False,
850 ) -> None:
851 """Perform follow up queries for each dataset data ID produced in
852 `fillDataIds`.
854 This method populates `_DatasetScaffolding.refs` (except for those in
855 `prerequisites`).
857 Parameters
858 ----------
859 registry : `lsst.daf.butler.Registry`
860 Registry for the data repository; used for all data ID queries.
861 collections
862 Expressions representing the collections to search for input
863 datasets. See :ref:`daf_butler_ordered_collection_searches`.
864 run : `str`, optional
865 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
866 output datasets, if it already exists.
867 commonDataIds : \
868 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
869 Result of a previous call to `connectDataIds`.
870 skipExistingIn
871 Expressions representing the collections to search for existing
872 output datasets that should be skipped. See
873 :ref:`daf_butler_ordered_collection_searches` for allowed types.
874 `None` or empty string/sequence disables skipping.
875 clobberOutputs : `bool`, optional
876 If `True` (default), allow quanta to be created even if outputs exist;
877 this requires the same behavior to be enabled when
878 executing. If ``skipExistingIn`` is not `None`, completed quanta
879 (those with metadata, or all outputs if there is no metadata
880 dataset configured) will be skipped rather than clobbered.
881 constrainedByAllDatasets : `bool`, optional
882 Indicates if the commonDataIds were generated with a constraint on
883 all dataset types.
884 resolveRefs : `bool`, optional
885 If `True` then resolve all input references and generate random
886 dataset IDs for all output and intermediate datasets. A `True` value
887 requires the ``run`` collection to be specified.
889 Raises
890 ------
891 OutputExistsError
892 Raised if an output dataset already exists in the output run
893 and ``skipExistingIn`` does not include output run, or if only
894 some outputs are present and ``clobberOutputs`` is `False`.
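
Examples
--------
A hedged sketch of a typical call inside the `connectDataIds` context block
(argument values are illustrative; ``resolveRefs=True`` requires ``run`` to
be set)::

    scaffolding.resolveDatasetRefs(
        registry,
        collections,
        run,
        commonDataIds,
        skipExistingIn=[run],
        clobberOutputs=True,
        resolveRefs=True,
    )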
895 """
896 # Run may be provided but it does not have to exist; in that case we
897 # use it for resolving references but don't check it for existing refs.
898 run_exists = False
899 if run:
900 try:
901 run_exists = bool(registry.queryCollections(run))
902 except MissingCollectionError:
903 # An undocumented exception is raised if it does not exist.
904 pass
906 skip_collections_wildcard: CollectionWildcard | None = None
907 skipExistingInRun = False
908 if skipExistingIn:
909 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
910 if run_exists:
911 # As an optimization, check the explicit list of names first.
912 skipExistingInRun = run in skip_collections_wildcard.strings
913 if not skipExistingInRun:
914 # need to flatten it and check again
915 skipExistingInRun = run in registry.queryCollections(
916 skipExistingIn,
917 collectionTypes=CollectionType.RUN,
918 )
920 idMaker: Optional[_DatasetIdMaker] = None
921 if resolveRefs:
922 assert run is not None, "run cannot be None when resolveRefs is True"
923 idMaker = _DatasetIdMaker(registry, run)
925 resolvedRefQueryResults: Iterable[DatasetRef]
927 # Look up [init] intermediate and output datasets in the output
928 # collection, if there is an output collection.
929 if run_exists or skip_collections_wildcard is not None:
930 for datasetType, refs in itertools.chain(
931 self.initIntermediates.items(),
932 self.initOutputs.items(),
933 self.intermediates.items(),
934 self.outputs.items(),
935 ):
936 _LOG.debug(
937 "Resolving %d datasets for intermediate and/or output dataset %s.",
938 len(refs),
939 datasetType.name,
940 )
941 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
942 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
943 # TODO: this assert incorrectly bans component inputs;
944 # investigate on DM-33027.
945 # assert not datasetType.isComponent(), \
946 # "Output datasets cannot be components."
947 #
948 # Instead we have to handle them manually to avoid a
949 # deprecation warning, but it is at least confusing and
950 # possibly a bug for components to appear here at all.
951 if datasetType.isComponent():
952 parent_dataset_type = datasetType.makeCompositeDatasetType()
953 component = datasetType.component()
954 else:
955 parent_dataset_type = datasetType
956 component = None
958 # look at RUN collection first
959 if run_exists:
960 try:
961 resolvedRefQueryResults = subset.findDatasets(
962 parent_dataset_type, collections=run, findFirst=True
963 )
964 except MissingDatasetTypeError:
965 resolvedRefQueryResults = []
966 for resolvedRef in resolvedRefQueryResults:
967 # TODO: we could easily support per-DatasetType
968 # skipExisting and I could imagine that being useful -
969 # it's probably required in order to support writing
970 # initOutputs before QuantumGraph generation.
971 assert resolvedRef.dataId in refs
972 if not (skipExistingInRun or isInit or clobberOutputs):
973 raise OutputExistsError(
974 f"Output dataset {datasetType.name} already exists in "
975 f"output RUN collection '{run}' with data ID"
976 f" {resolvedRef.dataId}."
977 )
978 # If we are going to resolve all outputs then we have
979 # to remember existing ones to avoid generating new
980 # dataset IDs for them.
981 if resolveRefs:
982 refs[resolvedRef.dataId] = (
983 resolvedRef.makeComponentRef(component)
984 if component is not None
985 else resolvedRef
986 )
988 # Also check skipExistingIn; if the RUN collection is in
989 # it, that case was already handled above.
990 if skip_collections_wildcard is not None:
991 try:
992 resolvedRefQueryResults = subset.findDatasets(
993 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True
994 )
995 except MissingDatasetTypeError:
996 resolvedRefQueryResults = []
997 for resolvedRef in resolvedRefQueryResults:
998 assert resolvedRef.dataId in refs
999 refs[resolvedRef.dataId] = (
1000 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1001 )
1003 # Look up input and initInput datasets in the input collection(s).
1004 # Container to accumulate unfound refs if the common dataIds were not
1005 # constrained on dataset type existence.
1006 self.unfoundRefs = set()
1007 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1008 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
1009 if datasetType.isComponent():
1010 parent_dataset_type = datasetType.makeCompositeDatasetType()
1011 component = datasetType.component()
1012 else:
1013 parent_dataset_type = datasetType
1014 component = None
1015 try:
1016 resolvedRefQueryResults = commonDataIds.subset(
1017 datasetType.dimensions, unique=True
1018 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1019 except MissingDatasetTypeError:
1020 resolvedRefQueryResults = []
1021 dataIdsNotFoundYet = set(refs.keys())
1022 for resolvedRef in resolvedRefQueryResults:
1023 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1024 refs[resolvedRef.dataId] = (
1025 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1026 )
1027 if dataIdsNotFoundYet:
1028 if constrainedByAllDatasets:
1029 raise RuntimeError(
1030 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1031 f"'{datasetType.name}' was/were present in a previous "
1032 "query, but could not be found now. "
1033 "This is either a logic bug in QuantumGraph generation "
1034 "or the input collections have been modified since "
1035 "QuantumGraph generation began."
1036 )
1037 else:
1038 # If the common dataIds were not constrained using all the
1039 # input dataset types, it is possible that some data IDs
1040 # found do not correspond to existing datasets, and their
1041 # refs will remain unresolved. Mark these for later pruning
1042 # from the quantum graph.
1043 for k in dataIdsNotFoundYet:
1044 self.unfoundRefs.add(refs[k])
1046 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1047 # replacing the unresolved refs there, and then look up prerequisites.
1048 for task in self.tasks:
1049 _LOG.debug(
1050 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1051 len(task.quanta),
1052 task.taskDef.label,
1053 )
1054 # The way iterConnections is designed makes it impossible to
1055 # annotate precisely enough to satisfy MyPy here.
1056 lookupFunctions = {
1057 c.name: c.lookupFunction # type: ignore
1058 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1059 if c.lookupFunction is not None # type: ignore
1060 }
1061 dataIdsFailed = []
1062 dataIdsSucceeded = []
1063 for quantum in task.quanta.values():
1064 # Process output datasets only if skipExistingIn is not None
1065 # or there is a run to look for outputs in and clobberOutputs
1066 # is True. Note that if skipExistingIn is None, any output
1067 # datasets that already exist would have already caused an
1068 # exception to be raised. We never update the DatasetRefs in
1069 # the quantum because those should never be resolved.
1070 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1071 resolvedRefs = []
1072 unresolvedRefs = []
1073 haveMetadata = False
1074 for datasetType, originalRefs in quantum.outputs.items():
1075 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1076 if ref.id is not None:
1077 resolvedRefs.append(ref)
1078 if datasetType.name == task.taskDef.metadataDatasetName:
1079 haveMetadata = True
1080 else:
1081 unresolvedRefs.append(ref)
1082 if resolvedRefs:
1083 if haveMetadata or not unresolvedRefs:
1084 dataIdsSucceeded.append(quantum.dataId)
1085 if skip_collections_wildcard is not None:
1086 continue
1087 else:
1088 dataIdsFailed.append(quantum.dataId)
1089 if not clobberOutputs:
1090 raise OutputExistsError(
1091 f"Quantum {quantum.dataId} of task with label "
1092 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1093 f"({resolvedRefs}) "
1094 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1095 "and clobbering outputs was not enabled."
1096 )
1097 # Update the input DatasetRefs to the resolved ones we already
1098 # searched for.
1099 for datasetType, input_refs in quantum.inputs.items():
1100 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1101 input_refs[ref.dataId] = ref
1102 # Look up prerequisite datasets in the input collection(s).
1103 # These may have dimensions that extend beyond those we queried
1104 # for originally, because we want to permit those data ID
1105 # values to differ across quanta and dataset types.
1106 for datasetType in task.prerequisites:
1107 if datasetType.isComponent():
1108 parent_dataset_type = datasetType.makeCompositeDatasetType()
1109 component = datasetType.component()
1110 else:
1111 parent_dataset_type = datasetType
1112 component = None
1113 lookupFunction = lookupFunctions.get(datasetType.name)
1114 if lookupFunction is not None:
1115 # PipelineTask has provided its own function to do the
1116 # lookup. This always takes precedence.
1117 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1118 elif (
1119 datasetType.isCalibration()
1120 and datasetType.dimensions <= quantum.dataId.graph
1121 and quantum.dataId.graph.temporal
1122 ):
1123 # This is a master calibration lookup, which we have to
1124 # handle specially because the query system can't do a
1125 # temporal join on a non-dimension-based timespan yet.
1126 timespan = quantum.dataId.timespan
1127 try:
1128 prereq_ref = registry.findDataset(
1129 parent_dataset_type,
1130 quantum.dataId,
1131 collections=collections,
1132 timespan=timespan,
1133 )
1134 if prereq_ref is not None:
1135 if component is not None:
1136 prereq_ref = prereq_ref.makeComponentRef(component)
1137 prereq_refs = [prereq_ref]
1138 else:
1139 prereq_refs = []
1140 except (KeyError, MissingDatasetTypeError):
1141 # This dataset type is not present in the registry,
1142 # which just means there are no datasets here.
1143 prereq_refs = []
1144 else:
1145 # Most general case.
1146 prereq_refs = [
1147 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1148 for prereq_ref in registry.queryDatasets(
1149 parent_dataset_type,
1150 collections=collections,
1151 dataId=quantum.dataId,
1152 findFirst=True,
1153 ).expanded()
1154 ]
1155 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None}
1156 quantum.prerequisites[datasetType].update(prereq_refs_map)
1157 task.prerequisites[datasetType].update(prereq_refs_map)
1159 # Resolve all quantum inputs and outputs.
1160 if idMaker:
1161 for datasetDict in (quantum.inputs, quantum.outputs):
1162 for refDict in datasetDict.values():
1163 refDict.update(idMaker.resolveDict(refDict))
1165 # Resolve task initInputs and initOutputs.
1166 if idMaker:
1167 for datasetDict in (task.initInputs, task.initOutputs):
1168 for refDict in datasetDict.values():
1169 refDict.update(idMaker.resolveDict(refDict))
1171 # Actually remove any quanta that we decided to skip above.
1172 if dataIdsSucceeded:
1173 if skip_collections_wildcard is not None:
1174 _LOG.debug(
1175 "Pruning successful %d quanta for task with label '%s' because all of their "
1176 "outputs exist or metadata was written successfully.",
1177 len(dataIdsSucceeded),
1178 task.taskDef.label,
1179 )
1180 for dataId in dataIdsSucceeded:
1181 del task.quanta[dataId]
1182 elif clobberOutputs:
1183 _LOG.info(
1184 "Found %d successful quanta for task with label '%s' "
1185 "that will need to be clobbered during execution.",
1186 len(dataIdsSucceeded),
1187 task.taskDef.label,
1188 )
1189 else:
1190 raise AssertionError("OutputExistsError should have already been raised.")
1191 if dataIdsFailed:
1192 if clobberOutputs:
1193 _LOG.info(
1194 "Found %d failed/incomplete quanta for task with label '%s' "
1195 "that will need to be clobbered during execution.",
1196 len(dataIdsFailed),
1197 task.taskDef.label,
1198 )
1199 else:
1200 raise AssertionError("OutputExistsError should have already been raised.")
1202 # Collect initOutputs that do not belong to any task.
1203 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1204 for task in self.tasks:
1205 global_dataset_types -= set(task.initOutputs)
1206 if global_dataset_types:
1207 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1208 if idMaker is not None:
1209 for refDict in self.globalInitOutputs.values():
1210 refDict.update(idMaker.resolveDict(refDict))
1212 def makeQuantumGraph(
1213 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None
1214 ) -> QuantumGraph:
1215 """Create a `QuantumGraph` from the quanta already present in
1216 the scaffolding data structure.
1218 Parameters
1219 ----------
1220 metadata : Optional Mapping of `str` to primitives
1221 This is an optional parameter of extra data to carry with the
1222 graph. Entries in this mapping should be able to be serialized in
1223 JSON.
1224 datastore : `Datastore`, optional
1225 If not `None` then fill datastore records in each generated
1226 Quantum.
1228 Returns
1229 -------
1230 graph : `QuantumGraph`
1231 The full `QuantumGraph`.
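
Examples
--------
A minimal sketch; passing a `Datastore` attaches its exported records to each
quantum (``datastore`` and the metadata mapping are illustrative)::

    qgraph = scaffolding.makeQuantumGraph(
        metadata={"comment": "example run"}, datastore=datastore
    )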
1232 """
1234 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1235 """Extract all DatasetRefs from the dictionaries"""
1236 for ref_dict in dataset_dict.values():
1237 yield from ref_dict.values()
1239 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1240 if datastore is not None:
1241 datastore_records = datastore.export_records(
1242 itertools.chain(
1243 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1244 )
1245 )
1247 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1248 for task in self.tasks:
1249 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1250 graphInput[task.taskDef] = qset
1252 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1253 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1255 globalInitOutputs: list[DatasetRef] = []
1256 if self.globalInitOutputs is not None:
1257 for refs_dict in self.globalInitOutputs.values():
1258 globalInitOutputs.extend(refs_dict.values())
1260 graph = QuantumGraph(
1261 graphInput,
1262 metadata=metadata,
1263 pruneRefs=self.unfoundRefs,
1264 universe=self.dimensions.universe,
1265 initInputs=taskInitInputs,
1266 initOutputs=taskInitOutputs,
1267 globalInitOutputs=globalInitOutputs,
1268 )
1269 return graph
1272# ------------------------
1273# Exported definitions --
1274# ------------------------
1277class GraphBuilderError(Exception):
1278 """Base class for exceptions generated by graph builder."""
1280 pass
1283class OutputExistsError(GraphBuilderError):
1284 """Exception generated when output datasets already exist."""
1286 pass
1289class PrerequisiteMissingError(GraphBuilderError):
1290 """Exception generated when a prerequisite dataset does not exist."""
1292 pass
1295class GraphBuilder:
1296 """GraphBuilder class is responsible for building task execution graph from
1297 a Pipeline.
1299 Parameters
1300 ----------
1301 registry : `~lsst.daf.butler.Registry`
1302 Registry for the data repository, used for all data ID queries.
1303 skipExistingIn
1304 Expressions representing the collections to search for existing
1305 output datasets that should be skipped. See
1306 :ref:`daf_butler_ordered_collection_searches`.
1307 clobberOutputs : `bool`, optional
1308 If `True` (default), allow quanta to be created even if partial outputs
1309 exist; this requires the same behavior to be enabled when
1310 executing.
1311 datastore : `Datastore`, optional
1312 If not `None` then fill datastore records in each generated Quantum.
1313 """
1315 def __init__(
1316 self,
1317 registry: Registry,
1318 skipExistingIn: Any = None,
1319 clobberOutputs: bool = True,
1320 datastore: Optional[Datastore] = None,
1321 ):
1322 self.registry = registry
1323 self.dimensions = registry.dimensions
1324 self.skipExistingIn = skipExistingIn
1325 self.clobberOutputs = clobberOutputs
1326 self.datastore = datastore
1328 def makeGraph(
1329 self,
1330 pipeline: Union[Pipeline, Iterable[TaskDef]],
1331 collections: Any,
1332 run: Optional[str],
1333 userQuery: Optional[str],
1334 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1335 metadata: Optional[Mapping[str, Any]] = None,
1336 resolveRefs: bool = False,
1337 bind: Optional[Mapping[str, Any]] = None,
1338 ) -> QuantumGraph:
1339 """Create execution graph for a pipeline.
1341 Parameters
1342 ----------
1343 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1344 Pipeline definition, task names/classes and their configs.
1345 collections
1346 Expressions representing the collections to search for input
1347 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1348 run : `str`, optional
1349 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1350 output datasets. The collection does not have to exist; it will be
1351 created when the graph is executed.
1352 userQuery : `str`
1353 String that defines a user-provided selection for the registry; should be
1354 empty or `None` if there are no restrictions on data selection.
1355 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1356 The query constraint variant that should be used to constrain the
1357 query based on dataset existence, defaults to
1358 `DatasetQueryConstraintVariant.ALL`.
1359 metadata : Optional Mapping of `str` to primitives
1360 This is an optional parameter of extra data to carry with the
1361 graph. Entries in this mapping should be able to be serialized in
1362 JSON.
1363 resolveRefs : `bool`, optional
1364 If `True` then resolve all input references and generate random
1365 dataset IDs for all output and intermediate datasets. A `True` value
1366 requires the ``run`` collection to be specified.
1367 bind : `Mapping`, optional
1368 Mapping containing literal values that should be injected into the
1369 ``userQuery`` expression, keyed by the identifiers they replace.
1371 Returns
1372 -------
1373 graph : `QuantumGraph`
1374 The constructed execution graph.
1375 Raises
1376 ------
1377 UserExpressionError
1378 Raised when user expression cannot be parsed.
1379 OutputExistsError
1380 Raised when output datasets already exist.
1381 Exception
1382 Other exceptions types may be raised by underlying registry
1383 classes.
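
Examples
--------
A hedged usage sketch (assumes ``registry`` is the `~lsst.daf.butler.Registry`
for the repository and ``pipeline`` is a `Pipeline` whose task classes are
importable; the collection names and query string are hypothetical)::

    builder = GraphBuilder(registry, clobberOutputs=True)
    qgraph = builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],
        run="u/someone/example-run",
        userQuery="instrument = 'HSC' AND visit = 1228",
    )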
1384 """
1385 if resolveRefs and run is None:
1386 raise ValueError("`resolveRefs` requires `run` parameter.")
1387 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1388 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1389 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1390 instrument_class: Optional[Any] = None
1391 if isinstance(pipeline, Pipeline):
1392 instrument_class_name = pipeline.getInstrument()
1393 if instrument_class_name is not None:
1394 instrument_class = doImportType(instrument_class_name)
1395 pipeline = list(pipeline.toExpandedPipeline())
1396 if instrument_class is not None:
1397 dataId = DataCoordinate.standardize(
1398 instrument=instrument_class.getName(), universe=self.registry.dimensions
1399 )
1400 else:
1401 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1402 with scaffolding.connectDataIds(
1403 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1404 ) as commonDataIds:
1405 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1406 scaffolding.resolveDatasetRefs(
1407 self.registry,
1408 collections,
1409 run,
1410 commonDataIds,
1411 skipExistingIn=self.skipExistingIn,
1412 clobberOutputs=self.clobberOutputs,
1413 constrainedByAllDatasets=condition,
1414 resolveRefs=resolveRefs,
1415 )
1416 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)