1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap, defaultdict
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
38from lsst.daf.butler import (
39 CollectionType,
40 DataCoordinate,
41 DatasetIdGenEnum,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
56from lsst.utils import doImportType
58from ._datasetQueryConstraints import DatasetQueryConstraintVariant
59from ._status import NoWorkFound
61# -----------------------------
62# Imports for other modules --
63# -----------------------------
64from .connections import AdjustQuantumHelper, iterConnections
65from .graph import QuantumGraph
66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
68# ----------------------------------
69# Local non-exported definitions --
70# ----------------------------------
72_LOG = logging.getLogger(__name__)
75class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
76 """A custom dictionary that maps `DatasetType` to a nested dictionary of
77 the known `DatasetRef` instances of that type.
79 Parameters
80 ----------
81 args
82 Positional arguments are forwarded to the `dict` constructor.
83 universe : `DimensionUniverse`
84 Universe of all possible dimensions.
85 """
87 def __init__(self, *args: Any, universe: DimensionUniverse):
88 super().__init__(*args)
89 self.universe = universe
91 @classmethod
92 def fromDatasetTypes(
93 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
94 ) -> _DatasetDict:
95 """Construct a dictionary from a flat iterable of `DatasetType` keys.
97 Parameters
98 ----------
99 datasetTypes : `iterable` of `DatasetType`
100 DatasetTypes to use as keys for the dict. Values will be empty
101 dictionaries.
102 universe : `DimensionUniverse`
103 Universe of all possible dimensions.
105 Returns
106 -------
107 dictionary : `_DatasetDict`
108 A new `_DatasetDict` instance.
109 """
110 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
112 @classmethod
113 def fromSubset(
114 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
115 ) -> _DatasetDict:
116 """Return a new dictionary by extracting items corresponding to the
117 given keys from one or more existing dictionaries.
119 Parameters
120 ----------
121 datasetTypes : `iterable` of `DatasetType`
122 DatasetTypes to use as keys for the dict. Values will be obtained
123 by lookups against ``first`` and ``rest``.
124 first : `_DatasetDict`
125 Another dictionary from which to extract values.
126 rest
127 Additional dictionaries from which to extract values.
129 Returns
130 -------
131 dictionary : `_DatasetDict`
132 A new dictionary instance.
133 """
134 combined = ChainMap(first, *rest)
136 # Dataset types known to match immediately can be processed
137 # without checks.
138 matches = combined.keys() & set(datasetTypes)
139 _dict = {k: combined[k] for k in matches}
141 if len(_dict) < len(datasetTypes):
142 # Work out which ones are missing.
143 missing_datasetTypes = set(datasetTypes) - _dict.keys()
145 # Get the known names for comparison.
146 combined_by_name = {k.name: k for k in combined}
148 missing = set()
149 incompatible = {}
150 for datasetType in missing_datasetTypes:
151 # The dataset type was not found. It may not be listed
152 # at all, or it may be present with the same name
153 # but a different definition.
154 if datasetType.name in combined_by_name:
155 # This implies some inconsistency in the connection
156 # definitions. If there is support for storage
157 # class conversion we can let it slide.
158 # At this point we do not know where the
159 # inconsistency is, but trust that downstream
160 # code will be more explicit about input
161 # vs output incompatibilities.
162 existing = combined_by_name[datasetType.name]
163 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
164 _LOG.warning(
165 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
166 datasetType,
167 existing,
168 )
169 _dict[datasetType] = combined[existing]
170 else:
171 incompatible[datasetType] = existing
172 else:
173 missing.add(datasetType)
175 if missing or incompatible:
176 reasons = []
177 if missing:
178 reasons.append(
179 f"DatasetTypes {', '.join(d.name for d in missing)} not present in list of known types: "
180 + ", ".join(d.name for d in combined)
181 )
182 if incompatible:
183 for x, y in incompatible.items():
184 reasons.append(f"{x} incompatible with {y}")
185 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
187 return cls(_dict, universe=first.universe)
189 @property
190 def dimensions(self) -> DimensionGraph:
191 """The union of all dimensions used by all dataset types in this
192 dictionary, including implied dependencies (`DimensionGraph`).
193 """
194 base = self.universe.empty
195 if len(self) == 0:
196 return base
197 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
199 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
200 """Unpack nested single-element `DatasetRef` dicts into a new
201 mapping with `DatasetType` keys and `DatasetRef` values.
203 This method assumes that each nested dictionary contains exactly one
204 item, as is the case for all "init" datasets.
206 Returns
207 -------
208 dictionary : `NamedKeyDict`
209 Dictionary mapping `DatasetType` to `DatasetRef`, with both
210 `DatasetType` instances and string names usable as keys.
211 """
213 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
214 (ref,) = refs.values()
215 return ref
217 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
219 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
220 """Unpack nested multi-element `DatasetRef` dicts into a new
221 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
223 Returns
224 -------
225 dictionary : `NamedKeyDict`
226 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
227 both `DatasetType` instances and string names usable as keys.
228 """
229 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
231 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
232 """Iterate over the contained `DatasetRef` instances that match the
233 given `DatasetType` and data IDs.
235 Parameters
236 ----------
237 datasetType : `DatasetType`
238 Dataset type to match.
239 dataIds : `Iterable` [ `DataCoordinate` ]
240 Data IDs to match.
242 Returns
243 -------
244 refs : `Iterator` [ `DatasetRef` ]
245 DatasetRef instances for which ``ref.datasetType == datasetType``
246 and ``ref.dataId`` is in ``dataIds``.
247 """
248 refs = self[datasetType]
249 return (refs[dataId] for dataId in dataIds)
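# Illustrative sketch, not part of this module: how the nested
# DatasetType -> {DataCoordinate: DatasetRef} layout above is typically used.
# The `dataset_type`, `data_id`, and `universe` arguments are assumed to come
# from an existing data repository.
def _example_dataset_dict_usage(
    dataset_type: DatasetType, data_id: DataCoordinate, universe: DimensionUniverse
) -> None:
    dd = _DatasetDict.fromDatasetTypes([dataset_type], universe=universe)
    # Nested dicts start out empty and are keyed by data ID once populated.
    dd[dataset_type][data_id] = DatasetRef(dataset_type, data_id)
    # Flatten to DatasetType -> list[DatasetRef] for Quantum construction ...
    multi = dd.unpackMultiRefs()
    assert multi[dataset_type][0].dataId == data_id
    # ... or to DatasetType -> DatasetRef for init datasets, which always have
    # exactly one ref per dataset type.
    single = dd.unpackSingleRefs()
    assert single[dataset_type].dataId == data_id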
252class _QuantumScaffolding:
253 """Helper class aggregating information about a `Quantum`, used when
254 constructing a `QuantumGraph`.
256 See `_PipelineScaffolding` for a top-down description of the full
257 scaffolding data structure.
259 Parameters
260 ----------
261 task : _TaskScaffolding
262 Back-reference to the helper object for the `PipelineTask` this quantum
263 represents an execution of.
264 dataId : `DataCoordinate`
265 Data ID for this quantum.
266 """
268 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
269 self.task = task
270 self.dataId = dataId
271 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
272 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
273 self.prerequisites = _DatasetDict.fromDatasetTypes(
274 task.prerequisites.keys(), universe=dataId.universe
275 )
277 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
279 def __repr__(self) -> str:
280 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
282 task: _TaskScaffolding
283 """Back-reference to the helper object for the `PipelineTask` this quantum
284 represents an execution of.
285 """
287 dataId: DataCoordinate
288 """Data ID for this quantum.
289 """
291 inputs: _DatasetDict
292 """Nested dictionary containing `DatasetRef` inputs to this quantum.
294 This is initialized to map each `DatasetType` to an empty dictionary at
295 construction. Those nested dictionaries are populated (with data IDs as
296 keys) with unresolved `DatasetRef` instances in
297 `_PipelineScaffolding.connectDataIds`.
298 """
300 outputs: _DatasetDict
301 """Nested dictionary containing `DatasetRef` outputs of this quantum.
302 """
304 prerequisites: _DatasetDict
305 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
306 quantum.
307 """
309 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
310 """Transform the scaffolding object into a true `Quantum` instance.
312 Parameters
313 ----------
314 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
315 If not `None` then fill datastore records in the generated Quantum
316 using the records from this structure.
318 Returns
319 -------
320 quantum : `Quantum`
321 An actual `Quantum` instance.
322 """
323 allInputs = self.inputs.unpackMultiRefs()
324 allInputs.update(self.prerequisites.unpackMultiRefs())
325 # Give the task's Connections class an opportunity to remove some
326 # inputs, or complain if they are unacceptable.
327 # This will raise if one of the check conditions is not met, which is
328 # the intended behavior.
329 # If it raises NoWorkFound, there is a bug in the QG algorithm
330 # or the adjustQuantum is incorrectly trying to make a prerequisite
331 # input behave like a regular input; adjustQuantum should only raise
332 # NoWorkFound if a regular input is missing, and it shouldn't be
333 # possible for us to have generated ``self`` if that's true.
334 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
335 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
336 initInputs = self.task.initInputs.unpackSingleRefs()
337 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
338 if datastore_records is not None:
339 quantum_records = {}
340 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
341 input_refs += list(initInputs.values())
342 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
343 for datastore_name, records in datastore_records.items():
344 matching_records = records.subset(input_ids)
345 if matching_records is not None:
346 quantum_records[datastore_name] = matching_records
347 return Quantum(
348 taskName=self.task.taskDef.taskName,
349 taskClass=self.task.taskDef.taskClass,
350 dataId=self.dataId,
351 initInputs=initInputs,
352 inputs=helper.inputs,
353 outputs=helper.outputs,
354 datastore_records=quantum_records,
355 )
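# Illustrative sketch, not part of this module: building a single `Quantum`
# with datastore records attached, as `makeQuantumSet` and `makeQuantumGraph`
# do further below.  `scaffolding` is assumed to be a populated, fully
# resolved `_QuantumScaffolding` and `datastore` a `Datastore` for the same
# repository.
def _example_make_quantum(scaffolding: _QuantumScaffolding, datastore: Datastore) -> Quantum:
    # Export datastore records for the resolved input refs so that the
    # resulting Quantum can later be executed without registry access.
    input_refs = [ref for refs in scaffolding.inputs.values() for ref in refs.values()]
    records = datastore.export_records(input_refs)
    quantum = scaffolding.makeQuantum(datastore_records=records)
    assert quantum.dataId == scaffolding.dataId
    return quantum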
358@dataclass
359class _TaskScaffolding:
360 """Helper class aggregating information about a `PipelineTask`, used when
361 constructing a `QuantumGraph`.
363 See `_PipelineScaffolding` for a top-down description of the full
364 scaffolding data structure.
366 Parameters
367 ----------
368 taskDef : `TaskDef`
369 Data structure that identifies the task class and its config.
370 parent : `_PipelineScaffolding`
371 The parent data structure that will hold the instance being
372 constructed.
373 datasetTypes : `TaskDatasetTypes`
374 Data structure that categorizes the dataset types used by this task.
375 """
377 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
378 universe = parent.dimensions.universe
379 self.taskDef = taskDef
380 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
381 assert self.dimensions.issubset(parent.dimensions)
382 # Initialize _DatasetDicts as subsets of the one or two
383 # corresponding dicts in the parent _PipelineScaffolding.
384 self.initInputs = _DatasetDict.fromSubset(
385 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
386 )
387 self.initOutputs = _DatasetDict.fromSubset(
388 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
389 )
390 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
391 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
392 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
393 self.dataIds: Set[DataCoordinate] = set()
394 self.quanta = {}
396 def __repr__(self) -> str:
397 # Default dataclass-injected __repr__ gets caught in an infinite loop
398 # because of back-references.
399 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
401 taskDef: TaskDef
402 """Data structure that identifies the task class and its config
403 (`TaskDef`).
404 """
406 dimensions: DimensionGraph
407 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
408 """
410 initInputs: _DatasetDict
411 """Dictionary containing information about datasets used to construct this
412 task (`_DatasetDict`).
413 """
415 initOutputs: _DatasetDict
416 """Dictionary containing information about datasets produced as a
417 side-effect of constructing this task (`_DatasetDict`).
418 """
420 inputs: _DatasetDict
421 """Dictionary containing information about datasets used as regular,
422 graph-constraining inputs to this task (`_DatasetDict`).
423 """
425 outputs: _DatasetDict
426 """Dictionary containing information about datasets produced by this task
427 (`_DatasetDict`).
428 """
430 prerequisites: _DatasetDict
431 """Dictionary containing information about input datasets that must be
432 present in the repository before any Pipeline containing this task is run
433 (`_DatasetDict`).
434 """
436 quanta: Dict[DataCoordinate, _QuantumScaffolding]
437 """Dictionary mapping data ID to a scaffolding object for the Quantum of
438 this task with that data ID.
439 """
441 def makeQuantumSet(
442 self,
443 unresolvedRefs: Optional[Set[DatasetRef]] = None,
444 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
445 ) -> Set[Quantum]:
446 """Create a `set` of `Quantum` from the information in ``self``.
448 Parameters
449 ----------
450 unresolvedRefs : `set` [ `DatasetRef` ], optional
451 Input dataset refs that have not been found.
452 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
453 If not `None` then fill datastore records in each generated Quantum
454 using the records from this structure.
455 Returns
456 -------
457 nodes : `set` of `Quantum`
458 The `Quantum` elements corresponding to this task.
459 """
460 if unresolvedRefs is None:
461 unresolvedRefs = set()
462 outputs = set()
463 for q in self.quanta.values():
464 try:
465 tmpQuantum = q.makeQuantum(datastore_records)
466 outputs.add(tmpQuantum)
467 except (NoWorkFound, FileNotFoundError) as exc:
468 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
469 if unresolvedRefs.intersection(refs):
470 # This means it is a node that is known to be pruned
471 # later and should be left in even though some follow-up
472 # queries fail. This allows the pruning to start from this
473 # quantum with known issues, and prune other nodes it
474 # touches.
475 inputs = q.inputs.unpackMultiRefs()
476 inputs.update(q.prerequisites.unpackMultiRefs())
477 tmpQuantum = Quantum(
478 taskName=q.task.taskDef.taskName,
479 taskClass=q.task.taskDef.taskClass,
480 dataId=q.dataId,
481 initInputs=q.task.initInputs.unpackSingleRefs(),
482 inputs=inputs,
483 outputs=q.outputs.unpackMultiRefs(),
484 )
485 outputs.add(tmpQuantum)
486 else:
487 raise exc
488 return outputs
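# Illustrative sketch, not part of this module: collecting the Quantum nodes
# for every task after `_PipelineScaffolding.resolveDatasetRefs` has run.
# `scaffolding` is assumed to be a fully resolved `_PipelineScaffolding`.
def _example_make_quantum_sets(scaffolding: _PipelineScaffolding) -> Dict[TaskDef, Set[Quantum]]:
    quanta_by_task: Dict[TaskDef, Set[Quantum]] = {}
    for task in scaffolding.tasks:
        # Each task contributes one Quantum per surviving quantum data ID;
        # quanta whose inputs appear in scaffolding.unfoundRefs are kept so
        # that later pruning can start from them.
        quanta_by_task[task.taskDef] = task.makeQuantumSet(unresolvedRefs=scaffolding.unfoundRefs)
    return quanta_by_task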
491class _DatasetIdMaker:
492 """Helper class which generates random dataset UUIDs for unresolved
493 datasets.
494 """
496 def __init__(self, registry: Registry, run: str):
497 self.datasetIdFactory = registry.datasetIdFactory
498 self.run = run
499 # Dataset IDs generated so far
500 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
502 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
503 if ref.id is not None:
504 return ref
506 # For components we need their parent dataset ID.
507 if ref.isComponent():
508 parent_ref = ref.makeCompositeRef()
509 # Basic sanity check: the parent should be resolved if this is an
510 # existing input, or it should be in the cache already if it is
511 # an intermediate.
512 if parent_ref.id is None:
513 key = parent_ref.datasetType, parent_ref.dataId
514 if key not in self.resolved:
515 raise ValueError(f"Composite dataset is missing from cache: {parent_ref}")
516 parent_ref = self.resolved[key]
517 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved"
518 return ref.resolved(parent_ref.id, parent_ref.run)
520 key = ref.datasetType, ref.dataId
521 if (resolved := self.resolved.get(key)) is None:
522 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE)
523 self.resolved[key] = resolved
524 return resolved
526 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
527 """Resolve all unresolved references in the provided dictionary."""
528 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
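# Illustrative sketch, not part of this module: the caching behaviour of
# `_DatasetIdMaker`.  `registry` is assumed to be a `Registry` for an existing
# repository and `ref` an unresolved, non-component `DatasetRef`; the run name
# is a made-up placeholder.
def _example_dataset_id_maker(registry: Registry, ref: DatasetRef) -> None:
    maker = _DatasetIdMaker(registry, run="u/example/run")
    first = maker.resolveRef(ref)
    second = maker.resolveRef(ref)
    # The (DatasetType, DataCoordinate) key is cached, so repeated lookups of
    # the same dataset return the same generated UUID.
    assert first.id is not None and first.id == second.id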
531@dataclass
532class _PipelineScaffolding:
533 """A helper data structure that organizes the information involved in
534 constructing a `QuantumGraph` for a `Pipeline`.
536 Parameters
537 ----------
538 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
539 Sequence of tasks from which a graph is to be constructed. Must
540 have nested task classes already imported.
541 universe : `DimensionUniverse`
542 Universe of all possible dimensions.
544 Notes
545 -----
546 The scaffolding data structure contains nested data structures for both
547 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
548 data structures are shared between the pipeline-level structure (which
549 aggregates all datasets and categorizes them from the perspective of the
550 complete pipeline) and the individual tasks that use them as inputs and
551 outputs.
553 `QuantumGraph` construction proceeds in four steps, with each corresponding
554 to a different `_PipelineScaffolding` method:
556 1. When `_PipelineScaffolding` is constructed, we extract and categorize
557 the DatasetTypes used by the pipeline (delegating to
558 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
559 nested `_TaskScaffolding` and `_DatasetDict` objects.
561 2. In `connectDataIds`, we construct and run the "Big Join Query", which
562 returns related tuples of all dimensions used to identify any regular
563 input, output, and intermediate datasets (not prerequisites). We then
564 iterate over these tuples of related dimensions, identifying the subsets
565 that correspond to distinct data IDs for each task and dataset type,
566 and then create `_QuantumScaffolding` objects.
568 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
569 dataset data IDs previously identified, transforming unresolved
570 DatasetRefs into resolved DatasetRefs where appropriate. We then look
571 up prerequisite datasets for all quanta.
573 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
574 per-task `_QuantumScaffolding` objects.
575 """
577 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
578 _LOG.debug("Initializing data structures for QuantumGraph generation.")
579 self.tasks = []
580 # Aggregate and categorize the DatasetTypes in the Pipeline.
581 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
582 # Construct dictionaries that map those DatasetTypes to structures
583 # that will (later) hold additional information about them.
584 for attr in (
585 "initInputs",
586 "initIntermediates",
587 "initOutputs",
588 "inputs",
589 "intermediates",
590 "outputs",
591 "prerequisites",
592 ):
593 setattr(
594 self,
595 attr,
596 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
597 )
598 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
599 # Aggregate all dimensions for all non-init, non-prerequisite
600 # DatasetTypes. These are the ones we'll include in the big join
601 # query.
602 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
603 # Construct scaffolding nodes for each Task, and add backreferences
604 # to the Task from each DatasetScaffolding node.
605 # Note that there's only one scaffolding node for each DatasetType,
606 # shared by _PipelineScaffolding and all _TaskScaffoldings that
607 # reference it.
608 if isinstance(pipeline, Pipeline):
609 pipeline = pipeline.toExpandedPipeline()
610 self.tasks = [
611 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
612 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
613 ]
615 def __repr__(self) -> str:
616 # Default dataclass-injected __repr__ gets caught in an infinite loop
617 # because of back-references.
618 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
620 tasks: List[_TaskScaffolding]
621 """Scaffolding data structures for each task in the pipeline
622 (`list` of `_TaskScaffolding`).
623 """
625 initInputs: _DatasetDict
626 """Datasets consumed but not produced when constructing the tasks in this
627 pipeline (`_DatasetDict`).
628 """
630 initIntermediates: _DatasetDict
631 """Datasets that are both consumed and produced when constructing the tasks
632 in this pipeline (`_DatasetDict`).
633 """
635 initOutputs: _DatasetDict
636 """Datasets produced but not consumed when constructing the tasks in this
637 pipeline (`_DatasetDict`).
638 """
640 inputs: _DatasetDict
641 """Datasets that are consumed but not produced when running this pipeline
642 (`_DatasetDict`).
643 """
645 intermediates: _DatasetDict
646 """Datasets that are both produced and consumed when running this pipeline
647 (`_DatasetDict`).
648 """
650 outputs: _DatasetDict
651 """Datasets produced but not consumed when running this pipeline
652 (`_DatasetDict`).
653 """
655 prerequisites: _DatasetDict
656 """Datasets that are consumed when running this pipeline and looked up
657 per-Quantum when generating the graph (`_DatasetDict`).
658 """
660 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
661 """Datasets that should be used as constraints in the initial query,
662 according to tasks (`NamedValueSet`).
663 """
665 dimensions: DimensionGraph
666 """All dimensions used by any regular input, intermediate, or output
667 (not prerequisite) dataset; the set of dimensions used in the "Big Join
668 Query" (`DimensionGraph`).
670 This is required to be a superset of all task quantum dimensions.
671 """
673 globalInitOutputs: _DatasetDict | None = None
674 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
675 """
677 @contextmanager
678 def connectDataIds(
679 self,
680 registry: Registry,
681 collections: Any,
682 userQuery: Optional[str],
683 externalDataId: DataCoordinate,
684 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
685 bind: Optional[Mapping[str, Any]] = None,
686 ) -> Iterator[DataCoordinateQueryResults]:
687 """Query for the data IDs that connect nodes in the `QuantumGraph`.
689 This method populates `_TaskScaffolding.dataIds` and
690 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
692 Parameters
693 ----------
694 registry : `lsst.daf.butler.Registry`
695 Registry for the data repository; used for all data ID queries.
696 collections
697 Expressions representing the collections to search for input
698 datasets. See :ref:`daf_butler_ordered_collection_searches`.
699 userQuery : `str` or `None`
700 User-provided expression to limit the data IDs processed.
701 externalDataId : `DataCoordinate`
702 Externally-provided data ID that should be used to restrict the
703 results, just as if these constraints had been included via ``AND``
704 in ``userQuery``. This includes (at least) any instrument named
705 in the pipeline definition.
706 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
707 The query constraint variant that should be used to constrain the
708 query based on dataset existence, defaults to
709 `DatasetQueryConstraintVariant.ALL`.
710 bind : `Mapping`, optional
711 Mapping containing literal values that should be injected into the
712 ``userQuery`` expression, keyed by the identifiers they replace.
714 Returns
715 -------
716 commonDataIds : \
717 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
718 An interface to a database temporary table containing all data IDs
719 that will appear in this `QuantumGraph`. Returned inside a
720 context manager, which will drop the temporary table at the end of
721 the `with` block in which this method is called.
722 """
723 _LOG.debug("Building query for data IDs.")
724 # Initialization datasets always have empty data IDs.
725 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
726 for datasetType, refs in itertools.chain(
727 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
728 ):
729 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
730 # Run one big query for the data IDs for task dimensions and regular
731 # inputs and outputs. We limit the query to only dimensions that are
732 # associated with the input dataset types, but don't (yet) try to
733 # obtain the dataset_ids for those inputs.
734 _LOG.debug(
735 "Submitting data ID query over dimensions %s and materializing results.",
736 list(self.dimensions.names),
737 )
738 queryArgs: Dict[str, Any] = {
739 "dimensions": self.dimensions,
740 "where": userQuery,
741 "dataId": externalDataId,
742 "bind": bind,
743 }
744 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
745 _LOG.debug(
746 "Constraining graph query using default of %s.",
747 list(self.defaultDatasetQueryConstraints.names),
748 )
749 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
750 queryArgs["collections"] = collections
751 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
752 _LOG.debug("Not using dataset existence to constrain query.")
753 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
754 constraint = set(datasetQueryConstraint)
755 inputs = {k.name: k for k in self.inputs.keys()}
756 if remainder := constraint.difference(inputs.keys()):
757 raise ValueError(
758 f"{remainder} dataset type(s) specified as a graph constraint, but"
759 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
760 )
761 _LOG.debug(f"Constraining graph query using {constraint}")
762 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
763 queryArgs["collections"] = collections
764 else:
765 raise ValueError(
766 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
767 )
769 if "datasets" in queryArgs:
770 for i, dataset_type in enumerate(queryArgs["datasets"]):
771 if dataset_type.isComponent():
772 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
774 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
775 _LOG.debug("Expanding data IDs.")
776 commonDataIds = commonDataIds.expanded()
777 _LOG.debug("Iterating over query results to associate quanta with datasets.")
778 # Iterate over query results, populating data IDs for datasets and
779 # quanta and then connecting them to each other.
780 n = -1
781 for n, commonDataId in enumerate(commonDataIds):
782 # Create DatasetRefs for all DatasetTypes from this result row,
783 # noting that we might have created some already.
784 # We remember both those that already existed and those that we
785 # create now.
786 refsForRow = {}
787 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
788 for datasetType, refs in itertools.chain(
789 self.inputs.items(), self.intermediates.items(), self.outputs.items()
790 ):
791 datasetDataId: Optional[DataCoordinate]
792 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
793 datasetDataId = commonDataId.subset(datasetType.dimensions)
794 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
795 ref = refs.get(datasetDataId)
796 if ref is None:
797 ref = DatasetRef(datasetType, datasetDataId)
798 refs[datasetDataId] = ref
799 refsForRow[datasetType.name] = ref
800 # Create _QuantumScaffolding objects for all tasks from this
801 # result row, noting that we might have created some already.
802 for task in self.tasks:
803 quantumDataId = commonDataId.subset(task.dimensions)
804 quantum = task.quanta.get(quantumDataId)
805 if quantum is None:
806 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
807 task.quanta[quantumDataId] = quantum
808 # Whether this is a new quantum or an existing one, we can
809 # now associate the DatasetRefs for this row with it. The
810 # fact that a Quantum data ID and a dataset data ID both
811 # came from the same result row is what tells us they
812 # should be associated.
813 # Many of these associations will be duplicates (because
814 # another query row that differed from this one only in
815 # irrelevant dimensions already added them), and we rely
816 # on the dict keys to deduplicate them.
817 for datasetType in task.inputs:
818 ref = refsForRow[datasetType.name]
819 quantum.inputs[datasetType.name][ref.dataId] = ref
820 for datasetType in task.outputs:
821 ref = refsForRow[datasetType.name]
822 quantum.outputs[datasetType.name][ref.dataId] = ref
823 if n < 0:
824 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
825 emptiness_explained = False
826 for message in commonDataIds.explain_no_results():
827 _LOG.critical(message)
828 emptiness_explained = True
829 if not emptiness_explained:
830 _LOG.critical(
831 "To reproduce this query for debugging purposes, run "
832 "Registry.queryDataIds with these arguments:"
833 )
834 # We could just repr() the queryArgs dict to get something
835 # the user could make sense of, but it's friendlier to
836 # put these args in an easier-to-construct equivalent form
837 # so they can read it more easily and copy and paste into
838 # a Python terminal.
839 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
840 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
841 if queryArgs["where"]:
842 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
843 if "datasets" in queryArgs:
844 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
845 if "collections" in queryArgs:
846 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
847 _LOG.debug("Finished processing %d rows from data ID query.", n)
848 yield commonDataIds
850 def resolveDatasetRefs(
851 self,
852 registry: Registry,
853 collections: Any,
854 run: Optional[str],
855 commonDataIds: DataCoordinateQueryResults,
856 *,
857 skipExistingIn: Any = None,
858 clobberOutputs: bool = True,
859 constrainedByAllDatasets: bool = True,
860 resolveRefs: bool = False,
861 ) -> None:
862 """Perform follow-up queries for each dataset data ID produced in
863 `connectDataIds`.
865 This method populates `_DatasetScaffolding.refs` (except for those in
866 `prerequisites`).
868 Parameters
869 ----------
870 registry : `lsst.daf.butler.Registry`
871 Registry for the data repository; used for all data ID queries.
872 collections
873 Expressions representing the collections to search for input
874 datasets. See :ref:`daf_butler_ordered_collection_searches`.
875 run : `str`, optional
876 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
877 output datasets, if it already exists.
878 commonDataIds : \
879 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
880 Result of a previous call to `connectDataIds`.
881 skipExistingIn
882 Expressions representing the collections to search for existing
883 output datasets that should be skipped. See
884 :ref:`daf_butler_ordered_collection_searches` for allowed types.
885 `None` or empty string/sequence disables skipping.
886 clobberOutputs : `bool`, optional
887 If `True` (default), allow quanta to be created even if outputs exist;
888 this requires the same behavior to be enabled when
889 executing. If ``skipExistingIn`` is not `None`, completed quanta
890 (those with metadata, or all outputs if there is no metadata
891 dataset configured) will be skipped rather than clobbered.
892 constrainedByAllDatasets : `bool`, optional
893 Indicates if the commonDataIds were generated with a constraint on
894 all dataset types.
895 resolveRefs : `bool`, optional
896 If `True` then resolve all input references and generate random
897 dataset IDs for all output and intermediate datasets. A true value
898 requires the ``run`` collection to be specified.
900 Raises
901 ------
902 OutputExistsError
903 Raised if an output dataset already exists in the output run
904 and ``skipExistingIn`` does not include the output run, or if only
905 some outputs are present and ``clobberOutputs`` is `False`.
906 """
907 # Run may be provided but it does not have to exist; in that case we
908 # use it for resolving references but don't check it for existing refs.
909 run_exists = False
910 if run:
911 try:
912 run_exists = bool(registry.queryCollections(run))
913 except MissingCollectionError:
914 # An undocumented exception is raised if it does not exist.
915 pass
917 skip_collections_wildcard: CollectionWildcard | None = None
918 skipExistingInRun = False
919 if skipExistingIn:
920 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
921 if run_exists:
922 # As an optimization, check the explicit list of names first.
923 skipExistingInRun = run in skip_collections_wildcard.strings
924 if not skipExistingInRun:
925 # need to flatten it and check again
926 skipExistingInRun = run in registry.queryCollections(
927 skipExistingIn,
928 collectionTypes=CollectionType.RUN,
929 )
931 idMaker: Optional[_DatasetIdMaker] = None
932 if resolveRefs:
933 assert run is not None, "run cannot be None when resolveRefs is True"
934 idMaker = _DatasetIdMaker(registry, run)
936 resolvedRefQueryResults: Iterable[DatasetRef]
938 # Updating constrainedByAllDatasets here is not ideal, but we have a
939 # few different code paths that each transfer different pieces of
940 # information about what dataset query constraints were applied here,
941 # and none of them has the complete picture until we get here. We're
942 # long overdue for a QG generation rewrite that will make this go away
943 # entirely anyway.
944 constrainedByAllDatasets = (
945 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
946 )
948 # Look up [init] intermediate and output datasets in the output
949 # collection, if there is an output collection.
950 if run_exists or skip_collections_wildcard is not None:
951 for datasetType, refs in itertools.chain(
952 self.initIntermediates.items(),
953 self.initOutputs.items(),
954 self.intermediates.items(),
955 self.outputs.items(),
956 ):
957 _LOG.debug(
958 "Resolving %d datasets for intermediate and/or output dataset %s.",
959 len(refs),
960 datasetType.name,
961 )
962 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
963 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
964 # TODO: this assert incorrectly bans component inputs;
965 # investigate on DM-33027.
966 # assert not datasetType.isComponent(), \
967 # "Output datasets cannot be components."
968 #
969 # Instead we have to handle them manually to avoid a
970 # deprecation warning, but it is at least confusing and
971 # possibly a bug for components to appear here at all.
972 if datasetType.isComponent():
973 parent_dataset_type = datasetType.makeCompositeDatasetType()
974 component = datasetType.component()
975 else:
976 parent_dataset_type = datasetType
977 component = None
979 # look at RUN collection first
980 if run_exists:
981 try:
982 resolvedRefQueryResults = subset.findDatasets(
983 parent_dataset_type, collections=run, findFirst=True
984 )
985 except MissingDatasetTypeError:
986 resolvedRefQueryResults = []
987 for resolvedRef in resolvedRefQueryResults:
988 # TODO: we could easily support per-DatasetType
989 # skipExisting and I could imagine that being useful -
990 # it's probably required in order to support writing
991 # initOutputs before QuantumGraph generation.
992 assert resolvedRef.dataId in refs
993 if not (skipExistingInRun or isInit or clobberOutputs):
994 raise OutputExistsError(
995 f"Output dataset {datasetType.name} already exists in "
996 f"output RUN collection '{run}' with data ID"
997 f" {resolvedRef.dataId}."
998 )
999 # If we are going to resolve all outputs then we have
1000 # to remember existing ones to avoid generating new
1001 # dataset IDs for them.
1002 if resolveRefs:
1003 refs[resolvedRef.dataId] = (
1004 resolvedRef.makeComponentRef(component)
1005 if component is not None
1006 else resolvedRef
1007 )
1009 # And check skipExistingIn too; if the RUN collection is in
1010 # it, that case was handled above.
1011 if skip_collections_wildcard is not None:
1012 try:
1013 resolvedRefQueryResults = subset.findDatasets(
1014 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True
1015 )
1016 except MissingDatasetTypeError:
1017 resolvedRefQueryResults = []
1018 for resolvedRef in resolvedRefQueryResults:
1019 assert resolvedRef.dataId in refs
1020 refs[resolvedRef.dataId] = (
1021 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1022 )
1024 # Look up input and initInput datasets in the input collection(s).
1025 # Container to accumulate unfound refs if the common data IDs were not
1026 # constrained on dataset type existence.
1027 self.unfoundRefs = set()
1028 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1029 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
1030 if datasetType.isComponent():
1031 parent_dataset_type = datasetType.makeCompositeDatasetType()
1032 component = datasetType.component()
1033 else:
1034 parent_dataset_type = datasetType
1035 component = None
1036 try:
1037 resolvedRefQueryResults = commonDataIds.subset(
1038 datasetType.dimensions, unique=True
1039 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1040 except MissingDatasetTypeError:
1041 resolvedRefQueryResults = []
1042 dataIdsNotFoundYet = set(refs.keys())
1043 for resolvedRef in resolvedRefQueryResults:
1044 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1045 refs[resolvedRef.dataId] = (
1046 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1047 )
1048 if dataIdsNotFoundYet:
1049 if constrainedByAllDatasets:
1050 raise RuntimeError(
1051 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1052 f"'{datasetType.name}' was/were present in a previous "
1053 "query, but could not be found now. "
1054 "This is either a logic bug in QuantumGraph generation "
1055 "or the input collections have been modified since "
1056 "QuantumGraph generation began."
1057 )
1058 elif not datasetType.dimensions:
1059 raise RuntimeError(
1060 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1061 f"collections {collections}."
1062 )
1063 else:
1064 # If the common data IDs were not constrained using all the
1065 # input dataset types, it is possible that some data IDs
1066 # found don't correspond to existing dataset types and they
1067 # will be unresolved. Mark these for later pruning from
1068 # the quantum graph.
1069 for k in dataIdsNotFoundYet:
1070 self.unfoundRefs.add(refs[k])
1072 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1073 # replacing the unresolved refs there, and then look up prerequisites.
1074 for task in self.tasks:
1075 _LOG.debug(
1076 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1077 len(task.quanta),
1078 task.taskDef.label,
1079 )
1080 # The way iterConnections is designed makes it impossible to
1081 # annotate precisely enough to satisfy MyPy here.
1082 lookupFunctions = {
1083 c.name: c.lookupFunction # type: ignore
1084 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1085 if c.lookupFunction is not None # type: ignore
1086 }
1087 dataIdsFailed = []
1088 dataIdsSucceeded = []
1089 for quantum in task.quanta.values():
1090 # Process output datasets only if skipExistingIn is not None
1091 # or there is a run to look for outputs in and clobberOutputs
1092 # is True. Note that if skipExistingIn is None, any output
1093 # datasets that already exist would have already caused an
1094 # exception to be raised. We never update the DatasetRefs in
1095 # the quantum because those should never be resolved.
1096 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1097 resolvedRefs = []
1098 unresolvedRefs = []
1099 haveMetadata = False
1100 for datasetType, originalRefs in quantum.outputs.items():
1101 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1102 if ref.id is not None:
1103 resolvedRefs.append(ref)
1104 if datasetType.name == task.taskDef.metadataDatasetName:
1105 haveMetadata = True
1106 else:
1107 unresolvedRefs.append(ref)
1108 if resolvedRefs:
1109 if haveMetadata or not unresolvedRefs:
1110 dataIdsSucceeded.append(quantum.dataId)
1111 if skip_collections_wildcard is not None:
1112 continue
1113 else:
1114 dataIdsFailed.append(quantum.dataId)
1115 if not clobberOutputs:
1116 raise OutputExistsError(
1117 f"Quantum {quantum.dataId} of task with label "
1118 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1119 f"({resolvedRefs}) "
1120 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1121 "and clobbering outputs was not enabled."
1122 )
1123 # Update the input DatasetRefs to the resolved ones we already
1124 # searched for.
1125 for datasetType, input_refs in quantum.inputs.items():
1126 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1127 input_refs[ref.dataId] = ref
1128 # Look up prerequisite datasets in the input collection(s).
1129 # These may have dimensions that extend beyond those we queried
1130 # for originally, because we want to permit those data ID
1131 # values to differ across quanta and dataset types.
1132 for datasetType in task.prerequisites:
1133 if datasetType.isComponent():
1134 parent_dataset_type = datasetType.makeCompositeDatasetType()
1135 component = datasetType.component()
1136 else:
1137 parent_dataset_type = datasetType
1138 component = None
1139 lookupFunction = lookupFunctions.get(datasetType.name)
1140 if lookupFunction is not None:
1141 # PipelineTask has provided its own function to do the
1142 # lookup. This always takes precedence.
1143 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1144 elif (
1145 datasetType.isCalibration()
1146 and datasetType.dimensions <= quantum.dataId.graph
1147 and quantum.dataId.graph.temporal
1148 ):
1149 # This is a master calibration lookup, which we have to
1150 # handle specially because the query system can't do a
1151 # temporal join on a non-dimension-based timespan yet.
1152 timespan = quantum.dataId.timespan
1153 try:
1154 prereq_ref = registry.findDataset(
1155 parent_dataset_type,
1156 quantum.dataId,
1157 collections=collections,
1158 timespan=timespan,
1159 )
1160 if prereq_ref is not None:
1161 if component is not None:
1162 prereq_ref = prereq_ref.makeComponentRef(component)
1163 prereq_refs = [prereq_ref]
1164 else:
1165 prereq_refs = []
1166 except (KeyError, MissingDatasetTypeError):
1167 # This dataset type is not present in the registry,
1168 # which just means there are no datasets here.
1169 prereq_refs = []
1170 else:
1171 # Most general case.
1172 prereq_refs = [
1173 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1174 for prereq_ref in registry.queryDatasets(
1175 parent_dataset_type,
1176 collections=collections,
1177 dataId=quantum.dataId,
1178 findFirst=True,
1179 ).expanded()
1180 ]
1181 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None}
1182 quantum.prerequisites[datasetType].update(prereq_refs_map)
1183 task.prerequisites[datasetType].update(prereq_refs_map)
1185 # Resolve all quantum inputs and outputs.
1186 if idMaker:
1187 for datasetDict in (quantum.inputs, quantum.outputs):
1188 for refDict in datasetDict.values():
1189 refDict.update(idMaker.resolveDict(refDict))
1191 # Resolve task initInputs and initOutputs.
1192 if idMaker:
1193 for datasetDict in (task.initInputs, task.initOutputs):
1194 for refDict in datasetDict.values():
1195 refDict.update(idMaker.resolveDict(refDict))
1197 # Actually remove any quanta that we decided to skip above.
1198 if dataIdsSucceeded:
1199 if skip_collections_wildcard is not None:
1200 _LOG.debug(
1201 "Pruning %d successful quanta for task with label '%s' because all of their "
1202 "outputs exist or metadata was written successfully.",
1203 len(dataIdsSucceeded),
1204 task.taskDef.label,
1205 )
1206 for dataId in dataIdsSucceeded:
1207 del task.quanta[dataId]
1208 elif clobberOutputs:
1209 _LOG.info(
1210 "Found %d successful quanta for task with label '%s' "
1211 "that will need to be clobbered during execution.",
1212 len(dataIdsSucceeded),
1213 task.taskDef.label,
1214 )
1215 else:
1216 raise AssertionError("OutputExistsError should have already been raised.")
1217 if dataIdsFailed:
1218 if clobberOutputs:
1219 _LOG.info(
1220 "Found %d failed/incomplete quanta for task with label '%s' "
1221 "that will need to be clobbered during execution.",
1222 len(dataIdsFailed),
1223 task.taskDef.label,
1224 )
1225 else:
1226 raise AssertionError("OutputExistsError should have already been raised.")
1228 # Collect initOutputs that do not belong to any task.
1229 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1230 for task in self.tasks:
1231 global_dataset_types -= set(task.initOutputs)
1232 if global_dataset_types:
1233 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1234 if idMaker is not None:
1235 for refDict in self.globalInitOutputs.values():
1236 refDict.update(idMaker.resolveDict(refDict))
1238 def makeQuantumGraph(
1239 self,
1240 registry: Registry,
1241 metadata: Optional[Mapping[str, Any]] = None,
1242 datastore: Optional[Datastore] = None,
1243 ) -> QuantumGraph:
1244 """Create a `QuantumGraph` from the quanta already present in
1245 the scaffolding data structure.
1247 Parameters
1248 ----------
1249 registry : `lsst.daf.butler.Registry`
1250 Registry for the data repository; used for all data ID queries.
1251 metadata : `Mapping` of `str` to primitives, optional
1252 Extra data to carry with the graph. Entries in this mapping
1253 should be able to be serialized in JSON.
1255 datastore : `Datastore`, optional
1256 If not `None` then fill datastore records in each generated
1257 Quantum.
1259 Returns
1260 -------
1261 graph : `QuantumGraph`
1262 The full `QuantumGraph`.
1263 """
1265 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1266 """Extract all DatasetRefs from the dictionaries"""
1267 for ref_dict in dataset_dict.values():
1268 yield from ref_dict.values()
1270 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1271 if datastore is not None:
1272 datastore_records = datastore.export_records(
1273 itertools.chain(
1274 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1275 )
1276 )
1278 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1279 for task in self.tasks:
1280 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1281 graphInput[task.taskDef] = qset
1283 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1284 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1286 globalInitOutputs: list[DatasetRef] = []
1287 if self.globalInitOutputs is not None:
1288 for refs_dict in self.globalInitOutputs.values():
1289 globalInitOutputs.extend(refs_dict.values())
1291 graph = QuantumGraph(
1292 graphInput,
1293 metadata=metadata,
1294 pruneRefs=self.unfoundRefs,
1295 universe=self.dimensions.universe,
1296 initInputs=taskInitInputs,
1297 initOutputs=taskInitOutputs,
1298 globalInitOutputs=globalInitOutputs,
1299 registryDatasetTypes=self._get_registry_dataset_types(registry),
1300 )
1301 return graph
1303 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1304 """Make a list of all dataset types used by a graph as defined in
1305 registry.
1306 """
1307 chain = [
1308 self.initInputs,
1309 self.initIntermediates,
1310 self.initOutputs,
1311 self.inputs,
1312 self.intermediates,
1313 self.outputs,
1314 self.prerequisites,
1315 ]
1316 if self.globalInitOutputs is not None:
1317 chain.append(self.globalInitOutputs)
1319 # Collect names of all dataset types.
1320 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1321 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1323 # Check for types that do not exist in registry yet:
1324 # - inputs must exist
1325 # - intermediates and outputs may not exist, but there must not be
1326 # more than one definition (e.g. differing in storage class)
1327 # - prerequisites may not exist, treat them the same as outputs here
1328 for dstype in itertools.chain(self.initInputs, self.inputs):
1329 if dstype.name not in dataset_types:
1330 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1332 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1333 chain = [
1334 self.initIntermediates,
1335 self.initOutputs,
1336 self.intermediates,
1337 self.outputs,
1338 self.prerequisites,
1339 ]
1340 if self.globalInitOutputs is not None:
1341 chain.append(self.globalInitOutputs)
1342 for dstype in itertools.chain(*chain):
1343 if dstype.name not in dataset_types:
1344 new_outputs[dstype.name].add(dstype)
1345 for name, dstypes in new_outputs.items():
1346 if len(dstypes) > 1:
1347 raise ValueError(
1348 "Pipeline contains multiple definitions for a dataset type "
1349 f"which is not defined in registry yet: {dstypes}"
1350 )
1351 elif len(dstypes) == 1:
1352 dataset_types[name] = dstypes.pop()
1354 return dataset_types.values()
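# Illustrative sketch, not part of this module: the four-step flow described
# in the `_PipelineScaffolding` docstring, driven directly rather than through
# `GraphBuilder.makeGraph` below.  `registry`, `pipeline`, `collections`, and
# `run` are assumed to exist already; the user query string is a placeholder.
def _example_scaffolding_flow(
    registry: Registry, pipeline: Pipeline, collections: Any, run: str
) -> QuantumGraph:
    # Step 1: categorize dataset types and build the scaffolding nodes.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
    # Step 2: run the "Big Join Query" and associate data IDs with quanta.
    with scaffolding.connectDataIds(
        registry, collections, userQuery="instrument = 'HSC'", externalDataId=empty_data_id
    ) as common_data_ids:
        # Step 3: resolve existing inputs/outputs and look up prerequisites.
        scaffolding.resolveDatasetRefs(registry, collections, run, common_data_ids)
        # Step 4: assemble the QuantumGraph from the per-task quanta.
        return scaffolding.makeQuantumGraph(registry=registry)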
1357# ------------------------
1358# Exported definitions --
1359# ------------------------
1362class GraphBuilderError(Exception):
1363 """Base class for exceptions generated by the graph builder."""
1365 pass
1368class OutputExistsError(GraphBuilderError):
1369 """Exception generated when output datasets already exist."""
1371 pass
1374class PrerequisiteMissingError(GraphBuilderError):
1375 """Exception generated when a prerequisite dataset does not exist."""
1377 pass
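# Illustrative sketch, not part of this module: the exceptions above are what
# callers of `GraphBuilder.makeGraph` (defined below) typically handle.  The
# `builder`, `pipeline`, `collections`, `run`, and `query` arguments are
# assumed to be set up elsewhere.
def _example_handle_graph_builder_errors(
    builder: GraphBuilder,
    pipeline: Pipeline,
    collections: Any,
    run: str,
    query: str,
) -> Optional[QuantumGraph]:
    try:
        return builder.makeGraph(pipeline, collections, run, query)
    except OutputExistsError as err:
        # Outputs already present in the run; rerun with skipExistingIn or
        # clobberOutputs enabled.
        _LOG.warning("Graph generation skipped: %s", err)
    except GraphBuilderError as err:
        _LOG.error("Graph generation failed: %s", err)
    return None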
1380class GraphBuilder:
1381 """GraphBuilder class is responsible for building a task execution graph
1382 from a Pipeline.
1384 Parameters
1385 ----------
1386 registry : `~lsst.daf.butler.Registry`
1387 Registry for the data repository; used for all data ID queries.
1388 skipExistingIn
1389 Expressions representing the collections to search for existing
1390 output datasets that should be skipped. See
1391 :ref:`daf_butler_ordered_collection_searches`.
1392 clobberOutputs : `bool`, optional
1393 If `True` (default), allow quanta to be created even if partial outputs
1394 exist; this requires the same behavior to be enabled when
1395 executing.
1396 datastore : `Datastore`, optional
1397 If not `None` then fill datastore records in each generated Quantum.
1398 """
1400 def __init__(
1401 self,
1402 registry: Registry,
1403 skipExistingIn: Any = None,
1404 clobberOutputs: bool = True,
1405 datastore: Optional[Datastore] = None,
1406 ):
1407 self.registry = registry
1408 self.dimensions = registry.dimensions
1409 self.skipExistingIn = skipExistingIn
1410 self.clobberOutputs = clobberOutputs
1411 self.datastore = datastore
1413 def makeGraph(
1414 self,
1415 pipeline: Union[Pipeline, Iterable[TaskDef]],
1416 collections: Any,
1417 run: Optional[str],
1418 userQuery: Optional[str],
1419 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1420 metadata: Optional[Mapping[str, Any]] = None,
1421 resolveRefs: bool = False,
1422 bind: Optional[Mapping[str, Any]] = None,
1423 ) -> QuantumGraph:
1424 """Create execution graph for a pipeline.
1426 Parameters
1427 ----------
1428 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1429 Pipeline definition, task names/classes and their configs.
1430 collections
1431 Expressions representing the collections to search for input
1432 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1433 run : `str`, optional
1434 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1435 output datasets. The collection does not have to exist; it will be
1436 created when the graph is executed.
1437 userQuery : `str` or `None`
1438 String that defines a user-provided selection for the registry; should
1439 be empty or `None` if there are no restrictions on data selection.
1440 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1441 The query constraint variant that should be used to constrain the
1442 query based on dataset existence, defaults to
1443 `DatasetQueryConstraintVariant.ALL`.
1444 metadata : `Mapping` of `str` to primitives, optional
1445 Extra data to carry with the graph. Entries in this mapping
1446 should be able to be serialized in JSON.
1448 resolveRefs : `bool`, optional
1449 If `True` then resolve all input references and generate random
1450 dataset IDs for all output and intermediate datasets. A true value
1451 requires the ``run`` collection to be specified.
1452 bind : `Mapping`, optional
1453 Mapping containing literal values that should be injected into the
1454 ``userQuery`` expression, keyed by the identifiers they replace.
1456 Returns
1457 -------
1458 graph : `QuantumGraph`
1459 The constructed `QuantumGraph`.
1460 Raises
1461 ------
1462 UserExpressionError
1463 Raised when user expression cannot be parsed.
1464 OutputExistsError
1465 Raised when output datasets already exist.
1466 Exception
1467 Other exceptions types may be raised by underlying registry
1468 classes.
1469 """
1470 if resolveRefs and run is None:
1471 raise ValueError("`resolveRefs` requires `run` parameter.")
1472 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1473 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1474 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1475 instrument_class: Optional[Any] = None
1476 if isinstance(pipeline, Pipeline):
1477 instrument_class_name = pipeline.getInstrument()
1478 if instrument_class_name is not None:
1479 instrument_class = doImportType(instrument_class_name)
1480 pipeline = list(pipeline.toExpandedPipeline())
1481 if instrument_class is not None:
1482 dataId = DataCoordinate.standardize(
1483 instrument=instrument_class.getName(), universe=self.registry.dimensions
1484 )
1485 else:
1486 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1487 with scaffolding.connectDataIds(
1488 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1489 ) as commonDataIds:
1490 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1491 scaffolding.resolveDatasetRefs(
1492 self.registry,
1493 collections,
1494 run,
1495 commonDataIds,
1496 skipExistingIn=self.skipExistingIn,
1497 clobberOutputs=self.clobberOutputs,
1498 constrainedByAllDatasets=condition,
1499 resolveRefs=resolveRefs,
1500 )
1501 return scaffolding.makeQuantumGraph(
1502 registry=self.registry, metadata=metadata, datastore=self.datastore
1503 )
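# Illustrative sketch, not part of this module: typical use of `GraphBuilder`,
# the public entry point defined above.  The repository path, pipeline file,
# collection names, run name, and data query are made-up placeholders.
def _example_graph_builder_usage() -> QuantumGraph:
    from lsst.daf.butler import Butler

    butler = Butler("/path/to/repo", writeable=False)
    pipeline = Pipeline.from_uri("my_pipeline.yaml")
    builder = GraphBuilder(butler.registry, clobberOutputs=True)
    return builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],
        run="u/example/run",
        userQuery="instrument = 'HSC' AND visit = 12345",
    )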