Coverage for python/lsst/pipe/base/graphBuilder.py: 14%
532 statements
coverage.py v7.2.3, created at 2023-04-27 02:47 -0700
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33import warnings
34from collections import ChainMap, defaultdict
35from contextlib import contextmanager
36from dataclasses import dataclass
37from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
39from lsst.daf.butler import (
40 CollectionType,
41 DataCoordinate,
42 DatasetIdGenEnum,
43 DatasetRef,
44 DatasetType,
45 Datastore,
46 DatastoreRecordData,
47 DimensionGraph,
48 DimensionUniverse,
49 NamedKeyDict,
50 NamedValueSet,
51 Quantum,
52 Registry,
53 UnresolvedRefWarning,
54)
55from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
56from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
57from lsst.daf.butler.registry.wildcards import CollectionWildcard
58from lsst.utils import doImportType
60from ._datasetQueryConstraints import DatasetQueryConstraintVariant
61from ._status import NoWorkFound
63# -----------------------------
64# Imports for other modules --
65# -----------------------------
66from .connections import AdjustQuantumHelper, iterConnections
67from .graph import QuantumGraph
68from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
70# ----------------------------------
71# Local non-exported definitions --
72# ----------------------------------
74_LOG = logging.getLogger(__name__)
77class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
78 """A custom dictionary that maps `DatasetType` to a nested dictionary of
79 the known `DatasetRef` instances of that type.
81 Parameters
82 ----------
83 args
84 Positional arguments are forwarded to the `dict` constructor.
85 universe : `DimensionUniverse`
86 Universe of all possible dimensions.
87 """
89 def __init__(self, *args: Any, universe: DimensionUniverse):
90 super().__init__(*args)
91 self.universe = universe
93 @classmethod
94 def fromDatasetTypes(
95 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
96 ) -> _DatasetDict:
97 """Construct a dictionary from a flat iterable of `DatasetType` keys.
99 Parameters
100 ----------
101 datasetTypes : `iterable` of `DatasetType`
102 DatasetTypes to use as keys for the dict. Values will be empty
103 dictionaries.
104 universe : `DimensionUniverse`
105 Universe of all possible dimensions.
107 Returns
108 -------
109 dictionary : `_DatasetDict`
110 A new `_DatasetDict` instance.
111 """
112 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
114 @classmethod
115 def fromSubset(
116 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
117 ) -> _DatasetDict:
118 """Return a new dictionary by extracting items corresponding to the
119 given keys from one or more existing dictionaries.
121 Parameters
122 ----------
123 datasetTypes : `iterable` of `DatasetType`
124 DatasetTypes to use as keys for the dict. Values will be obtained
125 by lookups against ``first`` and ``rest``.
126 first : `_DatasetDict`
127 Another dictionary from which to extract values.
128 rest
129 Additional dictionaries from which to extract values.
131 Returns
132 -------
133 dictionary : `_DatasetDict`
134 A new dictionary instance.
135 """
136 combined = ChainMap(first, *rest)
138 # Dataset types known to match immediately can be processed
139 # without checks.
140 matches = combined.keys() & set(datasetTypes)
141 _dict = {k: combined[k] for k in matches}
143 if len(_dict) < len(datasetTypes):
144 # Work out which ones are missing.
145 missing_datasetTypes = set(datasetTypes) - _dict.keys()
147 # Get the known names for comparison.
148 combined_by_name = {k.name: k for k in combined}
150 missing = set()
151 incompatible = {}
152 for datasetType in missing_datasetTypes:
153 # The dataset type is not found. It may not be listed
154 # or it may be that it is there with the same name
155 # but a different definition.
156 if datasetType.name in combined_by_name:
157 # This implies some inconsistency in definitions
158 # for connections. If there is support for storage
159 # class conversion we can let it slide.
160 # At this point we do not know
161 # where the inconsistency is but trust that
162 # downstream code will be more explicit about input
163 # vs output incompatibilities.
164 existing = combined_by_name[datasetType.name]
165 convertible_to_existing = existing.is_compatible_with(datasetType)
166 convertible_from_existing = datasetType.is_compatible_with(existing)
167 if convertible_to_existing and convertible_from_existing:
168 _LOG.debug(
169 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
170 datasetType.name,
171 datasetType.storageClass_name,
172 existing.storageClass_name,
173 )
174 _dict[datasetType] = combined[existing]
175 elif convertible_to_existing or convertible_from_existing:
176 # We'd need to refactor a fair amount to recognize
177 # whether this is an error or not, so I'm not going to
178 # bother until we need to do that for other reasons
179 # (it won't be too long).
180 _LOG.info(
181 "Dataset type %s is present with multiple only partially-compatible storage "
182 "classes %s and %s.",
183 datasetType.name,
184 datasetType.storageClass_name,
185 existing.storageClass_name,
186 )
187 _dict[datasetType] = combined[existing]
188 else:
189 incompatible[datasetType] = existing
190 else:
191 missing.add(datasetType)
193 if missing or incompatible:
194 reasons = []
195 if missing:
196 reasons.append(
197 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
198 + ", ".join(d.name for d in combined)
199 )
200 if incompatible:
201 for x, y in incompatible.items():
202 reasons.append(f"{x} incompatible with {y}")
203 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
205 return cls(_dict, universe=first.universe)
207 @property
208 def dimensions(self) -> DimensionGraph:
209 """The union of all dimensions used by all dataset types in this
210 dictionary, including implied dependencies (`DimensionGraph`).
211 """
212 base = self.universe.empty
213 if len(self) == 0:
214 return base
215 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
217 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
218 """Unpack nested single-element `DatasetRef` dicts into a new
219 mapping with `DatasetType` keys and `DatasetRef` values.
221 This method assumes that each nested dictionary contains exactly one
222 item, as is the case for all "init" datasets.
224 Returns
225 -------
226 dictionary : `NamedKeyDict`
227 Dictionary mapping `DatasetType` to `DatasetRef`, with both
228 `DatasetType` instances and string names usable as keys.
229 """
231 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
232 (ref,) = refs.values()
233 return ref
235 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
237 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
238 """Unpack nested multi-element `DatasetRef` dicts into a new
239 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
241 Returns
242 -------
243 dictionary : `NamedKeyDict`
244 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
245 both `DatasetType` instances and string names usable as keys.
246 """
247 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
249 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
250 """Iterate over the contained `DatasetRef` instances that match the
251 given `DatasetType` and data IDs.
253 Parameters
254 ----------
255 datasetType : `DatasetType`
256 Dataset type to match.
257 dataIds : `Iterable` [ `DataCoordinate` ]
258 Data IDs to match.
260 Returns
261 -------
262 refs : `Iterator` [ `DatasetRef` ]
263 DatasetRef instances for which ``ref.datasetType == datasetType``
264 and ``ref.dataId`` is in ``dataIds``.
265 """
266 refs = self[datasetType]
267 return (refs[dataId] for dataId in dataIds)
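# Illustrative sketch, not part of the original module: it shows the nested shape a
# _DatasetDict takes, i.e. DatasetType -> {DataCoordinate: DatasetRef}, and how the
# helper methods above are typically used. The names ``calexp_type``, ``data_id``,
# and ``universe`` are hypothetical placeholders for objects obtained from a butler
# registry.
#
#     d = _DatasetDict.fromDatasetTypes([calexp_type], universe=universe)
#     d[calexp_type][data_id] = DatasetRef(calexp_type, data_id)
#     (only_ref,) = d.extract(calexp_type, [data_id])  # iterate refs matching the data IDs
#     by_type = d.unpackMultiRefs()                    # DatasetType -> list[DatasetRef]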
270class _QuantumScaffolding:
271 """Helper class aggregating information about a `Quantum`, used when
272 constructing a `QuantumGraph`.
274 See `_PipelineScaffolding` for a top-down description of the full
275 scaffolding data structure.
277 Parameters
278 ----------
279 task : _TaskScaffolding
280 Back-reference to the helper object for the `PipelineTask` this quantum
281 represents an execution of.
282 dataId : `DataCoordinate`
283 Data ID for this quantum.
284 """
286 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
287 self.task = task
288 self.dataId = dataId
289 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
290 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
291 self.prerequisites = _DatasetDict.fromDatasetTypes(
292 task.prerequisites.keys(), universe=dataId.universe
293 )
295 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
297 def __repr__(self) -> str:
298 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
300 task: _TaskScaffolding
301 """Back-reference to the helper object for the `PipelineTask` this quantum
302 represents an execution of.
303 """
305 dataId: DataCoordinate
306 """Data ID for this quantum.
307 """
309 inputs: _DatasetDict
310 """Nested dictionary containing `DatasetRef` inputs to this quantum.
312 This is initialized to map each `DatasetType` to an empty dictionary at
313 construction. Those nested dictionaries are populated (with data IDs as
314 keys) with unresolved `DatasetRef` instances in
315 `_PipelineScaffolding.connectDataIds`.
316 """
318 outputs: _DatasetDict
319 """Nested dictionary containing `DatasetRef` outputs this quantum.
320 """
322 prerequisites: _DatasetDict
323 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
324 quantum.
325 """
327 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
328 """Transform the scaffolding object into a true `Quantum` instance.
330 Parameters
331 ----------
332 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
333 If not `None` then fill datastore records in each generated Quantum
334 using the records from this structure.
336 Returns
337 -------
338 quantum : `Quantum`
339 An actual `Quantum` instance.
340 """
341 allInputs = self.inputs.unpackMultiRefs()
342 allInputs.update(self.prerequisites.unpackMultiRefs())
343 # Give the task's Connections class an opportunity to remove some
344 # inputs, or complain if they are unacceptable.
345 # This will raise if one of the check conditions is not met, which is
346 # the intended behavior.
347 # If it raises NoWorkFound, there is a bug in the QG algorithm
348 # or the adjustQuantum is incorrectly trying to make a prerequisite
349 # input behave like a regular input; adjustQuantum should only raise
350 # NoWorkFound if a regular input is missing, and it shouldn't be
351 # possible for us to have generated ``self`` if that's true.
352 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
353 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
354 initInputs = self.task.initInputs.unpackSingleRefs()
355 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
356 if datastore_records is not None:
357 quantum_records = {}
358 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
359 input_refs += list(initInputs.values())
360 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
361 for datastore_name, records in datastore_records.items():
362 matching_records = records.subset(input_ids)
363 if matching_records is not None:
364 quantum_records[datastore_name] = matching_records
365 return Quantum(
366 taskName=self.task.taskDef.taskName,
367 taskClass=self.task.taskDef.taskClass,
368 dataId=self.dataId,
369 initInputs=initInputs,
370 inputs=helper.inputs,
371 outputs=helper.outputs,
372 datastore_records=quantum_records,
373 )
376@dataclass
377class _TaskScaffolding:
378 """Helper class aggregating information about a `PipelineTask`, used when
379 constructing a `QuantumGraph`.
381 See `_PipelineScaffolding` for a top-down description of the full
382 scaffolding data structure.
384 Parameters
385 ----------
386 taskDef : `TaskDef`
387 Data structure that identifies the task class and its config.
388 parent : `_PipelineScaffolding`
389 The parent data structure that will hold the instance being
390 constructed.
391 datasetTypes : `TaskDatasetTypes`
392 Data structure that categorizes the dataset types used by this task.
393 """
395 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
396 universe = parent.dimensions.universe
397 self.taskDef = taskDef
398 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
399 assert self.dimensions.issubset(parent.dimensions)
400 # Initialize _DatasetDicts as subsets of the one or two
401 # corresponding dicts in the parent _PipelineScaffolding.
402 self.initInputs = _DatasetDict.fromSubset(
403 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
404 )
405 self.initOutputs = _DatasetDict.fromSubset(
406 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
407 )
408 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
409 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
410 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
411 self.dataIds: Set[DataCoordinate] = set()
412 self.quanta = {}
414 def __repr__(self) -> str:
415 # Default dataclass-injected __repr__ gets caught in an infinite loop
416 # because of back-references.
417 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
419 taskDef: TaskDef
420 """Data structure that identifies the task class and its config
421 (`TaskDef`).
422 """
424 dimensions: DimensionGraph
425 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
426 """
428 initInputs: _DatasetDict
429 """Dictionary containing information about datasets used to construct this
430 task (`_DatasetDict`).
431 """
433 initOutputs: _DatasetDict
434 """Dictionary containing information about datasets produced as a
435 side-effect of constructing this task (`_DatasetDict`).
436 """
438 inputs: _DatasetDict
439 """Dictionary containing information about datasets used as regular,
440 graph-constraining inputs to this task (`_DatasetDict`).
441 """
443 outputs: _DatasetDict
444 """Dictionary containing information about datasets produced by this task
445 (`_DatasetDict`).
446 """
448 prerequisites: _DatasetDict
449 """Dictionary containing information about input datasets that must be
450 present in the repository before any Pipeline containing this task is run
451 (`_DatasetDict`).
452 """
454 quanta: Dict[DataCoordinate, _QuantumScaffolding]
455 """Dictionary mapping data ID to a scaffolding object for the Quantum of
456 this task with that data ID.
457 """
459 def makeQuantumSet(
460 self,
461 unresolvedRefs: Optional[Set[DatasetRef]] = None,
462 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
463 ) -> Set[Quantum]:
464 """Create a `set` of `Quantum` from the information in ``self``.
466 Parameters
467 ----------
468 unresolvedRefs : `set` [ `DatasetRef` ], optional
469 Input dataset refs that have not been found.
470 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
471 If not `None` then fill datastore records in each generated Quantum
472 using the records from this structure.
473 Returns
474 -------
475 nodes : `set` of `Quantum`
476 The `Quantum` elements corresponding to this task.
477 """
478 if unresolvedRefs is None:
479 unresolvedRefs = set()
480 outputs = set()
481 for q in self.quanta.values():
482 try:
483 tmpQuantum = q.makeQuantum(datastore_records)
484 outputs.add(tmpQuantum)
485 except (NoWorkFound, FileNotFoundError) as exc:
486 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
487 if unresolvedRefs.intersection(refs):
488 # This means it is a node that is known to be pruned
489 # later and should be left in even though some follow-up
490 # queries fail. This allows the pruning to start from this
491 # quantum with known issues, and prune other nodes it
492 # touches.
493 inputs = q.inputs.unpackMultiRefs()
494 inputs.update(q.prerequisites.unpackMultiRefs())
495 tmpQuantum = Quantum(
496 taskName=q.task.taskDef.taskName,
497 taskClass=q.task.taskDef.taskClass,
498 dataId=q.dataId,
499 initInputs=q.task.initInputs.unpackSingleRefs(),
500 inputs=inputs,
501 outputs=q.outputs.unpackMultiRefs(),
502 )
503 outputs.add(tmpQuantum)
504 else:
505 raise exc
506 return outputs
509class _DatasetIdMaker:
510 """Helper class which generates random dataset UUIDs for unresolved
511 datasets.
512 """
514 def __init__(self, registry: Registry, run: str):
515 self.datasetIdFactory = registry.datasetIdFactory
516 self.run = run
517 # Dataset IDs generated so far
518 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
520 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
521 if ref.id is not None:
522 return ref
524 # For components we need their parent dataset ID.
525 if ref.isComponent():
526 with warnings.catch_warnings():
527 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
528 parent_ref = ref.makeCompositeRef()
529 # Some basic check - parent should be resolved if this is an
530 # existing input, or it should be in the cache already if it is
531 # an intermediate.
532 if parent_ref.id is None:
533 key = parent_ref.datasetType, parent_ref.dataId
534 if key not in self.resolved:
535 raise ValueError(f"Composite dataset is missing from cache: {parent_ref}")
536 parent_ref = self.resolved[key]
537 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved"
538 with warnings.catch_warnings():
539 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
540 return ref.resolved(parent_ref.id, parent_ref.run)
542 key = ref.datasetType, ref.dataId
543 if (resolved := self.resolved.get(key)) is None:
544 with warnings.catch_warnings():
545 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
546 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE)
547 self.resolved[key] = resolved
548 return resolved
550 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
551 """Resolve all unresolved references in the provided dictionary."""
552 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
555@dataclass
556class _PipelineScaffolding:
557 """A helper data structure that organizes the information involved in
558 constructing a `QuantumGraph` for a `Pipeline`.
560 Parameters
561 ----------
562 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
563 Sequence of tasks from which a graph is to be constructed. Must
564 have nested task classes already imported.
565 universe : `DimensionUniverse`
566 Universe of all possible dimensions.
568 Notes
569 -----
570 The scaffolding data structure contains nested data structures for both
571 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
572 data structures are shared between the pipeline-level structure (which
573 aggregates all datasets and categorizes them from the perspective of the
574 complete pipeline) and the individual tasks that use them as inputs and
575 outputs.
577 `QuantumGraph` construction proceeds in four steps, with each corresponding
578 to a different `_PipelineScaffolding` method:
580 1. When `_PipelineScaffolding` is constructed, we extract and categorize
581 the DatasetTypes used by the pipeline (delegating to
582 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
583 nested `_TaskScaffolding` and `_DatasetDict` objects.
585 2. In `connectDataIds`, we construct and run the "Big Join Query", which
586 returns related tuples of all dimensions used to identify any regular
587 input, output, and intermediate datasets (not prerequisites). We then
588 iterate over these tuples of related dimensions, identifying the subsets
589 that correspond to distinct data IDs for each task and dataset type,
590 and then create `_QuantumScaffolding` objects.
592 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
593 dataset data IDs previously identified, transforming unresolved
594 DatasetRefs into resolved DatasetRefs where appropriate. We then look
595 up prerequisite datasets for all quanta.
597 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
598 per-task `_QuantumScaffolding` objects.
599 """
601 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
602 _LOG.debug("Initializing data structures for QuantumGraph generation.")
603 self.tasks = []
604 # Aggregate and categorize the DatasetTypes in the Pipeline.
605 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
606 # Construct dictionaries that map those DatasetTypes to structures
607 # that will (later) hold additional information about them.
608 for attr in (
609 "initInputs",
610 "initIntermediates",
611 "initOutputs",
612 "inputs",
613 "intermediates",
614 "outputs",
615 "prerequisites",
616 ):
617 setattr(
618 self,
619 attr,
620 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
621 )
622 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
623 # Aggregate all dimensions for all non-init, non-prerequisite
624 # DatasetTypes. These are the ones we'll include in the big join
625 # query.
626 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
627 # Construct scaffolding nodes for each Task, and add backreferences
628 # to the Task from each DatasetScaffolding node.
629 # Note that there's only one scaffolding node for each DatasetType,
630 # shared by _PipelineScaffolding and all _TaskScaffoldings that
631 # reference it.
632 if isinstance(pipeline, Pipeline):
633 pipeline = pipeline.toExpandedPipeline()
634 self.tasks = [
635 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
636 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
637 ]
639 def __repr__(self) -> str:
640 # Default dataclass-injected __repr__ gets caught in an infinite loop
641 # because of back-references.
642 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
644 tasks: List[_TaskScaffolding]
645 """Scaffolding data structures for each task in the pipeline
646 (`list` of `_TaskScaffolding`).
647 """
649 initInputs: _DatasetDict
650 """Datasets consumed but not produced when constructing the tasks in this
651 pipeline (`_DatasetDict`).
652 """
654 initIntermediates: _DatasetDict
655 """Datasets that are both consumed and produced when constructing the tasks
656 in this pipeline (`_DatasetDict`).
657 """
659 initOutputs: _DatasetDict
660 """Datasets produced but not consumed when constructing the tasks in this
661 pipeline (`_DatasetDict`).
662 """
664 inputs: _DatasetDict
665 """Datasets that are consumed but not produced when running this pipeline
666 (`_DatasetDict`).
667 """
669 intermediates: _DatasetDict
670 """Datasets that are both produced and consumed when running this pipeline
671 (`_DatasetDict`).
672 """
674 outputs: _DatasetDict
675 """Datasets produced but not consumed when when running this pipeline
676 (`_DatasetDict`).
677 """
679 prerequisites: _DatasetDict
680 """Datasets that are consumed when running this pipeline and looked up
681 per-Quantum when generating the graph (`_DatasetDict`).
682 """
684 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
685 """Datasets that should be used as constraints in the initial query,
686 according to tasks (`NamedValueSet`).
687 """
689 dimensions: DimensionGraph
690 """All dimensions used by any regular input, intermediate, or output
691 (not prerequisite) dataset; the set of dimensions used in the "Big Join
692 Query" (`DimensionGraph`).
694 This is required to be a superset of all task quantum dimensions.
695 """
697 globalInitOutputs: _DatasetDict | None = None
698 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
699 """
701 @contextmanager
702 def connectDataIds(
703 self,
704 registry: Registry,
705 collections: Any,
706 userQuery: Optional[str],
707 externalDataId: DataCoordinate,
708 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
709 bind: Optional[Mapping[str, Any]] = None,
710 ) -> Iterator[DataCoordinateQueryResults]:
711 """Query for the data IDs that connect nodes in the `QuantumGraph`.
713 This method populates `_TaskScaffolding.dataIds` and
714 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
716 Parameters
717 ----------
718 registry : `lsst.daf.butler.Registry`
719 Registry for the data repository; used for all data ID queries.
720 collections
721 Expressions representing the collections to search for input
722 datasets. See :ref:`daf_butler_ordered_collection_searches`.
723 userQuery : `str` or `None`
724 User-provided expression to limit the data IDs processed.
725 externalDataId : `DataCoordinate`
726 Externally-provided data ID that should be used to restrict the
727 results, just as if these constraints had been included via ``AND``
728 in ``userQuery``. This includes (at least) any instrument named
729 in the pipeline definition.
730 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
731 The query constraint variant that should be used to constrain the
732 query based on dataset existence, defaults to
733 `DatasetQueryConstraintVariant.ALL`.
734 bind : `Mapping`, optional
735 Mapping containing literal values that should be injected into the
736 ``userQuery`` expression, keyed by the identifiers they replace.
738 Returns
739 -------
740 commonDataIds : \
741 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
742 An interface to a database temporary table containing all data IDs
743 that will appear in this `QuantumGraph`. Returned inside a
744 context manager, which will drop the temporary table at the end of
745 the `with` block in which this method is called.
746 """
747 _LOG.debug("Building query for data IDs.")
748 # Initialization datasets always have empty data IDs.
749 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
750 for datasetType, refs in itertools.chain(
751 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
752 ):
753 with warnings.catch_warnings():
754 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
755 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
756 # Run one big query for the data IDs for task dimensions and regular
757 # inputs and outputs. We limit the query to only dimensions that are
758 # associated with the input dataset types, but don't (yet) try to
759 # obtain the dataset_ids for those inputs.
760 _LOG.debug(
761 "Submitting data ID query over dimensions %s and materializing results.",
762 list(self.dimensions.names),
763 )
764 queryArgs: Dict[str, Any] = {
765 "dimensions": self.dimensions,
766 "where": userQuery,
767 "dataId": externalDataId,
768 "bind": bind,
769 }
770 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
771 _LOG.debug(
772 "Constraining graph query using default of %s.",
773 list(self.defaultDatasetQueryConstraints.names),
774 )
775 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
776 queryArgs["collections"] = collections
777 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
778 _LOG.debug("Not using dataset existence to constrain query.")
779 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
780 constraint = set(datasetQueryConstraint)
781 inputs = {k.name: k for k in self.inputs.keys()}
782 if remainder := constraint.difference(inputs.keys()):
783 raise ValueError(
784 f"{remainder} dataset type(s) specified as a graph constraint, but"
785 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
786 )
787 _LOG.debug(f"Constraining graph query using {constraint}")
788 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
789 queryArgs["collections"] = collections
790 else:
791 raise ValueError(
792 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
793 )
795 if "datasets" in queryArgs:
796 for i, dataset_type in enumerate(queryArgs["datasets"]):
797 if dataset_type.isComponent():
798 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
800 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
801 _LOG.debug("Expanding data IDs.")
802 commonDataIds = commonDataIds.expanded()
803 _LOG.debug("Iterating over query results to associate quanta with datasets.")
804 # Iterate over query results, populating data IDs for datasets and
805 # quanta and then connecting them to each other.
806 n = -1
807 for n, commonDataId in enumerate(commonDataIds):
808 # Create DatasetRefs for all DatasetTypes from this result row,
809 # noting that we might have created some already.
810 # We remember both those that already existed and those that we
811 # create now.
812 refsForRow = {}
813 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
814 for datasetType, refs in itertools.chain(
815 self.inputs.items(), self.intermediates.items(), self.outputs.items()
816 ):
817 datasetDataId: Optional[DataCoordinate]
818 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
819 datasetDataId = commonDataId.subset(datasetType.dimensions)
820 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
821 ref = refs.get(datasetDataId)
822 if ref is None:
823 with warnings.catch_warnings():
824 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
825 ref = DatasetRef(datasetType, datasetDataId)
826 refs[datasetDataId] = ref
827 refsForRow[datasetType.name] = ref
828 # Create _QuantumScaffolding objects for all tasks from this
829 # result row, noting that we might have created some already.
830 for task in self.tasks:
831 quantumDataId = commonDataId.subset(task.dimensions)
832 quantum = task.quanta.get(quantumDataId)
833 if quantum is None:
834 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
835 task.quanta[quantumDataId] = quantum
836 # Whether this is a new quantum or an existing one, we can
837 # now associate the DatasetRefs for this row with it. The
838 # fact that a Quantum data ID and a dataset data ID both
839 # came from the same result row is what tells us they
840 # should be associated.
841 # Many of these associations will be duplicates (because
842 # another query row that differed from this one only in
843 # irrelevant dimensions already added them), and we use
844 # sets to skip.
845 for datasetType in task.inputs:
846 ref = refsForRow[datasetType.name]
847 quantum.inputs[datasetType.name][ref.dataId] = ref
848 for datasetType in task.outputs:
849 ref = refsForRow[datasetType.name]
850 quantum.outputs[datasetType.name][ref.dataId] = ref
851 if n < 0:
852 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
853 emptiness_explained = False
854 for message in commonDataIds.explain_no_results():
855 _LOG.critical(message)
856 emptiness_explained = True
857 if not emptiness_explained:
858 _LOG.critical(
859 "To reproduce this query for debugging purposes, run "
860 "Registry.queryDataIds with these arguments:"
861 )
862 # We could just repr() the queryArgs dict to get something
863 # the user could make sense of, but it's friendlier to
864 # put these args in an easier-to-construct equivalent form
865 # so they can read it more easily and copy and paste into
866 # a Python terminal.
867 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
868 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
869 if queryArgs["where"]:
870 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
871 if "datasets" in queryArgs:
872 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
873 if "collections" in queryArgs:
874 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
875 _LOG.debug("Finished processing %d rows from data ID query.", n)
876 yield commonDataIds
878 def resolveDatasetRefs(
879 self,
880 registry: Registry,
881 collections: Any,
882 run: Optional[str],
883 commonDataIds: DataCoordinateQueryResults,
884 *,
885 skipExistingIn: Any = None,
886 clobberOutputs: bool = True,
887 constrainedByAllDatasets: bool = True,
888 resolveRefs: bool = False,
889 ) -> None:
890 """Perform follow up queries for each dataset data ID produced in
891 `fillDataIds`.
893 This method populates `_DatasetScaffolding.refs` (except for those in
894 `prerequisites`).
896 Parameters
897 ----------
898 registry : `lsst.daf.butler.Registry`
899 Registry for the data repository; used for all data ID queries.
900 collections
901 Expressions representing the collections to search for input
902 datasets. See :ref:`daf_butler_ordered_collection_searches`.
903 run : `str`, optional
904 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
905 output datasets, if it already exists.
906 commonDataIds : \
907 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
908 Result of a previous call to `connectDataIds`.
909 skipExistingIn
910 Expressions representing the collections to search for existing
911 output datasets that should be skipped. See
912 :ref:`daf_butler_ordered_collection_searches` for allowed types.
913 `None` or empty string/sequence disables skipping.
914 clobberOutputs : `bool`, optional
915 If `True` (default), allow quanta to be created even if outputs exist;
916 this requires the same behavior to be enabled when
917 executing. If ``skipExistingIn`` is not `None`, completed quanta
918 (those with metadata, or all outputs if there is no metadata
919 dataset configured) will be skipped rather than clobbered.
920 constrainedByAllDatasets : `bool`, optional
921 Indicates if the commonDataIds were generated with a constraint on
922 all dataset types.
923 resolveRefs : `bool`, optional
924 If `True` then resolve all input references and generate random
925 dataset IDs for all output and intermediate datasets. If `True`,
926 the ``run`` collection must be specified.
928 Raises
929 ------
930 OutputExistsError
931 Raised if an output dataset already exists in the output run
932 and ``skipExistingIn`` does not include output run, or if only
933 some outputs are present and ``clobberOutputs`` is `False`.
934 """
935 # Run may be provided but it does not have to exist; in that case we
936 # use it for resolving references but don't check it for existing refs.
937 run_exists = False
938 if run:
939 try:
940 run_exists = bool(registry.queryCollections(run))
941 except MissingCollectionError:
942 # An undocumented exception is raised if it does not exist.
943 pass
945 skip_collections_wildcard: CollectionWildcard | None = None
946 skipExistingInRun = False
947 if skipExistingIn:
948 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
949 if run_exists:
950 # As an optimization, check the explicit list of names first.
951 skipExistingInRun = run in skip_collections_wildcard.strings
952 if not skipExistingInRun:
953 # need to flatten it and check again
954 skipExistingInRun = run in registry.queryCollections(
955 skipExistingIn,
956 collectionTypes=CollectionType.RUN,
957 )
959 idMaker: Optional[_DatasetIdMaker] = None
960 if resolveRefs:
961 assert run is not None, "run cannot be None when resolveRefs is True"
962 idMaker = _DatasetIdMaker(registry, run)
964 resolvedRefQueryResults: Iterable[DatasetRef]
966 # Updating constrainedByAllDatasets here is not ideal, but we have a
967 # few different code paths that each transfer different pieces of
968 # information about what dataset query constraints were applied here,
969 # and none of them has the complete picture until we get here. We're
970 # long overdue for a QG generation rewrite that will make this go away
971 # entirely anyway.
972 constrainedByAllDatasets = (
973 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
974 )
976 # Look up [init] intermediate and output datasets in the output
977 # collection, if there is an output collection.
978 if run_exists or skip_collections_wildcard is not None:
979 for datasetType, refs in itertools.chain(
980 self.initIntermediates.items(),
981 self.initOutputs.items(),
982 self.intermediates.items(),
983 self.outputs.items(),
984 ):
985 _LOG.debug(
986 "Resolving %d datasets for intermediate and/or output dataset %s.",
987 len(refs),
988 datasetType.name,
989 )
990 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
991 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
992 # TODO: this assert incorrectly bans component inputs;
993 # investigate on DM-33027.
994 # assert not datasetType.isComponent(), \
995 # "Output datasets cannot be components."
996 #
997 # Instead we have to handle them manually to avoid a
998 # deprecation warning, but it is at least confusing and
999 # possibly a bug for components to appear here at all.
1000 if datasetType.isComponent():
1001 parent_dataset_type = datasetType.makeCompositeDatasetType()
1002 component = datasetType.component()
1003 else:
1004 parent_dataset_type = datasetType
1005 component = None
1007 # look at RUN collection first
1008 if run_exists:
1009 try:
1010 resolvedRefQueryResults = subset.findDatasets(
1011 parent_dataset_type, collections=run, findFirst=True
1012 )
1013 except MissingDatasetTypeError:
1014 resolvedRefQueryResults = []
1015 for resolvedRef in resolvedRefQueryResults:
1016 # TODO: we could easily support per-DatasetType
1017 # skipExisting and I could imagine that being useful -
1018 # it's probably required in order to support writing
1019 # initOutputs before QuantumGraph generation.
1020 assert resolvedRef.dataId in refs
1021 if not (skipExistingInRun or isInit or clobberOutputs):
1022 raise OutputExistsError(
1023 f"Output dataset {datasetType.name} already exists in "
1024 f"output RUN collection '{run}' with data ID"
1025 f" {resolvedRef.dataId}."
1026 )
1027 # If we are going to resolve all outputs then we have
1028 # to remember existing ones to avoid generating new
1029 # dataset IDs for them.
1030 if resolveRefs:
1031 refs[resolvedRef.dataId] = (
1032 resolvedRef.makeComponentRef(component)
1033 if component is not None
1034 else resolvedRef
1035 )
1037 # Also check skipExistingIn; if the RUN collection is in it,
1038 # that case was handled above.
1039 if skip_collections_wildcard is not None:
1040 try:
1041 resolvedRefQueryResults = subset.findDatasets(
1042 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True
1043 )
1044 except MissingDatasetTypeError:
1045 resolvedRefQueryResults = []
1046 for resolvedRef in resolvedRefQueryResults:
1047 assert resolvedRef.dataId in refs
1048 refs[resolvedRef.dataId] = (
1049 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1050 )
1052 # Look up input and initInput datasets in the input collection(s).
1053 # Container to accumulate unfound refs, if the common data IDs were not
1054 # constrained on dataset type existence.
1055 self.unfoundRefs = set()
1056 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1057 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
1058 if datasetType.isComponent():
1059 parent_dataset_type = datasetType.makeCompositeDatasetType()
1060 component = datasetType.component()
1061 else:
1062 parent_dataset_type = datasetType
1063 component = None
1064 try:
1065 resolvedRefQueryResults = commonDataIds.subset(
1066 datasetType.dimensions, unique=True
1067 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1068 except MissingDatasetTypeError:
1069 resolvedRefQueryResults = []
1070 dataIdsNotFoundYet = set(refs.keys())
1071 for resolvedRef in resolvedRefQueryResults:
1072 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1073 refs[resolvedRef.dataId] = (
1074 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1075 )
1076 if dataIdsNotFoundYet:
1077 if constrainedByAllDatasets:
1078 raise RuntimeError(
1079 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1080 f"'{datasetType.name}' was/were present in a previous "
1081 "query, but could not be found now. "
1082 "This is either a logic bug in QuantumGraph generation "
1083 "or the input collections have been modified since "
1084 "QuantumGraph generation began."
1085 )
1086 elif not datasetType.dimensions:
1087 raise RuntimeError(
1088 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1089 f"collections {collections}."
1090 )
1091 else:
1092 # If the common data IDs were not constrained using all the
1093 # input dataset types, it is possible that some data IDs
1094 # found don't correspond to existing datasets, and they
1095 # will be unresolved. Mark these for later pruning from
1096 # the quantum graph.
1097 for k in dataIdsNotFoundYet:
1098 self.unfoundRefs.add(refs[k])
1100 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1101 # replacing the unresolved refs there, and then look up prerequisites.
1102 for task in self.tasks:
1103 _LOG.debug(
1104 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1105 len(task.quanta),
1106 task.taskDef.label,
1107 )
1108 # The way iterConnections is designed makes it impossible to
1109 # annotate precisely enough to satisfy MyPy here.
1110 lookupFunctions = {
1111 c.name: c.lookupFunction # type: ignore
1112 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1113 if c.lookupFunction is not None # type: ignore
1114 }
1115 dataIdsFailed = []
1116 dataIdsSucceeded = []
1117 for quantum in task.quanta.values():
1118 # Process output datasets only if skipExistingIn is not None
1119 # or there is a run to look for outputs in and clobberOutputs
1120 # is True. Note that if skipExistingIn is None, any output
1121 # datasets that already exist would have already caused an
1122 # exception to be raised. We never update the DatasetRefs in
1123 # the quantum because those should never be resolved.
1124 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1125 resolvedRefs = []
1126 unresolvedRefs = []
1127 haveMetadata = False
1128 for datasetType, originalRefs in quantum.outputs.items():
1129 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1130 if ref.id is not None:
1131 resolvedRefs.append(ref)
1132 if datasetType.name == task.taskDef.metadataDatasetName:
1133 haveMetadata = True
1134 else:
1135 unresolvedRefs.append(ref)
1136 if resolvedRefs:
1137 if haveMetadata or not unresolvedRefs:
1138 dataIdsSucceeded.append(quantum.dataId)
1139 if skip_collections_wildcard is not None:
1140 continue
1141 else:
1142 dataIdsFailed.append(quantum.dataId)
1143 if not clobberOutputs:
1144 raise OutputExistsError(
1145 f"Quantum {quantum.dataId} of task with label "
1146 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1147 f"({resolvedRefs}) "
1148 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1149 "and clobbering outputs was not enabled."
1150 )
1151 # Update the input DatasetRefs to the resolved ones we already
1152 # searched for.
1153 for datasetType, input_refs in quantum.inputs.items():
1154 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1155 input_refs[ref.dataId] = ref
1156 # Look up prerequisite datasets in the input collection(s).
1157 # These may have dimensions that extend beyond those we queried
1158 # for originally, because we want to permit those data ID
1159 # values to differ across quanta and dataset types.
1160 for datasetType in task.prerequisites:
1161 if datasetType.isComponent():
1162 parent_dataset_type = datasetType.makeCompositeDatasetType()
1163 component = datasetType.component()
1164 else:
1165 parent_dataset_type = datasetType
1166 component = None
1167 lookupFunction = lookupFunctions.get(datasetType.name)
1168 if lookupFunction is not None:
1169 # PipelineTask has provided its own function to do the
1170 # lookup. This always takes precedence.
1171 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1172 elif (
1173 datasetType.isCalibration()
1174 and datasetType.dimensions <= quantum.dataId.graph
1175 and quantum.dataId.graph.temporal
1176 ):
1177 # This is a master calibration lookup, which we have to
1178 # handle specially because the query system can't do a
1179 # temporal join on a non-dimension-based timespan yet.
1180 timespan = quantum.dataId.timespan
1181 try:
1182 prereq_ref = registry.findDataset(
1183 parent_dataset_type,
1184 quantum.dataId,
1185 collections=collections,
1186 timespan=timespan,
1187 )
1188 if prereq_ref is not None:
1189 if component is not None:
1190 prereq_ref = prereq_ref.makeComponentRef(component)
1191 prereq_refs = [prereq_ref]
1192 else:
1193 prereq_refs = []
1194 except (KeyError, MissingDatasetTypeError):
1195 # This dataset type is not present in the registry,
1196 # which just means there are no datasets here.
1197 prereq_refs = []
1198 else:
1199 # Most general case.
1200 prereq_refs = [
1201 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1202 for prereq_ref in registry.queryDatasets(
1203 parent_dataset_type,
1204 collections=collections,
1205 dataId=quantum.dataId,
1206 findFirst=True,
1207 ).expanded()
1208 ]
1209 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None}
1210 quantum.prerequisites[datasetType].update(prereq_refs_map)
1211 task.prerequisites[datasetType].update(prereq_refs_map)
1213 # Resolve all quantum inputs and outputs.
1214 if idMaker:
1215 for datasetDict in (quantum.inputs, quantum.outputs):
1216 for refDict in datasetDict.values():
1217 refDict.update(idMaker.resolveDict(refDict))
1219 # Resolve task initInputs and initOutputs.
1220 if idMaker:
1221 for datasetDict in (task.initInputs, task.initOutputs):
1222 for refDict in datasetDict.values():
1223 refDict.update(idMaker.resolveDict(refDict))
1225 # Actually remove any quanta that we decided to skip above.
1226 if dataIdsSucceeded:
1227 if skip_collections_wildcard is not None:
1228 _LOG.debug(
1229 "Pruning successful %d quanta for task with label '%s' because all of their "
1230 "outputs exist or metadata was written successfully.",
1231 len(dataIdsSucceeded),
1232 task.taskDef.label,
1233 )
1234 for dataId in dataIdsSucceeded:
1235 del task.quanta[dataId]
1236 elif clobberOutputs:
1237 _LOG.info(
1238 "Found %d successful quanta for task with label '%s' "
1239 "that will need to be clobbered during execution.",
1240 len(dataIdsSucceeded),
1241 task.taskDef.label,
1242 )
1243 else:
1244 raise AssertionError("OutputExistsError should have already been raised.")
1245 if dataIdsFailed:
1246 if clobberOutputs:
1247 _LOG.info(
1248 "Found %d failed/incomplete quanta for task with label '%s' "
1249 "that will need to be clobbered during execution.",
1250 len(dataIdsFailed),
1251 task.taskDef.label,
1252 )
1253 else:
1254 raise AssertionError("OutputExistsError should have already been raised.")
1256 # Collect initOutputs that do not belong to any task.
1257 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1258 for task in self.tasks:
1259 global_dataset_types -= set(task.initOutputs)
1260 if global_dataset_types:
1261 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1262 if idMaker is not None:
1263 for refDict in self.globalInitOutputs.values():
1264 refDict.update(idMaker.resolveDict(refDict))
1266 def makeQuantumGraph(
1267 self,
1268 registry: Registry,
1269 metadata: Optional[Mapping[str, Any]] = None,
1270 datastore: Optional[Datastore] = None,
1271 ) -> QuantumGraph:
1272 """Create a `QuantumGraph` from the quanta already present in
1273 the scaffolding data structure.
1275 Parameters
1276 ----------
1277 registry : `lsst.daf.butler.Registry`
1278 Registry for the data repository; used for all data ID queries.
1279 metadata : Optional Mapping of `str` to primitives
1280 This is an optional parameter of extra data to carry with the
1281 graph. Entries in this mapping should be able to be serialized in
1282 JSON.
1283 datastore : `Datastore`, optional
1284 If not `None` then fill datastore records in each generated
1285 Quantum.
1287 Returns
1288 -------
1289 graph : `QuantumGraph`
1290 The full `QuantumGraph`.
1291 """
1293 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1294 """Extract all DatasetRefs from the dictionaries"""
1295 for ref_dict in dataset_dict.values():
1296 yield from ref_dict.values()
1298 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1299 if datastore is not None:
1300 datastore_records = datastore.export_records(
1301 itertools.chain(
1302 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1303 )
1304 )
1306 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1307 for task in self.tasks:
1308 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1309 graphInput[task.taskDef] = qset
1311 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1312 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1314 globalInitOutputs: list[DatasetRef] = []
1315 if self.globalInitOutputs is not None:
1316 for refs_dict in self.globalInitOutputs.values():
1317 globalInitOutputs.extend(refs_dict.values())
1319 graph = QuantumGraph(
1320 graphInput,
1321 metadata=metadata,
1322 pruneRefs=self.unfoundRefs,
1323 universe=self.dimensions.universe,
1324 initInputs=taskInitInputs,
1325 initOutputs=taskInitOutputs,
1326 globalInitOutputs=globalInitOutputs,
1327 registryDatasetTypes=self._get_registry_dataset_types(registry),
1328 )
1329 return graph
1331 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1332 """Make a list of all dataset types used by a graph as defined in
1333 registry.
1334 """
1335 chain = [
1336 self.initInputs,
1337 self.initIntermediates,
1338 self.initOutputs,
1339 self.inputs,
1340 self.intermediates,
1341 self.outputs,
1342 self.prerequisites,
1343 ]
1344 if self.globalInitOutputs is not None:
1345 chain.append(self.globalInitOutputs)
1347 # Collect names of all dataset types.
1348 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1349 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1351 # Check for types that do not exist in registry yet:
1352 # - inputs must exist
1353 # - intermediates and outputs may not exist, but there must not be
1354 # more than one definition (e.g. differing in storage class)
1355 # - prerequisites may not exist, treat it the same as outputs here
1356 for dstype in itertools.chain(self.initInputs, self.inputs):
1357 if dstype.name not in dataset_types:
1358 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1360 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1361 chain = [
1362 self.initIntermediates,
1363 self.initOutputs,
1364 self.intermediates,
1365 self.outputs,
1366 self.prerequisites,
1367 ]
1368 if self.globalInitOutputs is not None:
1369 chain.append(self.globalInitOutputs)
1370 for dstype in itertools.chain(*chain):
1371 if dstype.name not in dataset_types:
1372 new_outputs[dstype.name].add(dstype)
1373 for name, dstypes in new_outputs.items():
1374 if len(dstypes) > 1:
1375 raise ValueError(
1376 "Pipeline contains multiple definitions for a dataset type "
1377 f"which is not defined in registry yet: {dstypes}"
1378 )
1379 elif len(dstypes) == 1:
1380 dataset_types[name] = dstypes.pop()
1382 return dataset_types.values()
1385# ------------------------
1386# Exported definitions --
1387# ------------------------
1390class GraphBuilderError(Exception):
1391 """Base class for exceptions generated by graph builder."""
1393 pass
1396class OutputExistsError(GraphBuilderError):
1397 """Exception generated when output datasets already exist."""
1399 pass
1402class PrerequisiteMissingError(GraphBuilderError):
1403 """Exception generated when a prerequisite dataset does not exist."""
1405 pass
1408class GraphBuilder:
1409 """GraphBuilder class is responsible for building task execution graph from
1410 a Pipeline.
1412 Parameters
1413 ----------
1414 registry : `~lsst.daf.butler.Registry`
1415 Registry for the data repository.
1416 skipExistingIn
1417 Expressions representing the collections to search for existing
1418 output datasets that should be skipped. See
1419 :ref:`daf_butler_ordered_collection_searches`.
1420 clobberOutputs : `bool`, optional
1421 If `True` (default), allow quanta to be created even if partial outputs
1422 exist; this requires the same behavior to be enabled when
1423 executing.
1424 datastore : `Datastore`, optional
1425 If not `None` then fill datastore records in each generated Quantum.
1426 """
1428 def __init__(
1429 self,
1430 registry: Registry,
1431 skipExistingIn: Any = None,
1432 clobberOutputs: bool = True,
1433 datastore: Optional[Datastore] = None,
1434 ):
1435 self.registry = registry
1436 self.dimensions = registry.dimensions
1437 self.skipExistingIn = skipExistingIn
1438 self.clobberOutputs = clobberOutputs
1439 self.datastore = datastore
1441 def makeGraph(
1442 self,
1443 pipeline: Union[Pipeline, Iterable[TaskDef]],
1444 collections: Any,
1445 run: Optional[str],
1446 userQuery: Optional[str],
1447 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1448 metadata: Optional[Mapping[str, Any]] = None,
1449 resolveRefs: bool = False,
1450 bind: Optional[Mapping[str, Any]] = None,
1451 ) -> QuantumGraph:
1452 """Create execution graph for a pipeline.
1454 Parameters
1455 ----------
1456 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1457 Pipeline definition, task names/classes and their configs.
1458 collections
1459 Expressions representing the collections to search for input
1460 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1461 run : `str`, optional
1462 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1463 output datasets. The collection does not have to exist and will be
1464 created when the graph is executed.
1465 userQuery : `str`
1466 String which defines a user-defined selection for the registry; should be
1467 empty or `None` if there are no restrictions on data selection.
1468 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1469 The query constraint variant that should be used to constrain the
1470 query based on dataset existence, defaults to
1471 `DatasetQueryConstraintVariant.ALL`.
1472 metadata : Optional Mapping of `str` to primitives
1473 This is an optional parameter of extra data to carry with the
1474 graph. Entries in this mapping should be able to be serialized in
1475 JSON.
1476 resolveRefs : `bool`, optional
1477 If `True` then resolve all input references and generate random
1478 dataset IDs for all output and intermediate datasets. If `True`,
1479 the ``run`` collection must be specified.
1480 bind : `Mapping`, optional
1481 Mapping containing literal values that should be injected into the
1482 ``userQuery`` expression, keyed by the identifiers they replace.
1484 Returns
1485 -------
1486 graph : `QuantumGraph`
1487 The constructed `QuantumGraph`.
1488 Raises
1489 ------
1490 UserExpressionError
1491 Raised when user expression cannot be parsed.
1492 OutputExistsError
1493 Raised when output datasets already exist.
1494 Exception
1495 Other exception types may be raised by underlying registry
1496 classes.
1497 """
1498 if resolveRefs and run is None:
1499 raise ValueError("`resolveRefs` requires `run` parameter.")
1500 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1501 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1502 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1503 instrument_class: Optional[Any] = None
1504 if isinstance(pipeline, Pipeline):
1505 instrument_class_name = pipeline.getInstrument()
1506 if instrument_class_name is not None:
1507 instrument_class = doImportType(instrument_class_name)
1508 pipeline = list(pipeline.toExpandedPipeline())
1509 if instrument_class is not None:
1510 dataId = DataCoordinate.standardize(
1511 instrument=instrument_class.getName(), universe=self.registry.dimensions
1512 )
1513 else:
1514 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1515 with scaffolding.connectDataIds(
1516 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1517 ) as commonDataIds:
1518 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1519 scaffolding.resolveDatasetRefs(
1520 self.registry,
1521 collections,
1522 run,
1523 commonDataIds,
1524 skipExistingIn=self.skipExistingIn,
1525 clobberOutputs=self.clobberOutputs,
1526 constrainedByAllDatasets=condition,
1527 resolveRefs=resolveRefs,
1528 )
1529 return scaffolding.makeQuantumGraph(
1530 registry=self.registry, metadata=metadata, datastore=self.datastore
1531 )
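# Illustrative usage sketch, not part of the original module. It shows how the four
# scaffolding steps described in _PipelineScaffolding are typically driven through
# GraphBuilder.makeGraph. The repository path, pipeline file, collection names, run
# name, and query string below are hypothetical placeholders.
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import Pipeline
#     from lsst.pipe.base.graphBuilder import GraphBuilder
#
#     butler = Butler("/repo/example")
#     pipeline = Pipeline.fromFile("pipelines/example.yaml")
#     builder = GraphBuilder(butler.registry, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/example_run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )
#     # qgraph is the resulting QuantumGraph.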