Coverage for python/lsst/pipe/base/graphBuilder.py: 15%
555 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-17 02:45 -0700
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-17 02:45 -0700
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap, defaultdict
34from collections.abc import Collection, Iterable, Iterator, Mapping
35from contextlib import contextmanager
36from dataclasses import dataclass
37from typing import Any, Optional
39from lsst.daf.butler import (
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
56from lsst.utils import doImportType
58# -----------------------------
59# Imports for other modules --
60# -----------------------------
61from . import automatic_connection_constants as acc
62from ._datasetQueryConstraints import DatasetQueryConstraintVariant
63from ._status import NoWorkFound
64from .connections import AdjustQuantumHelper, iterConnections
65from .graph import QuantumGraph
66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
68# ----------------------------------
69# Local non-exported definitions --
70# ----------------------------------
72_LOG = logging.getLogger(__name__)
75@dataclass
76class _RefHolder:
77 """Placeholder for `DatasetRef` representing a future resolved reference.
79 As we eliminated unresolved DatasetRefs we now use `None` to represent
80 a reference that is yet to be resolved. Information about its corresponding
81 dataset type and coordinate is stored in `_DatasetDict` mapping.
82 """
84 dataset_type: DatasetType
85 """Dataset type of the dataset to be created later. I need to store it here
86 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared
87 between different compatible dataset types."""
89 ref: DatasetRef | None = None
90 """Dataset reference, initially `None`, created when all datasets are
91 resolved.
92 """
94 @property
95 def resolved_ref(self) -> DatasetRef:
96 """Access resolved reference, should only be called after the
97 reference is set (`DatasetRef`)."""
98 assert self.ref is not None, "Dataset reference is not set."
99 return self.ref
102class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]):
103 """A custom dictionary that maps `DatasetType` to a nested dictionary of
104 the known `DatasetRef` instances of that type.
106 Parameters
107 ----------
108 args
109 Positional arguments are forwarded to the `dict` constructor.
110 universe : `DimensionUniverse`
111 Universe of all possible dimensions.
112 """
114 def __init__(self, *args: Any, universe: DimensionUniverse):
115 super().__init__(*args)
116 self.universe = universe
118 @classmethod
119 def fromDatasetTypes(
120 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
121 ) -> _DatasetDict:
122 """Construct a dictionary from a flat iterable of `DatasetType` keys.
124 Parameters
125 ----------
126 datasetTypes : `iterable` of `DatasetType`
127 DatasetTypes to use as keys for the dict. Values will be empty
128 dictionaries.
129 universe : `DimensionUniverse`
130 Universe of all possible dimensions.
132 Returns
133 -------
134 dictionary : `_DatasetDict`
135 A new `_DatasetDict` instance.
136 """
137 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
139 @classmethod
140 def fromSubset(
141 cls,
142 datasetTypes: Collection[DatasetType],
143 first: _DatasetDict,
144 *rest: _DatasetDict,
145 ) -> _DatasetDict:
146 """Return a new dictionary by extracting items corresponding to the
147 given keys from one or more existing dictionaries.
149 Parameters
150 ----------
151 datasetTypes : `iterable` of `DatasetType`
152 DatasetTypes to use as keys for the dict. Values will be obtained
153 by lookups against ``first`` and ``rest``.
154 first : `_DatasetDict`
155 Another dictionary from which to extract values.
156 rest
157 Additional dictionaries from which to extract values.
159 Returns
160 -------
161 dictionary : `_DatasetDict`
162 A new dictionary instance.
163 """
164 combined = ChainMap(first, *rest)
166 # Dataset types known to match immediately can be processed
167 # without checks.
168 matches = combined.keys() & set(datasetTypes)
169 _dict = {k: combined[k] for k in matches}
171 if len(_dict) < len(datasetTypes):
172 # Work out which ones are missing.
173 missing_datasetTypes = set(datasetTypes) - _dict.keys()
175 # Get the known names for comparison.
176 combined_by_name = {k.name: k for k in combined}
178 missing = set()
179 incompatible = {}
180 for datasetType in missing_datasetTypes:
181 # The dataset type is not found. It may not be listed
182 # or it may be that it is there with the same name
183 # but different definition.
184 if datasetType.name in combined_by_name:
185 # This implies some inconsistency in definitions
186 # for connections. If there is support for storage
187 # class conversion we can let it slide.
188 # At this point we do not know
189 # where the inconsistency is but trust that down
190 # stream code will be more explicit about input
191 # vs output incompatibilities.
192 existing = combined_by_name[datasetType.name]
193 convertible_to_existing = existing.is_compatible_with(datasetType)
194 convertible_from_existing = datasetType.is_compatible_with(existing)
195 if convertible_to_existing and convertible_from_existing:
196 _LOG.debug(
197 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
198 datasetType.name,
199 datasetType.storageClass_name,
200 existing.storageClass_name,
201 )
202 _dict[datasetType] = combined[existing]
203 elif convertible_to_existing or convertible_from_existing:
204 # We'd need to refactor a fair amount to recognize
205 # whether this is an error or not, so I'm not going to
206 # bother until we need to do that for other reasons
207 # (it won't be too long).
208 _LOG.info(
209 "Dataset type %s is present with multiple only partially-compatible storage "
210 "classes %s and %s.",
211 datasetType.name,
212 datasetType.storageClass_name,
213 existing.storageClass_name,
214 )
215 _dict[datasetType] = combined[existing]
216 else:
217 incompatible[datasetType] = existing
218 else:
219 missing.add(datasetType)
221 if missing or incompatible:
222 reasons = []
223 if missing:
224 reasons.append(
225 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known "
226 f"types: [{', '.join(d.name for d in combined)}]."
227 )
228 if incompatible:
229 for x, y in incompatible.items():
230 reasons.append(f"{x} incompatible with {y}")
231 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
233 return cls(_dict, universe=first.universe)
235 @property
236 def dimensions(self) -> DimensionGraph:
237 """The union of all dimensions used by all dataset types in this
238 dictionary, including implied dependencies (`DimensionGraph`).
239 """
240 base = self.universe.empty
241 if len(self) == 0:
242 return base
243 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
245 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]:
246 """Unpack nested single-element `DatasetRef` dicts into a new
247 mapping with `DatasetType` keys and `DatasetRef` values.
249 This method assumes that each nest contains exactly one item, as is the
250 case for all "init" datasets.
252 Parameters
253 ----------
254 storage_classes : `dict` [ `str`, `str` ]
255 Mapping from dataset type name to the storage class to use for that
256 dataset type. These are typically the storage classes declared
257 for a particular task, which may differ rom the data repository
258 definitions.
260 Returns
261 -------
262 dictionary : `NamedKeyDict`
263 Dictionary mapping `DatasetType` to `DatasetRef`, with both
264 `DatasetType` instances and string names usable as keys.
265 """
266 return NamedKeyDict(
267 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()}
268 )
270 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
271 """Unpack nested multi-element `DatasetRef` dicts into a new
272 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
274 Parameters
275 ----------
276 storage_classes : `dict` [ `str`, `str` ]
277 Mapping from dataset type name to the storage class to use for that
278 dataset type. These are typically the storage classes declared
279 for a particular task, which may differ rom the data repository
280 definitions.
282 Returns
283 -------
284 dictionary : `NamedKeyDict`
285 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
286 both `DatasetType` instances and string names usable as keys.
287 """
288 result = {}
289 for dataset_type, holders in self.items():
290 if (
291 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
292 ) != dataset_type.storageClass_name:
293 dataset_type = dataset_type.overrideStorageClass(override)
294 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()]
295 else:
296 refs = [holder.resolved_ref for holder in holders.values()]
297 result[dataset_type] = refs
298 return NamedKeyDict(result)
300 def extract(
301 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
302 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]:
303 """Iterate over the contained `DatasetRef` instances that match the
304 given `DatasetType` and data IDs.
306 Parameters
307 ----------
308 datasetType : `DatasetType`
309 Dataset type to match.
310 dataIds : `Iterable` [ `DataCoordinate` ]
311 Data IDs to match.
313 Returns
314 -------
315 refs : `Iterator` [ `DatasetRef` ]
316 DatasetRef instances for which ``ref.datasetType == datasetType``
317 and ``ref.dataId`` is in ``dataIds``.
318 """
319 refs = self[datasetType]
320 return ((dataId, refs[dataId].ref) for dataId in dataIds)
322 def isdisjoint(self, other: _DatasetDict) -> bool:
323 """Test whether ``self`` and ``other`` have any datasets in common.
325 Datasets are considered in common if they have the same *parent*
326 dataset type name and data ID; storage classes and components are not
327 considered.
328 """
329 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()}
330 for k, v in other.items():
331 parent_name, _ = k.nameAndComponent()
332 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()):
333 return False
334 return True
336 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
337 """Iterate over all DatasetRef instances held by this data structure,
338 assuming that each `_RefHolder` already carries are resolved ref.
339 """
340 for holders_by_data_id in self.values():
341 for holder in holders_by_data_id.values():
342 yield holder.resolved_ref
345class _QuantumScaffolding:
346 """Helper class aggregating information about a `Quantum`, used when
347 constructing a `QuantumGraph`.
349 See `_PipelineScaffolding` for a top-down description of the full
350 scaffolding data structure.
352 Parameters
353 ----------
354 task : _TaskScaffolding
355 Back-reference to the helper object for the `PipelineTask` this quantum
356 represents an execution of.
357 dataId : `DataCoordinate`
358 Data ID for this quantum.
359 """
361 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
362 self.task = task
363 self.dataId = dataId
364 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
365 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
366 self.prerequisites = _DatasetDict.fromDatasetTypes(
367 task.prerequisites.keys(), universe=dataId.universe
368 )
370 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
372 def __repr__(self) -> str:
373 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
375 task: _TaskScaffolding
376 """Back-reference to the helper object for the `PipelineTask` this quantum
377 represents an execution of.
378 """
380 dataId: DataCoordinate
381 """Data ID for this quantum.
382 """
384 inputs: _DatasetDict
385 """Nested dictionary containing `DatasetRef` inputs to this quantum.
387 This is initialized to map each `DatasetType` to an empty dictionary at
388 construction. Those nested dictionaries are populated (with data IDs as
389 keys) with unresolved `DatasetRef` instances in
390 `_PipelineScaffolding.connectDataIds`.
391 """
393 outputs: _DatasetDict
394 """Nested dictionary containing `DatasetRef` outputs this quantum.
395 """
397 prerequisites: _DatasetDict
398 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
399 quantum.
400 """
402 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
403 """Transform the scaffolding object into a true `Quantum` instance.
405 Parameters
406 ----------
407 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
408 If not `None` then fill datastore records in each generated Quantum
409 using the records from this structure.
411 Returns
412 -------
413 quantum : `Quantum`
414 An actual `Quantum` instance.
415 """
416 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes)
417 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes))
418 # Give the task's Connections class an opportunity to remove some
419 # inputs, or complain if they are unacceptable.
420 # This will raise if one of the check conditions is not met, which is
421 # the intended behavior.
422 # If it raises NotWorkFound, there is a bug in the QG algorithm
423 # or the adjustQuantum is incorrectly trying to make a prerequisite
424 # input behave like a regular input; adjustQuantum should only raise
425 # NoWorkFound if a regular input is missing, and it shouldn't be
426 # possible for us to have generated ``self`` if that's true.
427 helper = AdjustQuantumHelper(
428 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes)
429 )
430 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
431 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes)
432 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
433 if datastore_records is not None:
434 quantum_records = {}
435 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
436 input_refs += list(initInputs.values())
437 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
438 for datastore_name, records in datastore_records.items():
439 matching_records = records.subset(input_ids)
440 if matching_records is not None:
441 quantum_records[datastore_name] = matching_records
442 return Quantum(
443 taskName=self.task.taskDef.taskName,
444 taskClass=self.task.taskDef.taskClass,
445 dataId=self.dataId,
446 initInputs=initInputs,
447 inputs=helper.inputs,
448 outputs=helper.outputs,
449 datastore_records=quantum_records,
450 )
453@dataclass
454class _TaskScaffolding:
455 """Helper class aggregating information about a `PipelineTask`, used when
456 constructing a `QuantumGraph`.
458 See `_PipelineScaffolding` for a top-down description of the full
459 scaffolding data structure.
461 Parameters
462 ----------
463 taskDef : `TaskDef`
464 Data structure that identifies the task class and its config.
465 parent : `_PipelineScaffolding`
466 The parent data structure that will hold the instance being
467 constructed.
468 datasetTypes : `TaskDatasetTypes`
469 Data structure that categorizes the dataset types used by this task.
470 """
472 def __init__(
473 self,
474 taskDef: TaskDef,
475 parent: _PipelineScaffolding,
476 datasetTypes: TaskDatasetTypes,
477 ):
478 universe = parent.dimensions.universe
479 self.taskDef = taskDef
480 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
481 assert self.dimensions.issubset(parent.dimensions)
482 # Initialize _DatasetDicts as subsets of the one or two
483 # corresponding dicts in the parent _PipelineScaffolding.
484 self.initInputs = _DatasetDict.fromSubset(
485 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
486 )
487 self.initOutputs = _DatasetDict.fromSubset(
488 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
489 )
490 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
491 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
492 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
493 self.dataIds: set[DataCoordinate] = set()
494 self.quanta = {}
495 self.storage_classes = {
496 connection.name: connection.storageClass
497 for connection in self.taskDef.connections.allConnections.values()
498 }
499 self.storage_classes[
500 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
501 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS
502 self.storage_classes[
503 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
504 ] = acc.LOG_OUTPUT_STORAGE_CLASS
505 self.storage_classes[
506 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
507 ] = acc.METADATA_OUTPUT_STORAGE_CLASS
509 def __repr__(self) -> str:
510 # Default dataclass-injected __repr__ gets caught in an infinite loop
511 # because of back-references.
512 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
514 taskDef: TaskDef
515 """Data structure that identifies the task class and its config
516 (`TaskDef`).
517 """
519 dimensions: DimensionGraph
520 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
521 """
523 initInputs: _DatasetDict
524 """Dictionary containing information about datasets used to construct this
525 task (`_DatasetDict`).
526 """
528 initOutputs: _DatasetDict
529 """Dictionary containing information about datasets produced as a
530 side-effect of constructing this task (`_DatasetDict`).
531 """
533 inputs: _DatasetDict
534 """Dictionary containing information about datasets used as regular,
535 graph-constraining inputs to this task (`_DatasetDict`).
536 """
538 outputs: _DatasetDict
539 """Dictionary containing information about datasets produced by this task
540 (`_DatasetDict`).
541 """
543 prerequisites: _DatasetDict
544 """Dictionary containing information about input datasets that must be
545 present in the repository before any Pipeline containing this task is run
546 (`_DatasetDict`).
547 """
549 quanta: dict[DataCoordinate, _QuantumScaffolding]
550 """Dictionary mapping data ID to a scaffolding object for the Quantum of
551 this task with that data ID.
552 """
554 storage_classes: dict[str, str]
555 """Mapping from dataset type name to storage class declared by this task.
556 """
558 def makeQuantumSet(
559 self,
560 missing: _DatasetDict,
561 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
562 ) -> set[Quantum]:
563 """Create a `set` of `Quantum` from the information in ``self``.
565 Parameters
566 ----------
567 missing : `_DatasetDict`
568 Input datasets that have not been found.
569 datastore_records : `dict`
570 Record from the datastore to export with quanta.
572 Returns
573 -------
574 nodes : `set` of `Quantum`
575 The `Quantum` elements corresponding to this task.
576 """
577 outputs = set()
578 for q in self.quanta.values():
579 try:
580 tmpQuanta = q.makeQuantum(datastore_records)
581 outputs.add(tmpQuanta)
582 except (NoWorkFound, FileNotFoundError) as exc:
583 if not missing.isdisjoint(q.inputs):
584 # This is a node that is known to be pruned later and
585 # should be left in even though some follow up queries
586 # fail. This allows the pruning to start from this quantum
587 # with known issues, and prune other nodes it touches.
588 inputs = q.inputs.unpackMultiRefs(self.storage_classes)
589 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes))
590 tmpQuantum = Quantum(
591 taskName=q.task.taskDef.taskName,
592 taskClass=q.task.taskDef.taskClass,
593 dataId=q.dataId,
594 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes),
595 inputs=inputs,
596 outputs=q.outputs.unpackMultiRefs(self.storage_classes),
597 )
598 outputs.add(tmpQuantum)
599 else:
600 raise exc
601 return outputs
604class _DatasetIdMaker:
605 """Helper class which generates random dataset UUIDs for unresolved
606 datasets.
607 """
609 def __init__(self, run: str):
610 self.run = run
611 # Cache of dataset refs generated so far.
612 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {}
614 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef:
615 # For components we need their parent dataset ID.
616 if dataset_type.isComponent():
617 parent_type = dataset_type.makeCompositeDatasetType()
618 # Parent should be resolved if this is an existing input, or it
619 # should be in the cache already if it is an intermediate.
620 key = parent_type, data_id
621 if key not in self.resolved:
622 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}")
623 parent_ref = self.resolved[key]
624 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved"
625 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False)
627 key = dataset_type, data_id
628 if (resolved := self.resolved.get(key)) is None:
629 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False)
630 self.resolved[key] = resolved
631 return resolved
633 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None:
634 """Resolve all unresolved references in the provided dictionary."""
635 for data_id, holder in refs.items():
636 if holder.ref is None:
637 holder.ref = self.resolveRef(holder.dataset_type, data_id)
640@dataclass
641class _PipelineScaffolding:
642 """A helper data structure that organizes the information involved in
643 constructing a `QuantumGraph` for a `Pipeline`.
645 Parameters
646 ----------
647 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
648 Sequence of tasks from which a graph is to be constructed. Must
649 have nested task classes already imported.
650 universe : `DimensionUniverse`
651 Universe of all possible dimensions.
653 Notes
654 -----
655 The scaffolding data structure contains nested data structures for both
656 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
657 data structures are shared between the pipeline-level structure (which
658 aggregates all datasets and categorizes them from the perspective of the
659 complete pipeline) and the individual tasks that use them as inputs and
660 outputs.
662 `QuantumGraph` construction proceeds in four steps, with each corresponding
663 to a different `_PipelineScaffolding` method:
665 1. When `_PipelineScaffolding` is constructed, we extract and categorize
666 the DatasetTypes used by the pipeline (delegating to
667 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
668 nested `_TaskScaffolding` and `_DatasetDict` objects.
670 2. In `connectDataIds`, we construct and run the "Big Join Query", which
671 returns related tuples of all dimensions used to identify any regular
672 input, output, and intermediate datasets (not prerequisites). We then
673 iterate over these tuples of related dimensions, identifying the subsets
674 that correspond to distinct data IDs for each task and dataset type,
675 and then create `_QuantumScaffolding` objects.
677 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
678 dataset data IDs previously identified, transforming unresolved
679 DatasetRefs into resolved DatasetRefs where appropriate. We then look
680 up prerequisite datasets for all quanta.
682 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
683 per-task `_QuantumScaffolding` objects.
684 """
686 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry):
687 _LOG.debug("Initializing data structures for QuantumGraph generation.")
688 self.tasks = []
689 # Aggregate and categorize the DatasetTypes in the Pipeline.
690 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
691 # Construct dictionaries that map those DatasetTypes to structures
692 # that will (later) hold additional information about them.
693 for attr in (
694 "initInputs",
695 "initIntermediates",
696 "initOutputs",
697 "inputs",
698 "intermediates",
699 "outputs",
700 "prerequisites",
701 ):
702 setattr(
703 self,
704 attr,
705 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
706 )
707 self.missing = _DatasetDict(universe=registry.dimensions)
708 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
709 # Aggregate all dimensions for all non-init, non-prerequisite
710 # DatasetTypes. These are the ones we'll include in the big join
711 # query.
712 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
713 # Construct scaffolding nodes for each Task, and add backreferences
714 # to the Task from each DatasetScaffolding node.
715 # Note that there's only one scaffolding node for each DatasetType,
716 # shared by _PipelineScaffolding and all _TaskScaffoldings that
717 # reference it.
718 if isinstance(pipeline, Pipeline):
719 pipeline = pipeline.toExpandedPipeline()
720 self.tasks = [
721 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
722 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
723 ]
725 def __repr__(self) -> str:
726 # Default dataclass-injected __repr__ gets caught in an infinite loop
727 # because of back-references.
728 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
730 tasks: list[_TaskScaffolding]
731 """Scaffolding data structures for each task in the pipeline
732 (`list` of `_TaskScaffolding`).
733 """
735 initInputs: _DatasetDict
736 """Datasets consumed but not produced when constructing the tasks in this
737 pipeline (`_DatasetDict`).
738 """
740 initIntermediates: _DatasetDict
741 """Datasets that are both consumed and produced when constructing the tasks
742 in this pipeline (`_DatasetDict`).
743 """
745 initOutputs: _DatasetDict
746 """Datasets produced but not consumed when constructing the tasks in this
747 pipeline (`_DatasetDict`).
748 """
750 inputs: _DatasetDict
751 """Datasets that are consumed but not produced when running this pipeline
752 (`_DatasetDict`).
753 """
755 intermediates: _DatasetDict
756 """Datasets that are both produced and consumed when running this pipeline
757 (`_DatasetDict`).
758 """
760 outputs: _DatasetDict
761 """Datasets produced but not consumed when when running this pipeline
762 (`_DatasetDict`).
763 """
765 prerequisites: _DatasetDict
766 """Datasets that are consumed when running this pipeline and looked up
767 per-Quantum when generating the graph (`_DatasetDict`).
768 """
770 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
771 """Datasets that should be used as constraints in the initial query,
772 according to tasks (`NamedValueSet`).
773 """
775 dimensions: DimensionGraph
776 """All dimensions used by any regular input, intermediate, or output
777 (not prerequisite) dataset; the set of dimension used in the "Big Join
778 Query" (`DimensionGraph`).
780 This is required to be a superset of all task quantum dimensions.
781 """
783 missing: _DatasetDict
784 """Datasets whose existence was originally predicted but were not
785 actually found.
787 Quanta that require these datasets as inputs will be pruned (recursively)
788 when actually constructing a `QuantumGraph` object.
790 These are currently populated only when the "initial dataset query
791 constraint" does not include all overall-input dataset types, and hence the
792 initial data ID query can include data IDs that it should not.
793 """
795 globalInitOutputs: _DatasetDict | None = None
796 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
797 """
799 @contextmanager
800 def connectDataIds(
801 self,
802 registry: Registry,
803 collections: Any,
804 userQuery: Optional[str],
805 externalDataId: DataCoordinate,
806 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
807 bind: Optional[Mapping[str, Any]] = None,
808 ) -> Iterator[DataCoordinateQueryResults]:
809 """Query for the data IDs that connect nodes in the `QuantumGraph`.
811 This method populates `_TaskScaffolding.dataIds` and
812 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
814 Parameters
815 ----------
816 registry : `lsst.daf.butler.Registry`
817 Registry for the data repository; used for all data ID queries.
818 collections
819 Expressions representing the collections to search for input
820 datasets. See :ref:`daf_butler_ordered_collection_searches`.
821 userQuery : `str` or `None`
822 User-provided expression to limit the data IDs processed.
823 externalDataId : `DataCoordinate`
824 Externally-provided data ID that should be used to restrict the
825 results, just as if these constraints had been included via ``AND``
826 in ``userQuery``. This includes (at least) any instrument named
827 in the pipeline definition.
828 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
829 The query constraint variant that should be used to constraint the
830 query based on dataset existance, defaults to
831 `DatasetQueryConstraintVariant.ALL`.
832 bind : `Mapping`, optional
833 Mapping containing literal values that should be injected into the
834 ``userQuery`` expression, keyed by the identifiers they replace.
836 Returns
837 -------
838 commonDataIds : \
839 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
840 An interface to a database temporary table containing all data IDs
841 that will appear in this `QuantumGraph`. Returned inside a
842 context manager, which will drop the temporary table at the end of
843 the `with` block in which this method is called.
844 """
845 _LOG.debug("Building query for data IDs.")
846 # Initialization datasets always have empty data IDs.
847 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
848 for datasetType, refs in itertools.chain(
849 self.initInputs.items(),
850 self.initIntermediates.items(),
851 self.initOutputs.items(),
852 ):
853 refs[emptyDataId] = _RefHolder(datasetType)
854 # Run one big query for the data IDs for task dimensions and regular
855 # inputs and outputs. We limit the query to only dimensions that are
856 # associated with the input dataset types, but don't (yet) try to
857 # obtain the dataset_ids for those inputs.
858 _LOG.debug(
859 "Submitting data ID query over dimensions %s and materializing results.",
860 list(self.dimensions.names),
861 )
862 queryArgs: dict[str, Any] = {
863 "dimensions": self.dimensions,
864 "where": userQuery,
865 "dataId": externalDataId,
866 "bind": bind,
867 }
868 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
869 _LOG.debug(
870 "Constraining graph query using default of %s.",
871 list(self.defaultDatasetQueryConstraints.names),
872 )
873 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
874 queryArgs["collections"] = collections
875 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
876 _LOG.debug("Not using dataset existence to constrain query.")
877 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
878 constraint = set(datasetQueryConstraint)
879 inputs = {k.name: k for k in self.inputs.keys()}
880 if remainder := constraint.difference(inputs.keys()):
881 raise ValueError(
882 f"{remainder} dataset type(s) specified as a graph constraint, but"
883 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
884 )
885 _LOG.debug(f"Constraining graph query using {constraint}")
886 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
887 queryArgs["collections"] = collections
888 else:
889 raise ValueError(
890 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
891 )
893 if "datasets" in queryArgs:
894 for i, dataset_type in enumerate(queryArgs["datasets"]):
895 if dataset_type.isComponent():
896 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
898 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
899 _LOG.debug("Expanding data IDs.")
900 commonDataIds = commonDataIds.expanded()
901 _LOG.debug("Iterating over query results to associate quanta with datasets.")
902 # Iterate over query results, populating data IDs for datasets and
903 # quanta and then connecting them to each other.
904 n = -1
905 for n, commonDataId in enumerate(commonDataIds):
906 # Create DatasetRefs for all DatasetTypes from this result row,
907 # noting that we might have created some already.
908 # We remember both those that already existed and those that we
909 # create now.
910 refsForRow = {}
911 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {}
912 for datasetType, refs in itertools.chain(
913 self.inputs.items(),
914 self.intermediates.items(),
915 self.outputs.items(),
916 ):
917 datasetDataId: Optional[DataCoordinate]
918 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
919 datasetDataId = commonDataId.subset(datasetType.dimensions)
920 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
921 ref_holder = refs.get(datasetDataId)
922 if ref_holder is None:
923 ref_holder = _RefHolder(datasetType)
924 refs[datasetDataId] = ref_holder
925 refsForRow[datasetType.name] = ref_holder
926 # Create _QuantumScaffolding objects for all tasks from this
927 # result row, noting that we might have created some already.
928 for task in self.tasks:
929 quantumDataId = commonDataId.subset(task.dimensions)
930 quantum = task.quanta.get(quantumDataId)
931 if quantum is None:
932 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
933 task.quanta[quantumDataId] = quantum
934 # Whether this is a new quantum or an existing one, we can
935 # now associate the DatasetRefs for this row with it. The
936 # fact that a Quantum data ID and a dataset data ID both
937 # came from the same result row is what tells us they
938 # should be associated.
939 # Many of these associates will be duplicates (because
940 # another query row that differed from this one only in
941 # irrelevant dimensions already added them), and we use
942 # sets to skip.
943 for datasetType in task.inputs:
944 dataId = dataIdCacheForRow[datasetType.dimensions]
945 ref_holder = refsForRow[datasetType.name]
946 quantum.inputs[datasetType.name][dataId] = ref_holder
947 for datasetType in task.outputs:
948 dataId = dataIdCacheForRow[datasetType.dimensions]
949 ref_holder = refsForRow[datasetType.name]
950 quantum.outputs[datasetType.name][dataId] = ref_holder
951 if n < 0:
952 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
953 emptiness_explained = False
954 for message in commonDataIds.explain_no_results():
955 _LOG.critical(message)
956 emptiness_explained = True
957 if not emptiness_explained:
958 _LOG.critical(
959 "To reproduce this query for debugging purposes, run "
960 "Registry.queryDataIds with these arguments:"
961 )
962 # We could just repr() the queryArgs dict to get something
963 # the user could make sense of, but it's friendlier to
964 # put these args in an easier-to-construct equivalent form
965 # so they can read it more easily and copy and paste into
966 # a Python terminal.
967 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
968 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
969 if queryArgs["where"]:
970 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
971 if "datasets" in queryArgs:
972 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
973 if "collections" in queryArgs:
974 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
975 _LOG.debug("Finished processing %d rows from data ID query.", n)
976 yield commonDataIds
978 def resolveDatasetRefs(
979 self,
980 registry: Registry,
981 collections: Any,
982 run: str,
983 commonDataIds: DataCoordinateQueryResults,
984 *,
985 skipExistingIn: Any = None,
986 clobberOutputs: bool = True,
987 constrainedByAllDatasets: bool = True,
988 ) -> None:
989 """Perform follow up queries for each dataset data ID produced in
990 `fillDataIds`.
992 This method populates `_DatasetScaffolding.refs` (except for those in
993 `prerequisites`).
995 Parameters
996 ----------
997 registry : `lsst.daf.butler.Registry`
998 Registry for the data repository; used for all data ID queries.
999 collections
1000 Expressions representing the collections to search for input
1001 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1002 run : `str`
1003 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1004 output datasets, if it already exists.
1005 commonDataIds : \
1006 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
1007 Result of a previous call to `connectDataIds`.
1008 skipExistingIn
1009 Expressions representing the collections to search for existing
1010 output datasets that should be skipped. See
1011 :ref:`daf_butler_ordered_collection_searches` for allowed types.
1012 `None` or empty string/sequence disables skipping.
1013 clobberOutputs : `bool`, optional
1014 If `True` (default), allow quanta to created even if outputs exist;
1015 this requires the same behavior behavior to be enabled when
1016 executing. If ``skipExistingIn`` is not `None`, completed quanta
1017 (those with metadata, or all outputs if there is no metadata
1018 dataset configured) will be skipped rather than clobbered.
1019 constrainedByAllDatasets : `bool`, optional
1020 Indicates if the commonDataIds were generated with a constraint on
1021 all dataset types.
1023 Raises
1024 ------
1025 OutputExistsError
1026 Raised if an output dataset already exists in the output run
1027 and ``skipExistingIn`` does not include output run, or if only
1028 some outputs are present and ``clobberOutputs`` is `False`.
1029 """
1030 # Run may be provided but it does not have to exist, in that case we
1031 # use it for resolving references but don't check it for existing refs.
1032 run_exists = False
1033 if run:
1034 try:
1035 run_exists = bool(registry.queryCollections(run))
1036 except MissingCollectionError:
1037 # Undocumented exception is raise if it does not exist
1038 pass
1040 skip_collections_wildcard: CollectionWildcard | None = None
1041 skipExistingInRun = False
1042 if skipExistingIn:
1043 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
1044 if run_exists:
1045 # as optimization check in the explicit list of names first
1046 skipExistingInRun = run in skip_collections_wildcard.strings
1047 if not skipExistingInRun:
1048 # need to flatten it and check again
1049 skipExistingInRun = run in registry.queryCollections(
1050 skipExistingIn,
1051 collectionTypes=CollectionType.RUN,
1052 )
1054 idMaker = _DatasetIdMaker(run)
1056 resolvedRefQueryResults: Iterable[DatasetRef]
1058 # Updating constrainedByAllDatasets here is not ideal, but we have a
1059 # few different code paths that each transfer different pieces of
1060 # information about what dataset query constraints were applied here,
1061 # and none of them has the complete picture until we get here. We're
1062 # long overdue for a QG generation rewrite that will make this go away
1063 # entirely anyway.
1064 constrainedByAllDatasets = (
1065 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
1066 )
1068 # Look up [init] intermediate and output datasets in the output
1069 # collection, if there is an output collection.
1070 if run_exists or skip_collections_wildcard is not None:
1071 for datasetType, refs in itertools.chain(
1072 self.initIntermediates.items(),
1073 self.initOutputs.items(),
1074 self.intermediates.items(),
1075 self.outputs.items(),
1076 ):
1077 _LOG.debug(
1078 "Resolving %d datasets for intermediate and/or output dataset %s.",
1079 len(refs),
1080 datasetType.name,
1081 )
1082 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
1083 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
1084 # TODO: this assert incorrectly bans component inputs;
1085 # investigate on DM-33027.
1086 # assert not datasetType.isComponent(), \
1087 # "Output datasets cannot be components."
1088 #
1089 # Instead we have to handle them manually to avoid a
1090 # deprecation warning, but it is at least confusing and
1091 # possibly a bug for components to appear here at all.
1092 if datasetType.isComponent():
1093 parent_dataset_type = datasetType.makeCompositeDatasetType()
1094 component = datasetType.component()
1095 else:
1096 parent_dataset_type = datasetType
1097 component = None
1099 # look at RUN collection first
1100 if run_exists:
1101 try:
1102 resolvedRefQueryResults = subset.findDatasets(
1103 parent_dataset_type, collections=run, findFirst=True
1104 )
1105 except MissingDatasetTypeError:
1106 resolvedRefQueryResults = []
1107 for resolvedRef in resolvedRefQueryResults:
1108 # TODO: we could easily support per-DatasetType
1109 # skipExisting and I could imagine that being useful -
1110 # it's probably required in order to support writing
1111 # initOutputs before QuantumGraph generation.
1112 assert resolvedRef.dataId in refs
1113 if not (skipExistingInRun or isInit or clobberOutputs):
1114 raise OutputExistsError(
1115 f"Output dataset {datasetType.name} already exists in "
1116 f"output RUN collection '{run}' with data ID"
1117 f" {resolvedRef.dataId}."
1118 )
1119 # To resolve all outputs we have to remember existing
1120 # ones to avoid generating new dataset IDs for them.
1121 refs[resolvedRef.dataId].ref = (
1122 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1123 )
1125 # And check skipExistingIn too, if RUN collection is in
1126 # it is handled above
1127 if skip_collections_wildcard is not None:
1128 try:
1129 resolvedRefQueryResults = subset.findDatasets(
1130 parent_dataset_type,
1131 collections=skip_collections_wildcard,
1132 findFirst=True,
1133 )
1134 except MissingDatasetTypeError:
1135 resolvedRefQueryResults = []
1136 for resolvedRef in resolvedRefQueryResults:
1137 if resolvedRef.dataId not in refs:
1138 continue
1139 refs[resolvedRef.dataId].ref = (
1140 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1141 )
1143 # Look up input and initInput datasets in the input collection(s). We
1144 # accumulate datasets in self.missing, if the common data IDs were not
1145 # constrained on dataset type existence.
1146 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1147 _LOG.debug(
1148 "Resolving %d datasets for input dataset %s.",
1149 len(refs),
1150 datasetType.name,
1151 )
1152 if datasetType.isComponent():
1153 parent_dataset_type = datasetType.makeCompositeDatasetType()
1154 component = datasetType.component()
1155 else:
1156 parent_dataset_type = datasetType
1157 component = None
1158 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {}
1159 try:
1160 resolvedRefQueryResults = commonDataIds.subset(
1161 datasetType.dimensions, unique=True
1162 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1163 except MissingDatasetTypeError:
1164 resolvedRefQueryResults = []
1165 dataIdsNotFoundYet = set(refs.keys())
1166 for resolvedRef in resolvedRefQueryResults:
1167 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1168 if resolvedRef.dataId not in refs:
1169 continue
1170 refs[resolvedRef.dataId].ref = (
1171 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1172 )
1173 if dataIdsNotFoundYet:
1174 if constrainedByAllDatasets:
1175 raise RuntimeError(
1176 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1177 f"'{datasetType.name}' was/were present in a previous "
1178 "query, but could not be found now. "
1179 "This is either a logic bug in QuantumGraph generation "
1180 "or the input collections have been modified since "
1181 "QuantumGraph generation began."
1182 )
1183 elif not datasetType.dimensions:
1184 raise RuntimeError(
1185 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1186 f"collections {collections}."
1187 )
1188 else:
1189 # If the common dataIds were not constrained using all the
1190 # input dataset types, it is possible that some data ids
1191 # found don't correspond to existing datasets. Mark these
1192 # for later pruning from the quantum graph.
1193 for k in dataIdsNotFoundYet:
1194 missing_for_dataset_type[k] = refs[k]
1195 if missing_for_dataset_type:
1196 self.missing[datasetType] = missing_for_dataset_type
1198 # Resolve the missing refs, just so they look like all of the others;
1199 # in the end other code will make sure they never appear in the QG.
1200 for dataset_type, refDict in self.missing.items():
1201 idMaker.resolveDict(dataset_type, refDict)
1203 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1204 # replacing the unresolved refs there, and then look up prerequisites.
1205 for task in self.tasks:
1206 _LOG.debug(
1207 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1208 len(task.quanta),
1209 task.taskDef.label,
1210 )
1211 # The way iterConnections is designed makes it impossible to
1212 # annotate precisely enough to satisfy MyPy here.
1213 lookupFunctions = {
1214 c.name: c.lookupFunction # type: ignore
1215 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1216 if c.lookupFunction is not None # type: ignore
1217 }
1218 dataIdsFailed = []
1219 dataIdsSucceeded = []
1220 for quantum in task.quanta.values():
1221 # Process outputs datasets only if skipExistingIn is not None
1222 # or there is a run to look for outputs in and clobberOutputs
1223 # is True. Note that if skipExistingIn is None, any output
1224 # datasets that already exist would have already caused an
1225 # exception to be raised.
1226 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1227 resolvedRefs = []
1228 unresolvedDataIds = []
1229 haveMetadata = False
1230 for datasetType, originalRefs in quantum.outputs.items():
1231 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()):
1232 if ref is not None:
1233 resolvedRefs.append(ref)
1234 originalRefs[dataId].ref = ref
1235 if datasetType.name == task.taskDef.metadataDatasetName:
1236 haveMetadata = True
1237 else:
1238 unresolvedDataIds.append((datasetType, dataId))
1239 if resolvedRefs:
1240 if haveMetadata or not unresolvedDataIds:
1241 dataIdsSucceeded.append(quantum.dataId)
1242 if skip_collections_wildcard is not None:
1243 continue
1244 else:
1245 dataIdsFailed.append(quantum.dataId)
1246 if not clobberOutputs:
1247 raise OutputExistsError(
1248 f"Quantum {quantum.dataId} of task with label "
1249 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1250 f"({resolvedRefs}) "
1251 f"and others that don't ({unresolvedDataIds}), with no metadata output, "
1252 "and clobbering outputs was not enabled."
1253 )
1254 # Update the input DatasetRefs to the resolved ones we already
1255 # searched for.
1256 for datasetType, input_refs in quantum.inputs.items():
1257 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()):
1258 input_refs[data_id].ref = ref
1259 # Look up prerequisite datasets in the input collection(s).
1260 # These may have dimensions that extend beyond those we queried
1261 # for originally, because we want to permit those data ID
1262 # values to differ across quanta and dataset types.
1263 for datasetType in task.prerequisites:
1264 if datasetType.isComponent():
1265 parent_dataset_type = datasetType.makeCompositeDatasetType()
1266 component = datasetType.component()
1267 else:
1268 parent_dataset_type = datasetType
1269 component = None
1270 lookupFunction = lookupFunctions.get(datasetType.name)
1271 if lookupFunction is not None:
1272 # PipelineTask has provided its own function to do the
1273 # lookup. This always takes precedence.
1274 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1275 elif (
1276 datasetType.isCalibration()
1277 and datasetType.dimensions <= quantum.dataId.graph
1278 and quantum.dataId.graph.temporal
1279 ):
1280 # This is a master calibration lookup, which we have to
1281 # handle specially because the query system can't do a
1282 # temporal join on a non-dimension-based timespan yet.
1283 timespan = quantum.dataId.timespan
1284 try:
1285 prereq_ref = registry.findDataset(
1286 parent_dataset_type,
1287 quantum.dataId,
1288 collections=collections,
1289 timespan=timespan,
1290 )
1291 if prereq_ref is not None:
1292 if component is not None:
1293 prereq_ref = prereq_ref.makeComponentRef(component)
1294 prereq_refs = [prereq_ref]
1295 else:
1296 prereq_refs = []
1297 except (KeyError, MissingDatasetTypeError):
1298 # This dataset type is not present in the registry,
1299 # which just means there are no datasets here.
1300 prereq_refs = []
1301 else:
1302 # Most general case.
1303 prereq_refs = [
1304 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1305 for prereq_ref in registry.queryDatasets(
1306 parent_dataset_type,
1307 collections=collections,
1308 dataId=quantum.dataId,
1309 findFirst=True,
1310 ).expanded()
1311 ]
1313 for ref in prereq_refs:
1314 if ref is not None:
1315 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1316 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1318 # Resolve all quantum inputs and outputs.
1319 for datasetDict in (quantum.inputs, quantum.outputs):
1320 for dataset_type, refDict in datasetDict.items():
1321 idMaker.resolveDict(dataset_type, refDict)
1323 # Resolve task initInputs and initOutputs.
1324 for datasetDict in (task.initInputs, task.initOutputs):
1325 for dataset_type, refDict in datasetDict.items():
1326 idMaker.resolveDict(dataset_type, refDict)
1328 # Actually remove any quanta that we decided to skip above.
1329 if dataIdsSucceeded:
1330 if skip_collections_wildcard is not None:
1331 _LOG.debug(
1332 "Pruning successful %d quanta for task with label '%s' because all of their "
1333 "outputs exist or metadata was written successfully.",
1334 len(dataIdsSucceeded),
1335 task.taskDef.label,
1336 )
1337 for dataId in dataIdsSucceeded:
1338 del task.quanta[dataId]
1339 elif clobberOutputs:
1340 _LOG.info(
1341 "Found %d successful quanta for task with label '%s' "
1342 "that will need to be clobbered during execution.",
1343 len(dataIdsSucceeded),
1344 task.taskDef.label,
1345 )
1346 else:
1347 raise AssertionError("OutputExistsError should have already been raised.")
1348 if dataIdsFailed:
1349 if clobberOutputs:
1350 _LOG.info(
1351 "Found %d failed/incomplete quanta for task with label '%s' "
1352 "that will need to be clobbered during execution.",
1353 len(dataIdsFailed),
1354 task.taskDef.label,
1355 )
1356 else:
1357 raise AssertionError("OutputExistsError should have already been raised.")
1359 # Collect initOutputs that do not belong to any task.
1360 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1361 for task in self.tasks:
1362 global_dataset_types -= set(task.initOutputs)
1363 if global_dataset_types:
1364 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1365 for dataset_type, refDict in self.globalInitOutputs.items():
1366 idMaker.resolveDict(dataset_type, refDict)
1368 def makeQuantumGraph(
1369 self,
1370 registry: Registry,
1371 metadata: Optional[Mapping[str, Any]] = None,
1372 datastore: Optional[Datastore] = None,
1373 ) -> QuantumGraph:
1374 """Create a `QuantumGraph` from the quanta already present in
1375 the scaffolding data structure.
1377 Parameters
1378 ---------
1379 registry : `lsst.daf.butler.Registry`
1380 Registry for the data repository; used for all data ID queries.
1381 metadata : Optional Mapping of `str` to primitives
1382 This is an optional parameter of extra data to carry with the
1383 graph. Entries in this mapping should be able to be serialized in
1384 JSON.
1385 datastore : `Datastore`, optional
1386 If not `None` then fill datastore records in each generated
1387 Quantum.
1389 Returns
1390 -------
1391 graph : `QuantumGraph`
1392 The full `QuantumGraph`.
1393 """
1395 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1396 """Extract all DatasetRefs from the dictionaries"""
1397 for ref_dict in dataset_dict.values():
1398 for holder in ref_dict.values():
1399 yield holder.resolved_ref
1401 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1402 if datastore is not None:
1403 datastore_records = datastore.export_records(
1404 itertools.chain(
1405 _make_refs(self.inputs),
1406 _make_refs(self.initInputs),
1407 _make_refs(self.prerequisites),
1408 )
1409 )
1411 graphInput: dict[TaskDef, set[Quantum]] = {}
1412 for task in self.tasks:
1413 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records)
1414 graphInput[task.taskDef] = qset
1416 taskInitInputs = {
1417 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values()
1418 for task in self.tasks
1419 }
1420 taskInitOutputs = {
1421 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values()
1422 for task in self.tasks
1423 }
1425 globalInitOutputs: list[DatasetRef] = []
1426 if self.globalInitOutputs is not None:
1427 for refs_dict in self.globalInitOutputs.values():
1428 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values())
1430 graph = QuantumGraph(
1431 graphInput,
1432 metadata=metadata,
1433 pruneRefs=list(self.missing.iter_resolved_refs()),
1434 universe=self.dimensions.universe,
1435 initInputs=taskInitInputs,
1436 initOutputs=taskInitOutputs,
1437 globalInitOutputs=globalInitOutputs,
1438 registryDatasetTypes=self._get_registry_dataset_types(registry),
1439 )
1440 return graph
1442 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1443 """Make a list of all dataset types used by a graph as defined in
1444 registry.
1445 """
1446 chain = [
1447 self.initInputs,
1448 self.initIntermediates,
1449 self.initOutputs,
1450 self.inputs,
1451 self.intermediates,
1452 self.outputs,
1453 self.prerequisites,
1454 ]
1455 if self.globalInitOutputs is not None:
1456 chain.append(self.globalInitOutputs)
1458 # Collect names of all dataset types.
1459 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1460 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1462 # Check for types that do not exist in registry yet:
1463 # - inputs must exist
1464 # - intermediates and outputs may not exist, but there must not be
1465 # more than one definition (e.g. differing in storage class)
1466 # - prerequisites may not exist, treat it the same as outputs here
1467 for dstype in itertools.chain(self.initInputs, self.inputs):
1468 if dstype.name not in dataset_types:
1469 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1471 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1472 chain = [
1473 self.initIntermediates,
1474 self.initOutputs,
1475 self.intermediates,
1476 self.outputs,
1477 self.prerequisites,
1478 ]
1479 if self.globalInitOutputs is not None:
1480 chain.append(self.globalInitOutputs)
1481 for dstype in itertools.chain(*chain):
1482 if dstype.name not in dataset_types:
1483 new_outputs[dstype.name].add(dstype)
1484 for name, dstypes in new_outputs.items():
1485 if len(dstypes) > 1:
1486 raise ValueError(
1487 "Pipeline contains multiple definitions for a dataset type "
1488 f"which is not defined in registry yet: {dstypes}"
1489 )
1490 elif len(dstypes) == 1:
1491 dataset_types[name] = dstypes.pop()
1493 return dataset_types.values()
1496# ------------------------
1497# Exported definitions --
1498# ------------------------
1501class GraphBuilderError(Exception):
1502 """Base class for exceptions generated by graph builder."""
1504 pass
1507class OutputExistsError(GraphBuilderError):
1508 """Exception generated when output datasets already exist."""
1510 pass
1513class PrerequisiteMissingError(GraphBuilderError):
1514 """Exception generated when a prerequisite dataset does not exist."""
1516 pass
1519class GraphBuilder:
1520 """GraphBuilder class is responsible for building task execution graph from
1521 a Pipeline.
1523 Parameters
1524 ----------
1525 registry : `~lsst.daf.butler.Registry`
1526 Data butler instance.
1527 skipExistingIn
1528 Expressions representing the collections to search for existing
1529 output datasets that should be skipped. See
1530 :ref:`daf_butler_ordered_collection_searches`.
1531 clobberOutputs : `bool`, optional
1532 If `True` (default), allow quanta to created even if partial outputs
1533 exist; this requires the same behavior behavior to be enabled when
1534 executing.
1535 datastore : `Datastore`, optional
1536 If not `None` then fill datastore records in each generated Quantum.
1537 """
1539 def __init__(
1540 self,
1541 registry: Registry,
1542 skipExistingIn: Any = None,
1543 clobberOutputs: bool = True,
1544 datastore: Optional[Datastore] = None,
1545 ):
1546 self.registry = registry
1547 self.dimensions = registry.dimensions
1548 self.skipExistingIn = skipExistingIn
1549 self.clobberOutputs = clobberOutputs
1550 self.datastore = datastore
1552 def makeGraph(
1553 self,
1554 pipeline: Pipeline | Iterable[TaskDef],
1555 collections: Any,
1556 run: str,
1557 userQuery: Optional[str],
1558 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1559 metadata: Optional[Mapping[str, Any]] = None,
1560 bind: Optional[Mapping[str, Any]] = None,
1561 ) -> QuantumGraph:
1562 """Create execution graph for a pipeline.
1564 Parameters
1565 ----------
1566 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1567 Pipeline definition, task names/classes and their configs.
1568 collections
1569 Expressions representing the collections to search for input
1570 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1571 run : `str`
1572 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1573 output datasets. Collection does not have to exist and it will be
1574 created when graph is executed.
1575 userQuery : `str`
1576 String which defines user-defined selection for registry, should be
1577 empty or `None` if there is no restrictions on data selection.
1578 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1579 The query constraint variant that should be used to constraint the
1580 query based on dataset existance, defaults to
1581 `DatasetQueryConstraintVariant.ALL`.
1582 metadata : Optional Mapping of `str` to primitives
1583 This is an optional parameter of extra data to carry with the
1584 graph. Entries in this mapping should be able to be serialized in
1585 JSON.
1586 bind : `Mapping`, optional
1587 Mapping containing literal values that should be injected into the
1588 ``userQuery`` expression, keyed by the identifiers they replace.
1590 Returns
1591 -------
1592 graph : `QuantumGraph`
1594 Raises
1595 ------
1596 UserExpressionError
1597 Raised when user expression cannot be parsed.
1598 OutputExistsError
1599 Raised when output datasets already exist.
1600 Exception
1601 Other exceptions types may be raised by underlying registry
1602 classes.
1603 """
1604 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1605 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1606 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1607 instrument_class: Optional[Any] = None
1608 if isinstance(pipeline, Pipeline):
1609 instrument_class_name = pipeline.getInstrument()
1610 if instrument_class_name is not None:
1611 instrument_class = doImportType(instrument_class_name)
1612 pipeline = list(pipeline.toExpandedPipeline())
1613 if instrument_class is not None:
1614 dataId = DataCoordinate.standardize(
1615 instrument=instrument_class.getName(), universe=self.registry.dimensions
1616 )
1617 else:
1618 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1619 with scaffolding.connectDataIds(
1620 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1621 ) as commonDataIds:
1622 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1623 scaffolding.resolveDatasetRefs(
1624 self.registry,
1625 collections,
1626 run,
1627 commonDataIds,
1628 skipExistingIn=self.skipExistingIn,
1629 clobberOutputs=self.clobberOutputs,
1630 constrainedByAllDatasets=condition,
1631 )
1632 return scaffolding.makeQuantumGraph(
1633 registry=self.registry, metadata=metadata, datastore=self.datastore
1634 )