Coverage for python/lsst/pipe/base/graphBuilder.py: 15%
554 statements
coverage.py v7.2.7, created at 2023-06-02 02:17 -0700
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap, defaultdict
34from collections.abc import Collection, Iterable, Iterator, Mapping
35from contextlib import contextmanager
36from dataclasses import dataclass
37from typing import Any, Optional
39from lsst.daf.butler import (
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
56from lsst.utils import doImportType
58# -----------------------------
59# Imports for other modules --
60# -----------------------------
61from . import automatic_connection_constants as acc
62from ._datasetQueryConstraints import DatasetQueryConstraintVariant
63from ._status import NoWorkFound
64from .connections import AdjustQuantumHelper, iterConnections
65from .graph import QuantumGraph
66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
68# ----------------------------------
69# Local non-exported definitions --
70# ----------------------------------
72_LOG = logging.getLogger(__name__)
75@dataclass
76class _RefHolder:
77 """Placeholder for `DatasetRef` representing a future resolved reference.
79 As we have eliminated unresolved DatasetRefs, we now use `None` to represent
80 a reference that is yet to be resolved. Information about the corresponding
81 dataset type and data coordinate is stored in the `_DatasetDict` mapping.
82 """
84 dataset_type: DatasetType
85 """Dataset type of the dataset to be created later. I need to store it here
86 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared
87 between different compatible dataset types."""
89 ref: DatasetRef | None = None
90 """Dataset reference, initially `None`, created when all datasets are
91 resolved.
92 """
94 @property
95 def resolved_ref(self) -> DatasetRef:
96 """Access resolved reference, should only be called after the
97 reference is set (`DatasetRef`)."""
98 assert self.ref is not None, "Dataset reference is not set."
99 return self.ref
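# The short sketch below is a hypothetical helper (not part of this module)
# illustrating the placeholder-then-resolve pattern described above: a
# `_RefHolder` starts with ``ref = None`` and is filled in later, using the same
# ``DatasetRef`` construction as `_DatasetIdMaker.resolveRef` further down.
def _example_ref_holder_roundtrip(
    dataset_type: DatasetType, data_id: DataCoordinate, run: str
) -> DatasetRef:
    holder = _RefHolder(dataset_type)  # placeholder; ``holder.ref`` is None
    # Later, once the output RUN collection is known, the reference is resolved.
    holder.ref = DatasetRef(dataset_type, data_id, run=run, conform=False)
    return holder.resolved_ref  # only safe to access after ``ref`` has been set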
102class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]):
103 """A custom dictionary that maps `DatasetType` to a nested dictionary of
104 the known `DatasetRef` instances of that type.
106 Parameters
107 ----------
108 args
109 Positional arguments are forwarded to the `dict` constructor.
110 universe : `DimensionUniverse`
111 Universe of all possible dimensions.
112 """
114 def __init__(self, *args: Any, universe: DimensionUniverse):
115 super().__init__(*args)
116 self.universe = universe
118 @classmethod
119 def fromDatasetTypes(
120 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
121 ) -> _DatasetDict:
122 """Construct a dictionary from a flat iterable of `DatasetType` keys.
124 Parameters
125 ----------
126 datasetTypes : `iterable` of `DatasetType`
127 DatasetTypes to use as keys for the dict. Values will be empty
128 dictionaries.
129 universe : `DimensionUniverse`
130 Universe of all possible dimensions.
132 Returns
133 -------
134 dictionary : `_DatasetDict`
135 A new `_DatasetDict` instance.
136 """
137 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
139 @classmethod
140 def fromSubset(
141 cls,
142 datasetTypes: Collection[DatasetType],
143 first: _DatasetDict,
144 *rest: _DatasetDict,
145 ) -> _DatasetDict:
146 """Return a new dictionary by extracting items corresponding to the
147 given keys from one or more existing dictionaries.
149 Parameters
150 ----------
151 datasetTypes : `iterable` of `DatasetType`
152 DatasetTypes to use as keys for the dict. Values will be obtained
153 by lookups against ``first`` and ``rest``.
154 first : `_DatasetDict`
155 Another dictionary from which to extract values.
156 rest
157 Additional dictionaries from which to extract values.
159 Returns
160 -------
161 dictionary : `_DatasetDict`
162 A new dictionary instance.
163 """
164 combined = ChainMap(first, *rest)
166 # Dataset types known to match immediately can be processed
167 # without checks.
168 matches = combined.keys() & set(datasetTypes)
169 _dict = {k: combined[k] for k in matches}
171 if len(_dict) < len(datasetTypes):
172 # Work out which ones are missing.
173 missing_datasetTypes = set(datasetTypes) - _dict.keys()
175 # Get the known names for comparison.
176 combined_by_name = {k.name: k for k in combined}
178 missing = set()
179 incompatible = {}
180 for datasetType in missing_datasetTypes:
181 # The dataset type is not found. It may not be listed,
182 # or it may be present with the same name
183 # but a different definition.
184 if datasetType.name in combined_by_name:
185 # This implies some inconsistency in definitions
186 # for connections. If there is support for storage
187 # class conversion we can let it slide.
188 # At this point we do not know
189 # where the inconsistency is, but trust that
190 # downstream code will be more explicit about input
191 # vs output incompatibilities.
192 existing = combined_by_name[datasetType.name]
193 convertible_to_existing = existing.is_compatible_with(datasetType)
194 convertible_from_existing = datasetType.is_compatible_with(existing)
195 if convertible_to_existing and convertible_from_existing:
196 _LOG.debug(
197 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
198 datasetType.name,
199 datasetType.storageClass_name,
200 existing.storageClass_name,
201 )
202 _dict[datasetType] = combined[existing]
203 elif convertible_to_existing or convertible_from_existing:
204 # We'd need to refactor a fair amount to recognize
205 # whether this is an error or not, so I'm not going to
206 # bother until we need to do that for other reasons
207 # (it won't be too long).
208 _LOG.info(
209 "Dataset type %s is present with multiple only partially-compatible storage "
210 "classes %s and %s.",
211 datasetType.name,
212 datasetType.storageClass_name,
213 existing.storageClass_name,
214 )
215 _dict[datasetType] = combined[existing]
216 else:
217 incompatible[datasetType] = existing
218 else:
219 missing.add(datasetType)
221 if missing or incompatible:
222 reasons = []
223 if missing:
224 reasons.append(
225 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known "
226 f"types: [{', '.join(d.name for d in combined)}]."
227 )
228 if incompatible:
229 for x, y in incompatible.items():
230 reasons.append(f"{x} incompatible with {y}")
231 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
233 return cls(_dict, universe=first.universe)
235 @property
236 def dimensions(self) -> DimensionGraph:
237 """The union of all dimensions used by all dataset types in this
238 dictionary, including implied dependencies (`DimensionGraph`).
239 """
240 base = self.universe.empty
241 if len(self) == 0:
242 return base
243 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
245 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]:
246 """Unpack nested single-element `DatasetRef` dicts into a new
247 mapping with `DatasetType` keys and `DatasetRef` values.
249 This method assumes that each nested dict contains exactly one item, as
250 is the case for all "init" datasets.
252 Parameters
253 ----------
254 storage_classes : `dict` [ `str`, `str` ]
255 Mapping from dataset type name to the storage class to use for that
256 dataset type. These are typically the storage classes declared
257 for a particular task, which may differ from the data repository
258 definitions.
260 Returns
261 -------
262 dictionary : `NamedKeyDict`
263 Dictionary mapping `DatasetType` to `DatasetRef`, with both
264 `DatasetType` instances and string names usable as keys.
265 """
266 return NamedKeyDict(
267 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()}
268 )
270 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
271 """Unpack nested multi-element `DatasetRef` dicts into a new
272 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
274 Parameters
275 ----------
276 storage_classes : `dict` [ `str`, `str` ]
277 Mapping from dataset type name to the storage class to use for that
278 dataset type. These are typically the storage classes declared
279 for a particular task, which may differ from the data repository
280 definitions.
282 Returns
283 -------
284 dictionary : `NamedKeyDict`
285 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
286 both `DatasetType` instances and string names usable as keys.
287 """
288 result = {}
289 for dataset_type, holders in self.items():
290 if (
291 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
292 ) != dataset_type.storageClass_name:
293 dataset_type = dataset_type.overrideStorageClass(override)
294 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()]
295 else:
296 refs = [holder.resolved_ref for holder in holders.values()]
297 result[dataset_type] = refs
298 return NamedKeyDict(result)
300 def extract(
301 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
302 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]:
303 """Iterate over the contained `DatasetRef` instances that match the
304 given `DatasetType` and data IDs.
306 Parameters
307 ----------
308 datasetType : `DatasetType`
309 Dataset type to match.
310 dataIds : `Iterable` [ `DataCoordinate` ]
311 Data IDs to match.
313 Returns
314 -------
315 refs : `Iterator` [ `tuple` [ `DataCoordinate`, `DatasetRef` or `None` ] ]
316 Tuples of data ID and the `DatasetRef` held for it (`None` if not yet
317 resolved), for the given ``datasetType`` and each data ID in ``dataIds``.
318 """
319 refs = self[datasetType]
320 return ((dataId, refs[dataId].ref) for dataId in dataIds)
322 def isdisjoint(self, other: _DatasetDict) -> bool:
323 """Test whether ``self`` and ``other`` have any datasets in common.
325 Datasets are considered in common if they have the same *parent*
326 dataset type name and data ID; storage classes and components are not
327 considered.
328 """
329 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()}
330 for k, v in other.items():
331 parent_name, _ = k.nameAndComponent()
332 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()):
333 return False
334 return True
336 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
337 """Iterate over all DatasetRef instances held by this data structure,
338 assuming that each `_RefHolder` already carries a resolved ref.
339 """
340 for holders_by_data_id in self.values():
341 for holder in holders_by_data_id.values():
342 yield holder.resolved_ref
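# Illustrative sketch (hypothetical helper, not used by this module) of the
# nested structure that `_DatasetDict` manages: each `DatasetType` key maps to a
# ``{DataCoordinate: _RefHolder}`` dict, the shape populated by
# `_PipelineScaffolding.connectDataIds`. The data IDs passed in are assumed to
# already match each dataset type's dimensions.
def _example_populate_dataset_dict(
    dataset_types: Iterable[DatasetType],
    data_ids: Collection[DataCoordinate],
    universe: DimensionUniverse,
) -> _DatasetDict:
    holders = _DatasetDict.fromDatasetTypes(dataset_types, universe=universe)
    for dataset_type, refs in holders.items():
        for data_id in data_ids:
            # Insert unresolved placeholders; they are resolved later by
            # `_DatasetIdMaker.resolveDict`.
            refs.setdefault(data_id, _RefHolder(dataset_type))
    return holders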
345class _QuantumScaffolding:
346 """Helper class aggregating information about a `Quantum`, used when
347 constructing a `QuantumGraph`.
349 See `_PipelineScaffolding` for a top-down description of the full
350 scaffolding data structure.
352 Parameters
353 ----------
354 task : _TaskScaffolding
355 Back-reference to the helper object for the `PipelineTask` this quantum
356 represents an execution of.
357 dataId : `DataCoordinate`
358 Data ID for this quantum.
359 """
361 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
362 self.task = task
363 self.dataId = dataId
364 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
365 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
366 self.prerequisites = _DatasetDict.fromDatasetTypes(
367 task.prerequisites.keys(), universe=dataId.universe
368 )
370 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
372 def __repr__(self) -> str:
373 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
375 task: _TaskScaffolding
376 """Back-reference to the helper object for the `PipelineTask` this quantum
377 represents an execution of.
378 """
380 dataId: DataCoordinate
381 """Data ID for this quantum.
382 """
384 inputs: _DatasetDict
385 """Nested dictionary containing `DatasetRef` inputs to this quantum.
387 This is initialized to map each `DatasetType` to an empty dictionary at
388 construction. Those nested dictionaries are populated (with data IDs as
389 keys) with unresolved `DatasetRef` instances in
390 `_PipelineScaffolding.connectDataIds`.
391 """
393 outputs: _DatasetDict
394 """Nested dictionary containing `DatasetRef` outputs this quantum.
395 """
397 prerequisites: _DatasetDict
398 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
399 quantum.
400 """
402 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
403 """Transform the scaffolding object into a true `Quantum` instance.
405 Parameters
406 ----------
407 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
408 If not `None` then fill datastore records in each generated Quantum
409 using the records from this structure.
411 Returns
412 -------
413 quantum : `Quantum`
414 An actual `Quantum` instance.
415 """
416 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes)
417 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes))
418 # Give the task's Connections class an opportunity to remove some
419 # inputs, or complain if they are unacceptable.
420 # This will raise if one of the check conditions is not met, which is
421 # the intended behavior.
422 # If it raises NoWorkFound, there is a bug in the QG algorithm
423 # or the adjustQuantum is incorrectly trying to make a prerequisite
424 # input behave like a regular input; adjustQuantum should only raise
425 # NoWorkFound if a regular input is missing, and it shouldn't be
426 # possible for us to have generated ``self`` if that's true.
427 helper = AdjustQuantumHelper(
428 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes)
429 )
430 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
431 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes)
432 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
433 if datastore_records is not None:
434 quantum_records = {}
435 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
436 input_refs += list(initInputs.values())
437 input_ids = set(ref.id for ref in input_refs)
438 for datastore_name, records in datastore_records.items():
439 matching_records = records.subset(input_ids)
440 if matching_records is not None:
441 quantum_records[datastore_name] = matching_records
442 return Quantum(
443 taskName=self.task.taskDef.taskName,
444 taskClass=self.task.taskDef.taskClass,
445 dataId=self.dataId,
446 initInputs=initInputs,
447 inputs=helper.inputs,
448 outputs=helper.outputs,
449 datastore_records=quantum_records,
450 )
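# Hypothetical usage sketch (not part of this module): per-datastore records for
# a quantum are obtained by exporting records for its already-resolved inputs,
# as `_PipelineScaffolding.makeQuantumGraph` does further down, and
# ``makeQuantum`` then keeps only the records matching its own input IDs.
def _example_make_quantum_with_records(
    quantum_scaffolding: _QuantumScaffolding, datastore: Datastore
) -> Quantum:
    refs = itertools.chain(
        quantum_scaffolding.inputs.iter_resolved_refs(),
        quantum_scaffolding.prerequisites.iter_resolved_refs(),
    )
    datastore_records = datastore.export_records(refs)
    return quantum_scaffolding.makeQuantum(datastore_records=datastore_records)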
453@dataclass
454class _TaskScaffolding:
455 """Helper class aggregating information about a `PipelineTask`, used when
456 constructing a `QuantumGraph`.
458 See `_PipelineScaffolding` for a top-down description of the full
459 scaffolding data structure.
461 Parameters
462 ----------
463 taskDef : `TaskDef`
464 Data structure that identifies the task class and its config.
465 parent : `_PipelineScaffolding`
466 The parent data structure that will hold the instance being
467 constructed.
468 datasetTypes : `TaskDatasetTypes`
469 Data structure that categorizes the dataset types used by this task.
470 """
472 def __init__(
473 self,
474 taskDef: TaskDef,
475 parent: _PipelineScaffolding,
476 datasetTypes: TaskDatasetTypes,
477 ):
478 universe = parent.dimensions.universe
479 self.taskDef = taskDef
480 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
481 assert self.dimensions.issubset(parent.dimensions)
482 # Initialize _DatasetDicts as subsets of the one or two
483 # corresponding dicts in the parent _PipelineScaffolding.
484 self.initInputs = _DatasetDict.fromSubset(
485 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
486 )
487 self.initOutputs = _DatasetDict.fromSubset(
488 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
489 )
490 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
491 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
492 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
493 self.dataIds: set[DataCoordinate] = set()
494 self.quanta = {}
495 self.storage_classes = {
496 connection.name: connection.storageClass
497 for connection in self.taskDef.connections.allConnections.values()
498 }
499 self.storage_classes[
500 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
501 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS
502 self.storage_classes[
503 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
504 ] = acc.LOG_OUTPUT_STORAGE_CLASS
505 self.storage_classes[
506 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
507 ] = acc.METADATA_OUTPUT_STORAGE_CLASS
509 def __repr__(self) -> str:
510 # Default dataclass-injected __repr__ gets caught in an infinite loop
511 # because of back-references.
512 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
514 taskDef: TaskDef
515 """Data structure that identifies the task class and its config
516 (`TaskDef`).
517 """
519 dimensions: DimensionGraph
520 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
521 """
523 initInputs: _DatasetDict
524 """Dictionary containing information about datasets used to construct this
525 task (`_DatasetDict`).
526 """
528 initOutputs: _DatasetDict
529 """Dictionary containing information about datasets produced as a
530 side-effect of constructing this task (`_DatasetDict`).
531 """
533 inputs: _DatasetDict
534 """Dictionary containing information about datasets used as regular,
535 graph-constraining inputs to this task (`_DatasetDict`).
536 """
538 outputs: _DatasetDict
539 """Dictionary containing information about datasets produced by this task
540 (`_DatasetDict`).
541 """
543 prerequisites: _DatasetDict
544 """Dictionary containing information about input datasets that must be
545 present in the repository before any Pipeline containing this task is run
546 (`_DatasetDict`).
547 """
549 quanta: dict[DataCoordinate, _QuantumScaffolding]
550 """Dictionary mapping data ID to a scaffolding object for the Quantum of
551 this task with that data ID.
552 """
554 storage_classes: dict[str, str]
555 """Mapping from dataset type name to storage class declared by this task.
556 """
558 def makeQuantumSet(
559 self,
560 missing: _DatasetDict,
561 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
562 ) -> set[Quantum]:
563 """Create a `set` of `Quantum` from the information in ``self``.
565 Parameters
566 ----------
567 missing : `_DatasetDict`
568 Input datasets that have not been found.
569 datastore_records : `dict`, optional
570 Records from the datastore to export with quanta.
572 Returns
573 -------
574 nodes : `set` of `Quantum`
575 The `Quantum` elements corresponding to this task.
576 """
577 outputs = set()
578 for q in self.quanta.values():
579 try:
580 tmpQuanta = q.makeQuantum(datastore_records)
581 outputs.add(tmpQuanta)
582 except (NoWorkFound, FileNotFoundError) as exc:
583 if not missing.isdisjoint(q.inputs):
584 # This is a node that is known to be pruned later and
585 # should be left in even though some follow up queries
586 # fail. This allows the pruning to start from this quantum
587 # with known issues, and prune other nodes it touches.
588 inputs = q.inputs.unpackMultiRefs(self.storage_classes)
589 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes))
590 tmpQuantum = Quantum(
591 taskName=q.task.taskDef.taskName,
592 taskClass=q.task.taskDef.taskClass,
593 dataId=q.dataId,
594 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes),
595 inputs=inputs,
596 outputs=q.outputs.unpackMultiRefs(self.storage_classes),
597 )
598 outputs.add(tmpQuantum)
599 else:
600 raise exc
601 return outputs
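# Minimal sketch (hypothetical helper) of how the per-task quantum sets produced
# by ``makeQuantumSet`` are combined into the mapping consumed by `QuantumGraph`,
# mirroring `_PipelineScaffolding.makeQuantumGraph` below.
def _example_collect_quanta(
    tasks: Iterable[_TaskScaffolding], missing: _DatasetDict
) -> dict[TaskDef, set[Quantum]]:
    graph_input: dict[TaskDef, set[Quantum]] = {}
    for task in tasks:
        # Quanta whose inputs are known to be missing are still included here;
        # they are pruned later when the `QuantumGraph` itself is constructed.
        graph_input[task.taskDef] = task.makeQuantumSet(missing=missing)
    return graph_input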
604class _DatasetIdMaker:
605 """Helper class which generates random dataset UUIDs for unresolved
606 datasets.
607 """
609 def __init__(self, run: str):
610 self.run = run
611 # Cache of dataset refs generated so far.
612 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {}
614 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef:
615 # For components we need their parent dataset ID.
616 if dataset_type.isComponent():
617 parent_type = dataset_type.makeCompositeDatasetType()
618 # Parent should be resolved if this is an existing input, or it
619 # should be in the cache already if it is an intermediate.
620 key = parent_type, data_id
621 if key not in self.resolved:
622 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}")
623 parent_ref = self.resolved[key]
624 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False)
626 key = dataset_type, data_id
627 if (resolved := self.resolved.get(key)) is None:
628 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False)
629 self.resolved[key] = resolved
630 return resolved
632 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None:
633 """Resolve all unresolved references in the provided dictionary."""
634 for data_id, holder in refs.items():
635 if holder.ref is None:
636 holder.ref = self.resolveRef(holder.dataset_type, data_id)
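# Hypothetical sketch (not used by this module) of `_DatasetIdMaker` behavior:
# resolving the same dataset type and data ID twice returns the cached ref, and
# a component dataset type reuses the dataset ID of its already-resolved
# composite parent. ``component_type`` is assumed to be a component of
# ``parent_type``; the RUN collection name is a placeholder.
def _example_id_maker(
    parent_type: DatasetType, component_type: DatasetType, data_id: DataCoordinate
) -> bool:
    maker = _DatasetIdMaker(run="u/example/run")
    parent_ref = maker.resolveRef(parent_type, data_id)
    component_ref = maker.resolveRef(component_type, data_id)
    return component_ref.id == parent_ref.id  # component shares the parent's ID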
639@dataclass
640class _PipelineScaffolding:
641 """A helper data structure that organizes the information involved in
642 constructing a `QuantumGraph` for a `Pipeline`.
644 Parameters
645 ----------
646 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
647 Sequence of tasks from which a graph is to be constructed. Must
648 have nested task classes already imported.
649 universe : `DimensionUniverse`
650 Universe of all possible dimensions.
652 Notes
653 -----
654 The scaffolding data structure contains nested data structures for both
655 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
656 data structures are shared between the pipeline-level structure (which
657 aggregates all datasets and categorizes them from the perspective of the
658 complete pipeline) and the individual tasks that use them as inputs and
659 outputs.
661 `QuantumGraph` construction proceeds in four steps, with each corresponding
662 to a different `_PipelineScaffolding` method:
664 1. When `_PipelineScaffolding` is constructed, we extract and categorize
665 the DatasetTypes used by the pipeline (delegating to
666 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
667 nested `_TaskScaffolding` and `_DatasetDict` objects.
669 2. In `connectDataIds`, we construct and run the "Big Join Query", which
670 returns related tuples of all dimensions used to identify any regular
671 input, output, and intermediate datasets (not prerequisites). We then
672 iterate over these tuples of related dimensions, identifying the subsets
673 that correspond to distinct data IDs for each task and dataset type,
674 and then create `_QuantumScaffolding` objects.
676 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
677 dataset data IDs previously identified, transforming unresolved
678 DatasetRefs into resolved DatasetRefs where appropriate. We then look
679 up prerequisite datasets for all quanta.
681 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
682 per-task `_QuantumScaffolding` objects.
683 """
685 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry):
686 _LOG.debug("Initializing data structures for QuantumGraph generation.")
687 self.tasks = []
688 # Aggregate and categorize the DatasetTypes in the Pipeline.
689 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
690 # Construct dictionaries that map those DatasetTypes to structures
691 # that will (later) hold additional information about them.
692 for attr in (
693 "initInputs",
694 "initIntermediates",
695 "initOutputs",
696 "inputs",
697 "intermediates",
698 "outputs",
699 "prerequisites",
700 ):
701 setattr(
702 self,
703 attr,
704 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
705 )
706 self.missing = _DatasetDict(universe=registry.dimensions)
707 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
708 # Aggregate all dimensions for all non-init, non-prerequisite
709 # DatasetTypes. These are the ones we'll include in the big join
710 # query.
711 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
712 # Construct scaffolding nodes for each Task, and add backreferences
713 # to the Task from each DatasetScaffolding node.
714 # Note that there's only one scaffolding node for each DatasetType,
715 # shared by _PipelineScaffolding and all _TaskScaffoldings that
716 # reference it.
717 if isinstance(pipeline, Pipeline):
718 pipeline = pipeline.toExpandedPipeline()
719 self.tasks = [
720 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
721 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
722 ]
724 def __repr__(self) -> str:
725 # Default dataclass-injected __repr__ gets caught in an infinite loop
726 # because of back-references.
727 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
729 tasks: list[_TaskScaffolding]
730 """Scaffolding data structures for each task in the pipeline
731 (`list` of `_TaskScaffolding`).
732 """
734 initInputs: _DatasetDict
735 """Datasets consumed but not produced when constructing the tasks in this
736 pipeline (`_DatasetDict`).
737 """
739 initIntermediates: _DatasetDict
740 """Datasets that are both consumed and produced when constructing the tasks
741 in this pipeline (`_DatasetDict`).
742 """
744 initOutputs: _DatasetDict
745 """Datasets produced but not consumed when constructing the tasks in this
746 pipeline (`_DatasetDict`).
747 """
749 inputs: _DatasetDict
750 """Datasets that are consumed but not produced when running this pipeline
751 (`_DatasetDict`).
752 """
754 intermediates: _DatasetDict
755 """Datasets that are both produced and consumed when running this pipeline
756 (`_DatasetDict`).
757 """
759 outputs: _DatasetDict
760 """Datasets produced but not consumed when when running this pipeline
761 (`_DatasetDict`).
762 """
764 prerequisites: _DatasetDict
765 """Datasets that are consumed when running this pipeline and looked up
766 per-Quantum when generating the graph (`_DatasetDict`).
767 """
769 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
770 """Datasets that should be used as constraints in the initial query,
771 according to tasks (`NamedValueSet`).
772 """
774 dimensions: DimensionGraph
775 """All dimensions used by any regular input, intermediate, or output
776 (not prerequisite) dataset; the set of dimension used in the "Big Join
777 Query" (`DimensionGraph`).
779 This is required to be a superset of all task quantum dimensions.
780 """
782 missing: _DatasetDict
783 """Datasets whose existence was originally predicted but were not
784 actually found.
786 Quanta that require these datasets as inputs will be pruned (recursively)
787 when actually constructing a `QuantumGraph` object.
789 These are currently populated only when the "initial dataset query
790 constraint" does not include all overall-input dataset types, and hence the
791 initial data ID query can include data IDs that it should not.
792 """
794 globalInitOutputs: _DatasetDict | None = None
795 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
796 """
798 @contextmanager
799 def connectDataIds(
800 self,
801 registry: Registry,
802 collections: Any,
803 userQuery: Optional[str],
804 externalDataId: DataCoordinate,
805 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
806 bind: Optional[Mapping[str, Any]] = None,
807 ) -> Iterator[DataCoordinateQueryResults]:
808 """Query for the data IDs that connect nodes in the `QuantumGraph`.
810 This method populates `_TaskScaffolding.dataIds` and
811 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
813 Parameters
814 ----------
815 registry : `lsst.daf.butler.Registry`
816 Registry for the data repository; used for all data ID queries.
817 collections
818 Expressions representing the collections to search for input
819 datasets. See :ref:`daf_butler_ordered_collection_searches`.
820 userQuery : `str` or `None`
821 User-provided expression to limit the data IDs processed.
822 externalDataId : `DataCoordinate`
823 Externally-provided data ID that should be used to restrict the
824 results, just as if these constraints had been included via ``AND``
825 in ``userQuery``. This includes (at least) any instrument named
826 in the pipeline definition.
827 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
828 The query constraint variant that should be used to constrain the
829 query based on dataset existence, defaults to
830 `DatasetQueryConstraintVariant.ALL`.
831 bind : `Mapping`, optional
832 Mapping containing literal values that should be injected into the
833 ``userQuery`` expression, keyed by the identifiers they replace.
835 Returns
836 -------
837 commonDataIds : \
838 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
839 An interface to a database temporary table containing all data IDs
840 that will appear in this `QuantumGraph`. Returned inside a
841 context manager, which will drop the temporary table at the end of
842 the `with` block in which this method is called.
843 """
844 _LOG.debug("Building query for data IDs.")
845 # Initialization datasets always have empty data IDs.
846 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
847 for datasetType, refs in itertools.chain(
848 self.initInputs.items(),
849 self.initIntermediates.items(),
850 self.initOutputs.items(),
851 ):
852 refs[emptyDataId] = _RefHolder(datasetType)
853 # Run one big query for the data IDs for task dimensions and regular
854 # inputs and outputs. We limit the query to only dimensions that are
855 # associated with the input dataset types, but don't (yet) try to
856 # obtain the dataset_ids for those inputs.
857 _LOG.debug(
858 "Submitting data ID query over dimensions %s and materializing results.",
859 list(self.dimensions.names),
860 )
861 queryArgs: dict[str, Any] = {
862 "dimensions": self.dimensions,
863 "where": userQuery,
864 "dataId": externalDataId,
865 "bind": bind,
866 }
867 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
868 _LOG.debug(
869 "Constraining graph query using default of %s.",
870 list(self.defaultDatasetQueryConstraints.names),
871 )
872 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
873 queryArgs["collections"] = collections
874 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
875 _LOG.debug("Not using dataset existence to constrain query.")
876 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
877 constraint = set(datasetQueryConstraint)
878 inputs = {k.name: k for k in self.inputs.keys()}
879 if remainder := constraint.difference(inputs.keys()):
880 raise ValueError(
881 f"{remainder} dataset type(s) specified as a graph constraint, but"
882 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
883 )
884 _LOG.debug(f"Constraining graph query using {constraint}")
885 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
886 queryArgs["collections"] = collections
887 else:
888 raise ValueError(
889 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
890 )
892 if "datasets" in queryArgs:
893 for i, dataset_type in enumerate(queryArgs["datasets"]):
894 if dataset_type.isComponent():
895 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
897 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
898 _LOG.debug("Expanding data IDs.")
899 commonDataIds = commonDataIds.expanded()
900 _LOG.debug("Iterating over query results to associate quanta with datasets.")
901 # Iterate over query results, populating data IDs for datasets and
902 # quanta and then connecting them to each other.
903 n = -1
904 for n, commonDataId in enumerate(commonDataIds):
905 # Create DatasetRefs for all DatasetTypes from this result row,
906 # noting that we might have created some already.
907 # We remember both those that already existed and those that we
908 # create now.
909 refsForRow = {}
910 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {}
911 for datasetType, refs in itertools.chain(
912 self.inputs.items(),
913 self.intermediates.items(),
914 self.outputs.items(),
915 ):
916 datasetDataId: Optional[DataCoordinate]
917 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
918 datasetDataId = commonDataId.subset(datasetType.dimensions)
919 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
920 ref_holder = refs.get(datasetDataId)
921 if ref_holder is None:
922 ref_holder = _RefHolder(datasetType)
923 refs[datasetDataId] = ref_holder
924 refsForRow[datasetType.name] = ref_holder
925 # Create _QuantumScaffolding objects for all tasks from this
926 # result row, noting that we might have created some already.
927 for task in self.tasks:
928 quantumDataId = commonDataId.subset(task.dimensions)
929 quantum = task.quanta.get(quantumDataId)
930 if quantum is None:
931 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
932 task.quanta[quantumDataId] = quantum
933 # Whether this is a new quantum or an existing one, we can
934 # now associate the DatasetRefs for this row with it. The
935 # fact that a Quantum data ID and a dataset data ID both
936 # came from the same result row is what tells us they
937 # should be associated.
938 # Many of these associations will be duplicates (because
939 # another query row that differed from this one only in
940 # irrelevant dimensions already added them), and we use
941 # sets to skip.
942 for datasetType in task.inputs:
943 dataId = dataIdCacheForRow[datasetType.dimensions]
944 ref_holder = refsForRow[datasetType.name]
945 quantum.inputs[datasetType.name][dataId] = ref_holder
946 for datasetType in task.outputs:
947 dataId = dataIdCacheForRow[datasetType.dimensions]
948 ref_holder = refsForRow[datasetType.name]
949 quantum.outputs[datasetType.name][dataId] = ref_holder
950 if n < 0:
951 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
952 emptiness_explained = False
953 for message in commonDataIds.explain_no_results():
954 _LOG.critical(message)
955 emptiness_explained = True
956 if not emptiness_explained:
957 _LOG.critical(
958 "To reproduce this query for debugging purposes, run "
959 "Registry.queryDataIds with these arguments:"
960 )
961 # We could just repr() the queryArgs dict to get something
962 # the user could make sense of, but it's friendlier to
963 # put these args in an easier-to-construct equivalent form
964 # so they can read it more easily and copy and paste into
965 # a Python terminal.
966 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
967 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
968 if queryArgs["where"]:
969 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
970 if "datasets" in queryArgs:
971 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
972 if "collections" in queryArgs:
973 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
974 _LOG.debug("Finished processing %d rows from data ID query.", n)
975 yield commonDataIds
977 def resolveDatasetRefs(
978 self,
979 registry: Registry,
980 collections: Any,
981 run: str,
982 commonDataIds: DataCoordinateQueryResults,
983 *,
984 skipExistingIn: Any = None,
985 clobberOutputs: bool = True,
986 constrainedByAllDatasets: bool = True,
987 ) -> None:
988 """Perform follow up queries for each dataset data ID produced in
989 `fillDataIds`.
991 This method populates `_DatasetScaffolding.refs` (except for those in
992 `prerequisites`).
994 Parameters
995 ----------
996 registry : `lsst.daf.butler.Registry`
997 Registry for the data repository; used for all data ID queries.
998 collections
999 Expressions representing the collections to search for input
1000 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1001 run : `str`
1002 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1003 output datasets, if it already exists.
1004 commonDataIds : \
1005 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
1006 Result of a previous call to `connectDataIds`.
1007 skipExistingIn
1008 Expressions representing the collections to search for existing
1009 output datasets that should be skipped. See
1010 :ref:`daf_butler_ordered_collection_searches` for allowed types.
1011 `None` or empty string/sequence disables skipping.
1012 clobberOutputs : `bool`, optional
1013 If `True` (default), allow quanta to be created even if outputs exist;
1014 this requires the same behavior to be enabled when
1015 executing. If ``skipExistingIn`` is not `None`, completed quanta
1016 (those with metadata, or all outputs if there is no metadata
1017 dataset configured) will be skipped rather than clobbered.
1018 constrainedByAllDatasets : `bool`, optional
1019 Indicates if the commonDataIds were generated with a constraint on
1020 all dataset types.
1022 Raises
1023 ------
1024 OutputExistsError
1025 Raised if an output dataset already exists in the output run
1026 and ``skipExistingIn`` does not include output run, or if only
1027 some outputs are present and ``clobberOutputs`` is `False`.
1028 """
1029 # Run may be provided but it does not have to exist; in that case we
1030 # use it for resolving references but don't check it for existing refs.
1031 run_exists = False
1032 if run:
1033 try:
1034 run_exists = bool(registry.queryCollections(run))
1035 except MissingCollectionError:
1036 # An undocumented exception is raised if it does not exist.
1037 pass
1039 skip_collections_wildcard: CollectionWildcard | None = None
1040 skipExistingInRun = False
1041 if skipExistingIn:
1042 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
1043 if run_exists:
1044 # As an optimization, check the explicit list of names first.
1045 skipExistingInRun = run in skip_collections_wildcard.strings
1046 if not skipExistingInRun:
1047 # need to flatten it and check again
1048 skipExistingInRun = run in registry.queryCollections(
1049 skipExistingIn,
1050 collectionTypes=CollectionType.RUN,
1051 )
1053 idMaker = _DatasetIdMaker(run)
1055 resolvedRefQueryResults: Iterable[DatasetRef]
1057 # Updating constrainedByAllDatasets here is not ideal, but we have a
1058 # few different code paths that each transfer different pieces of
1059 # information about what dataset query constraints were applied here,
1060 # and none of them has the complete picture until we get here. We're
1061 # long overdue for a QG generation rewrite that will make this go away
1062 # entirely anyway.
1063 constrainedByAllDatasets = (
1064 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
1065 )
1067 # Look up [init] intermediate and output datasets in the output
1068 # collection, if there is an output collection.
1069 if run_exists or skip_collections_wildcard is not None:
1070 for datasetType, refs in itertools.chain(
1071 self.initIntermediates.items(),
1072 self.initOutputs.items(),
1073 self.intermediates.items(),
1074 self.outputs.items(),
1075 ):
1076 _LOG.debug(
1077 "Resolving %d datasets for intermediate and/or output dataset %s.",
1078 len(refs),
1079 datasetType.name,
1080 )
1081 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
1082 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
1083 # TODO: this assert incorrectly bans component inputs;
1084 # investigate on DM-33027.
1085 # assert not datasetType.isComponent(), \
1086 # "Output datasets cannot be components."
1087 #
1088 # Instead we have to handle them manually to avoid a
1089 # deprecation warning, but it is at least confusing and
1090 # possibly a bug for components to appear here at all.
1091 if datasetType.isComponent():
1092 parent_dataset_type = datasetType.makeCompositeDatasetType()
1093 component = datasetType.component()
1094 else:
1095 parent_dataset_type = datasetType
1096 component = None
1098 # look at RUN collection first
1099 if run_exists:
1100 try:
1101 resolvedRefQueryResults = subset.findDatasets(
1102 parent_dataset_type, collections=run, findFirst=True
1103 )
1104 except MissingDatasetTypeError:
1105 resolvedRefQueryResults = []
1106 for resolvedRef in resolvedRefQueryResults:
1107 # TODO: we could easily support per-DatasetType
1108 # skipExisting and I could imagine that being useful -
1109 # it's probably required in order to support writing
1110 # initOutputs before QuantumGraph generation.
1111 assert resolvedRef.dataId in refs
1112 if not (skipExistingInRun or isInit or clobberOutputs):
1113 raise OutputExistsError(
1114 f"Output dataset {datasetType.name} already exists in "
1115 f"output RUN collection '{run}' with data ID"
1116 f" {resolvedRef.dataId}."
1117 )
1118 # To resolve all outputs we have to remember existing
1119 # ones to avoid generating new dataset IDs for them.
1120 refs[resolvedRef.dataId].ref = (
1121 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1122 )
1124 # And check skipExistingIn too; if the RUN collection is in
1125 # it, that case is handled above.
1126 if skip_collections_wildcard is not None:
1127 try:
1128 resolvedRefQueryResults = subset.findDatasets(
1129 parent_dataset_type,
1130 collections=skip_collections_wildcard,
1131 findFirst=True,
1132 )
1133 except MissingDatasetTypeError:
1134 resolvedRefQueryResults = []
1135 for resolvedRef in resolvedRefQueryResults:
1136 if resolvedRef.dataId not in refs:
1137 continue
1138 refs[resolvedRef.dataId].ref = (
1139 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1140 )
1142 # Look up input and initInput datasets in the input collection(s). We
1143 # accumulate datasets in self.missing, if the common data IDs were not
1144 # constrained on dataset type existence.
1145 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1146 _LOG.debug(
1147 "Resolving %d datasets for input dataset %s.",
1148 len(refs),
1149 datasetType.name,
1150 )
1151 if datasetType.isComponent():
1152 parent_dataset_type = datasetType.makeCompositeDatasetType()
1153 component = datasetType.component()
1154 else:
1155 parent_dataset_type = datasetType
1156 component = None
1157 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {}
1158 try:
1159 resolvedRefQueryResults = commonDataIds.subset(
1160 datasetType.dimensions, unique=True
1161 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1162 except MissingDatasetTypeError:
1163 resolvedRefQueryResults = []
1164 dataIdsNotFoundYet = set(refs.keys())
1165 for resolvedRef in resolvedRefQueryResults:
1166 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1167 if resolvedRef.dataId not in refs:
1168 continue
1169 refs[resolvedRef.dataId].ref = (
1170 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1171 )
1172 if dataIdsNotFoundYet:
1173 if constrainedByAllDatasets:
1174 raise RuntimeError(
1175 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1176 f"'{datasetType.name}' was/were present in a previous "
1177 "query, but could not be found now. "
1178 "This is either a logic bug in QuantumGraph generation "
1179 "or the input collections have been modified since "
1180 "QuantumGraph generation began."
1181 )
1182 elif not datasetType.dimensions:
1183 raise RuntimeError(
1184 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1185 f"collections {collections}."
1186 )
1187 else:
1188 # If the common dataIds were not constrained using all the
1189 # input dataset types, it is possible that some data ids
1190 # found don't correspond to existing datasets. Mark these
1191 # for later pruning from the quantum graph.
1192 for k in dataIdsNotFoundYet:
1193 missing_for_dataset_type[k] = refs[k]
1194 if missing_for_dataset_type:
1195 self.missing[datasetType] = missing_for_dataset_type
1197 # Resolve the missing refs, just so they look like all of the others;
1198 # in the end other code will make sure they never appear in the QG.
1199 for dataset_type, refDict in self.missing.items():
1200 idMaker.resolveDict(dataset_type, refDict)
1202 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1203 # replacing the unresolved refs there, and then look up prerequisites.
1204 for task in self.tasks:
1205 _LOG.debug(
1206 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1207 len(task.quanta),
1208 task.taskDef.label,
1209 )
1210 # The way iterConnections is designed makes it impossible to
1211 # annotate precisely enough to satisfy MyPy here.
1212 lookupFunctions = {
1213 c.name: c.lookupFunction # type: ignore
1214 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1215 if c.lookupFunction is not None # type: ignore
1216 }
1217 dataIdsFailed = []
1218 dataIdsSucceeded = []
1219 for quantum in task.quanta.values():
1220 # Process output datasets only if skipExistingIn is not None
1221 # or there is a run to look for outputs in and clobberOutputs
1222 # is True. Note that if skipExistingIn is None, any output
1223 # datasets that already exist would have already caused an
1224 # exception to be raised.
1225 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1226 resolvedRefs = []
1227 unresolvedDataIds = []
1228 haveMetadata = False
1229 for datasetType, originalRefs in quantum.outputs.items():
1230 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()):
1231 if ref is not None:
1232 resolvedRefs.append(ref)
1233 originalRefs[dataId].ref = ref
1234 if datasetType.name == task.taskDef.metadataDatasetName:
1235 haveMetadata = True
1236 else:
1237 unresolvedDataIds.append((datasetType, dataId))
1238 if resolvedRefs:
1239 if haveMetadata or not unresolvedDataIds:
1240 dataIdsSucceeded.append(quantum.dataId)
1241 if skip_collections_wildcard is not None:
1242 continue
1243 else:
1244 dataIdsFailed.append(quantum.dataId)
1245 if not clobberOutputs:
1246 raise OutputExistsError(
1247 f"Quantum {quantum.dataId} of task with label "
1248 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1249 f"({resolvedRefs}) "
1250 f"and others that don't ({unresolvedDataIds}), with no metadata output, "
1251 "and clobbering outputs was not enabled."
1252 )
1253 # Update the input DatasetRefs to the resolved ones we already
1254 # searched for.
1255 for datasetType, input_refs in quantum.inputs.items():
1256 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()):
1257 input_refs[data_id].ref = ref
1258 # Look up prerequisite datasets in the input collection(s).
1259 # These may have dimensions that extend beyond those we queried
1260 # for originally, because we want to permit those data ID
1261 # values to differ across quanta and dataset types.
1262 for datasetType in task.prerequisites:
1263 if datasetType.isComponent():
1264 parent_dataset_type = datasetType.makeCompositeDatasetType()
1265 component = datasetType.component()
1266 else:
1267 parent_dataset_type = datasetType
1268 component = None
1269 lookupFunction = lookupFunctions.get(datasetType.name)
1270 if lookupFunction is not None:
1271 # PipelineTask has provided its own function to do the
1272 # lookup. This always takes precedence.
1273 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1274 elif (
1275 datasetType.isCalibration()
1276 and datasetType.dimensions <= quantum.dataId.graph
1277 and quantum.dataId.graph.temporal
1278 ):
1279 # This is a master calibration lookup, which we have to
1280 # handle specially because the query system can't do a
1281 # temporal join on a non-dimension-based timespan yet.
1282 timespan = quantum.dataId.timespan
1283 try:
1284 prereq_ref = registry.findDataset(
1285 parent_dataset_type,
1286 quantum.dataId,
1287 collections=collections,
1288 timespan=timespan,
1289 )
1290 if prereq_ref is not None:
1291 if component is not None:
1292 prereq_ref = prereq_ref.makeComponentRef(component)
1293 prereq_refs = [prereq_ref]
1294 else:
1295 prereq_refs = []
1296 except (KeyError, MissingDatasetTypeError):
1297 # This dataset type is not present in the registry,
1298 # which just means there are no datasets here.
1299 prereq_refs = []
1300 else:
1301 # Most general case.
1302 prereq_refs = [
1303 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1304 for prereq_ref in registry.queryDatasets(
1305 parent_dataset_type,
1306 collections=collections,
1307 dataId=quantum.dataId,
1308 findFirst=True,
1309 ).expanded()
1310 ]
1312 for ref in prereq_refs:
1313 if ref is not None:
1314 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1315 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1317 # Resolve all quantum inputs and outputs.
1318 for datasetDict in (quantum.inputs, quantum.outputs):
1319 for dataset_type, refDict in datasetDict.items():
1320 idMaker.resolveDict(dataset_type, refDict)
1322 # Resolve task initInputs and initOutputs.
1323 for datasetDict in (task.initInputs, task.initOutputs):
1324 for dataset_type, refDict in datasetDict.items():
1325 idMaker.resolveDict(dataset_type, refDict)
1327 # Actually remove any quanta that we decided to skip above.
1328 if dataIdsSucceeded:
1329 if skip_collections_wildcard is not None:
1330 _LOG.debug(
1331 "Pruning successful %d quanta for task with label '%s' because all of their "
1332 "outputs exist or metadata was written successfully.",
1333 len(dataIdsSucceeded),
1334 task.taskDef.label,
1335 )
1336 for dataId in dataIdsSucceeded:
1337 del task.quanta[dataId]
1338 elif clobberOutputs:
1339 _LOG.info(
1340 "Found %d successful quanta for task with label '%s' "
1341 "that will need to be clobbered during execution.",
1342 len(dataIdsSucceeded),
1343 task.taskDef.label,
1344 )
1345 else:
1346 raise AssertionError("OutputExistsError should have already been raised.")
1347 if dataIdsFailed:
1348 if clobberOutputs:
1349 _LOG.info(
1350 "Found %d failed/incomplete quanta for task with label '%s' "
1351 "that will need to be clobbered during execution.",
1352 len(dataIdsFailed),
1353 task.taskDef.label,
1354 )
1355 else:
1356 raise AssertionError("OutputExistsError should have already been raised.")
1358 # Collect initOutputs that do not belong to any task.
1359 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1360 for task in self.tasks:
1361 global_dataset_types -= set(task.initOutputs)
1362 if global_dataset_types:
1363 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1364 for dataset_type, refDict in self.globalInitOutputs.items():
1365 idMaker.resolveDict(dataset_type, refDict)
1367 def makeQuantumGraph(
1368 self,
1369 registry: Registry,
1370 metadata: Optional[Mapping[str, Any]] = None,
1371 datastore: Optional[Datastore] = None,
1372 ) -> QuantumGraph:
1373 """Create a `QuantumGraph` from the quanta already present in
1374 the scaffolding data structure.
1376 Parameters
1377 ----------
1378 registry : `lsst.daf.butler.Registry`
1379 Registry for the data repository; used for all data ID queries.
1380 metadata : Optional Mapping of `str` to primitives
1381 This is an optional parameter of extra data to carry with the
1382 graph. Entries in this mapping should be able to be serialized in
1383 JSON.
1384 datastore : `Datastore`, optional
1385 If not `None` then fill datastore records in each generated
1386 Quantum.
1388 Returns
1389 -------
1390 graph : `QuantumGraph`
1391 The full `QuantumGraph`.
1392 """
1394 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1395 """Extract all DatasetRefs from the dictionaries"""
1396 for ref_dict in dataset_dict.values():
1397 for holder in ref_dict.values():
1398 yield holder.resolved_ref
1400 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1401 if datastore is not None:
1402 datastore_records = datastore.export_records(
1403 itertools.chain(
1404 _make_refs(self.inputs),
1405 _make_refs(self.initInputs),
1406 _make_refs(self.prerequisites),
1407 )
1408 )
1410 graphInput: dict[TaskDef, set[Quantum]] = {}
1411 for task in self.tasks:
1412 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records)
1413 graphInput[task.taskDef] = qset
1415 taskInitInputs = {
1416 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values()
1417 for task in self.tasks
1418 }
1419 taskInitOutputs = {
1420 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values()
1421 for task in self.tasks
1422 }
1424 globalInitOutputs: list[DatasetRef] = []
1425 if self.globalInitOutputs is not None:
1426 for refs_dict in self.globalInitOutputs.values():
1427 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values())
1429 graph = QuantumGraph(
1430 graphInput,
1431 metadata=metadata,
1432 pruneRefs=list(self.missing.iter_resolved_refs()),
1433 universe=self.dimensions.universe,
1434 initInputs=taskInitInputs,
1435 initOutputs=taskInitOutputs,
1436 globalInitOutputs=globalInitOutputs,
1437 registryDatasetTypes=self._get_registry_dataset_types(registry),
1438 )
1439 return graph
1441 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1442 """Make a list of all dataset types used by a graph as defined in
1443 registry.
1444 """
1445 chain = [
1446 self.initInputs,
1447 self.initIntermediates,
1448 self.initOutputs,
1449 self.inputs,
1450 self.intermediates,
1451 self.outputs,
1452 self.prerequisites,
1453 ]
1454 if self.globalInitOutputs is not None:
1455 chain.append(self.globalInitOutputs)
1457 # Collect names of all dataset types.
1458 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1459 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1461 # Check for types that do not exist in registry yet:
1462 # - inputs must exist
1463 # - intermediates and outputs may not exist, but there must not be
1464 # more than one definition (e.g. differing in storage class)
1465 # - prerequisites may not exist, treat it the same as outputs here
1466 for dstype in itertools.chain(self.initInputs, self.inputs):
1467 if dstype.name not in dataset_types:
1468 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1470 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1471 chain = [
1472 self.initIntermediates,
1473 self.initOutputs,
1474 self.intermediates,
1475 self.outputs,
1476 self.prerequisites,
1477 ]
1478 if self.globalInitOutputs is not None:
1479 chain.append(self.globalInitOutputs)
1480 for dstype in itertools.chain(*chain):
1481 if dstype.name not in dataset_types:
1482 new_outputs[dstype.name].add(dstype)
1483 for name, dstypes in new_outputs.items():
1484 if len(dstypes) > 1:
1485 raise ValueError(
1486 "Pipeline contains multiple definitions for a dataset type "
1487 f"which is not defined in registry yet: {dstypes}"
1488 )
1489 elif len(dstypes) == 1:
1490 dataset_types[name] = dstypes.pop()
1492 return dataset_types.values()
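# Hypothetical end-to-end sketch (mirroring ``GraphBuilder.makeGraph`` below) of
# the four scaffolding steps described in the `_PipelineScaffolding` notes:
# construct the scaffolding, connect data IDs with the "Big Join Query", resolve
# dataset references, and assemble the `QuantumGraph`. The ``collections``,
# ``run`` and ``user_query`` arguments are assumed to be supplied by the caller.
def _example_scaffolding_flow(
    pipeline: Pipeline | Iterable[TaskDef],
    registry: Registry,
    collections: Any,
    run: str,
    user_query: str | None,
) -> QuantumGraph:
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)  # step 1
    empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
    with scaffolding.connectDataIds(
        registry, collections, user_query, empty_data_id
    ) as common_data_ids:  # step 2: the "Big Join Query"
        # Step 3: follow-up queries resolve existing inputs/outputs in place.
        scaffolding.resolveDatasetRefs(registry, collections, run, common_data_ids)
        # Step 4: build the graph from the per-task _QuantumScaffolding objects.
        return scaffolding.makeQuantumGraph(registry=registry)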
1495# ------------------------
1496# Exported definitions --
1497# ------------------------
1500class GraphBuilderError(Exception):
1501 """Base class for exceptions generated by graph builder."""
1503 pass
1506class OutputExistsError(GraphBuilderError):
1507 """Exception generated when output datasets already exist."""
1509 pass
1512class PrerequisiteMissingError(GraphBuilderError):
1513 """Exception generated when a prerequisite dataset does not exist."""
1515 pass
1518class GraphBuilder:
1519 """GraphBuilder class is responsible for building task execution graph from
1520 a Pipeline.
1522 Parameters
1523 ----------
1524 registry : `~lsst.daf.butler.Registry`
1525 Data butler instance.
1526 skipExistingIn
1527 Expressions representing the collections to search for existing
1528 output datasets that should be skipped. See
1529 :ref:`daf_butler_ordered_collection_searches`.
1530 clobberOutputs : `bool`, optional
1531 If `True` (default), allow quanta to be created even if partial outputs
1532 exist; this requires the same behavior to be enabled when
1533 executing.
1534 datastore : `Datastore`, optional
1535 If not `None` then fill datastore records in each generated Quantum.
1536 """
1538 def __init__(
1539 self,
1540 registry: Registry,
1541 skipExistingIn: Any = None,
1542 clobberOutputs: bool = True,
1543 datastore: Optional[Datastore] = None,
1544 ):
1545 self.registry = registry
1546 self.dimensions = registry.dimensions
1547 self.skipExistingIn = skipExistingIn
1548 self.clobberOutputs = clobberOutputs
1549 self.datastore = datastore
1551 def makeGraph(
1552 self,
1553 pipeline: Pipeline | Iterable[TaskDef],
1554 collections: Any,
1555 run: str,
1556 userQuery: Optional[str],
1557 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1558 metadata: Optional[Mapping[str, Any]] = None,
1559 bind: Optional[Mapping[str, Any]] = None,
1560 ) -> QuantumGraph:
1561 """Create execution graph for a pipeline.
1563 Parameters
1564 ----------
1565 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1566 Pipeline definition, task names/classes and their configs.
1567 collections
1568 Expressions representing the collections to search for input
1569 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1570 run : `str`
1571 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1572 output datasets. The collection does not have to exist; it will be
1573 created when the graph is executed.
1574 userQuery : `str` or `None`
1575 String which defines the user-provided selection for the registry;
1576 should be empty or `None` if there are no restrictions on data selection.
1577 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1578 The query constraint variant that should be used to constrain the
1579 query based on dataset existence, defaults to
1580 `DatasetQueryConstraintVariant.ALL`.
1581 metadata : Optional Mapping of `str` to primitives
1582 This is an optional parameter of extra data to carry with the
1583 graph. Entries in this mapping should be able to be serialized in
1584 JSON.
1585 bind : `Mapping`, optional
1586 Mapping containing literal values that should be injected into the
1587 ``userQuery`` expression, keyed by the identifiers they replace.
1589 Returns
1590 -------
1591 graph : `QuantumGraph`
1592 The constructed execution graph.
1593 Raises
1594 ------
1595 UserExpressionError
1596 Raised when user expression cannot be parsed.
1597 OutputExistsError
1598 Raised when output datasets already exist.
1599 Exception
1600 Other exceptions types may be raised by underlying registry
1601 classes.
1602 """
1603 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1604 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1605 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1606 instrument_class: Optional[Any] = None
1607 if isinstance(pipeline, Pipeline):
1608 instrument_class_name = pipeline.getInstrument()
1609 if instrument_class_name is not None:
1610 instrument_class = doImportType(instrument_class_name)
1611 pipeline = list(pipeline.toExpandedPipeline())
1612 if instrument_class is not None:
1613 dataId = DataCoordinate.standardize(
1614 instrument=instrument_class.getName(), universe=self.registry.dimensions
1615 )
1616 else:
1617 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1618 with scaffolding.connectDataIds(
1619 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1620 ) as commonDataIds:
1621 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1622 scaffolding.resolveDatasetRefs(
1623 self.registry,
1624 collections,
1625 run,
1626 commonDataIds,
1627 skipExistingIn=self.skipExistingIn,
1628 clobberOutputs=self.clobberOutputs,
1629 constrainedByAllDatasets=condition,
1630 )
1631 return scaffolding.makeQuantumGraph(
1632 registry=self.registry, metadata=metadata, datastore=self.datastore
1633 )
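# Hypothetical usage sketch of the public API above; the repository path,
# collections, RUN name and query string are placeholders. ``OutputExistsError``
# is raised if outputs already exist in ``run`` and neither clobbering nor
# skipping applies.
def _example_build_graph(butler_repo: str, pipeline: Pipeline) -> QuantumGraph:
    from lsst.daf.butler import Butler

    butler = Butler(butler_repo)  # hypothetical data repository
    builder = GraphBuilder(butler.registry, clobberOutputs=True)
    return builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],  # hypothetical input collections
        run="u/user/example_run",  # hypothetical output RUN collection
        userQuery="instrument = 'HSC' AND visit = 12345",  # hypothetical selection
    )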