Coverage for python/lsst/pipe/base/graphBuilder.py: 17%
597 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-08-06 02:28 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-08-06 02:28 +0000
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Module defining GraphBuilder class and related methods.
23"""
25from __future__ import annotations
27__all__ = ["GraphBuilder"]
29# -------------------------------
30# Imports of standard modules --
31# -------------------------------
32import contextlib
33import itertools
34import logging
35from collections import ChainMap, defaultdict
36from collections.abc import Collection, Iterable, Iterator, Mapping
37from contextlib import contextmanager
38from dataclasses import dataclass
39from typing import Any, TypeVar, cast
41from lsst.daf.butler import (
42 CollectionType,
43 DataCoordinate,
44 DatasetRef,
45 DatasetType,
46 Datastore,
47 DatastoreRecordData,
48 DimensionGraph,
49 DimensionUniverse,
50 NamedKeyDict,
51 NamedValueSet,
52 Quantum,
53 Registry,
54 SkyPixDimension,
55)
56from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
57from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
58from lsst.daf.butler.registry.wildcards import CollectionWildcard
59from lsst.sphgeom import PixelizationABC, RangeSet
61# -----------------------------
62# Imports for other modules --
63# -----------------------------
64from . import automatic_connection_constants as acc
65from ._datasetQueryConstraints import DatasetQueryConstraintVariant
66from ._status import NoWorkFound
67from .connections import AdjustQuantumHelper, iterConnections
68from .graph import QuantumGraph
69from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
71# ----------------------------------
72# Local non-exported definitions --
73# ----------------------------------
75_LOG = logging.getLogger(__name__)
78@dataclass
79class _RefHolder:
80 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future
81 resolved reference.
83 As we eliminated unresolved `~lsst.daf.butler.DatasetRef`\s we now use
84 `None` to represent a reference that is yet to be resolved. Information
85 about its corresponding dataset type and coordinate is stored in
86 `_DatasetDict` mapping.
87 """
89 dataset_type: DatasetType
90 """Dataset type of the dataset to be created later. I need to store it here
91 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared
92 between different compatible dataset types."""
94 ref: DatasetRef | None = None
95 """Dataset reference, initially `None`, created when all datasets are
96 resolved.
97 """
99 @property
100 def resolved_ref(self) -> DatasetRef:
101 """Access resolved reference, should only be called after the
102 reference is set (`~lsst.daf.butler.DatasetRef`).
103 """
104 assert self.ref is not None, "Dataset reference is not set."
105 return self.ref
108_Refs = TypeVar("_Refs")
111class _DatasetDictBase(NamedKeyDict[DatasetType, _Refs]):
112 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested
113 collection of the known `~lsst.daf.butler.DatasetRef` instances of that
114 type.
116 Parameters
117 ----------
118 args
119 Positional arguments are forwarded to the `dict` constructor.
120 universe : `~lsst.daf.butler.DimensionUniverse`
121 Universe of all possible dimensions.
122 """
124 def __init__(self, *args: Any, universe: DimensionUniverse):
125 super().__init__(*args)
126 self.universe = universe
128 @classmethod
129 def _fromSubset(
130 cls,
131 datasetTypes: Collection[DatasetType],
132 first: _DatasetDictBase,
133 *rest: _DatasetDictBase,
134 ) -> _DatasetDictBase:
135 """Return a new dictionary by extracting items corresponding to the
136 given keys from one or more existing dictionaries.
138 Parameters
139 ----------
140 datasetTypes : `~collections.abc.Iterable` of \
141 `~lsst.daf.butler.DatasetType`
142 DatasetTypes to use as keys for the dict. Values will be obtained
143 by lookups against ``first`` and ``rest``.
144 first : `_DatasetDictBase`
145 Another dictionary from which to extract values. Its actual type
146 must be idedntical to the type of sub-class used to call this
147 method.
148 rest
149 Additional dictionaries from which to extract values.
151 Returns
152 -------
153 dictionary : `_DatasetDictBase`
154 A new dictionary instance.
155 """
156 combined = ChainMap(first, *rest)
158 # Dataset types known to match immediately can be processed
159 # without checks.
160 matches = combined.keys() & set(datasetTypes)
161 _dict = {k: combined[k] for k in matches}
163 if len(_dict) < len(datasetTypes):
164 # Work out which ones are missing.
165 missing_datasetTypes = set(datasetTypes) - _dict.keys()
167 # Get the known names for comparison.
168 combined_by_name = {k.name: k for k in combined}
170 missing = set()
171 incompatible = {}
172 for datasetType in missing_datasetTypes:
173 # The dataset type is not found. It may not be listed
174 # or it may be that it is there with the same name
175 # but different definition.
176 if datasetType.name in combined_by_name:
177 # This implies some inconsistency in definitions
178 # for connections. If there is support for storage
179 # class conversion we can let it slide.
180 # At this point we do not know
181 # where the inconsistency is but trust that down
182 # stream code will be more explicit about input
183 # vs output incompatibilities.
184 existing = combined_by_name[datasetType.name]
185 convertible_to_existing = existing.is_compatible_with(datasetType)
186 convertible_from_existing = datasetType.is_compatible_with(existing)
187 if convertible_to_existing and convertible_from_existing:
188 _LOG.debug(
189 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
190 datasetType.name,
191 datasetType.storageClass_name,
192 existing.storageClass_name,
193 )
194 _dict[datasetType] = combined[existing]
195 elif convertible_to_existing or convertible_from_existing:
196 # We'd need to refactor a fair amount to recognize
197 # whether this is an error or not, so I'm not going to
198 # bother until we need to do that for other reasons
199 # (it won't be too long).
200 _LOG.info(
201 "Dataset type %s is present with multiple only partially-compatible storage "
202 "classes %s and %s.",
203 datasetType.name,
204 datasetType.storageClass_name,
205 existing.storageClass_name,
206 )
207 _dict[datasetType] = combined[existing]
208 else:
209 incompatible[datasetType] = existing
210 else:
211 missing.add(datasetType)
213 if missing or incompatible:
214 reasons = []
215 if missing:
216 reasons.append(
217 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known "
218 f"types: [{', '.join(d.name for d in combined)}]."
219 )
220 if incompatible:
221 for x, y in incompatible.items():
222 reasons.append(f"{x} incompatible with {y}")
223 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
225 return cls(_dict, universe=first.universe)
227 @property
228 def dimensions(self) -> DimensionGraph:
229 """The union of all dimensions used by all dataset types in this
230 dictionary, including implied dependencies (`DimensionGraph`).
231 """
232 base = self.universe.empty
233 if len(self) == 0:
234 return base
235 return base.union(*[datasetType.dimensions for datasetType in self])
237 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]:
238 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts
239 into a new mapping with `~lsst.daf.butler.DatasetType` keys and
240 `~lsst.daf.butler.DatasetRef` values.
242 This method assumes that each nest contains exactly one item, as is the
243 case for all "init" datasets.
245 Parameters
246 ----------
247 storage_classes : `dict` [ `str`, `str` ]
248 Mapping from dataset type name to the storage class to use for that
249 dataset type. These are typically the storage classes declared
250 for a particular task, which may differ rom the data repository
251 definitions.
253 Returns
254 -------
255 dictionary : `~lsst.daf.butler.NamedKeyDict`
256 Dictionary mapping `~lsst.daf.butler.DatasetType` to
257 `~lsst.daf.butler.DatasetRef`, with both
258 `~lsst.daf.butler.DatasetType` instances and string names usable
259 as keys.
260 """
261 return NamedKeyDict(
262 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()}
263 )
265 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
266 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into
267 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of
268 `~lsst.daf.butler.DatasetRef` values.
270 Parameters
271 ----------
272 storage_classes : `dict` [ `str`, `str` ]
273 Mapping from dataset type name to the storage class to use for that
274 dataset type. These are typically the storage classes declared
275 for a particular task, which may differ rom the data repository
276 definitions.
278 Returns
279 -------
280 dictionary : `~lsst.daf.butler.NamedKeyDict`
281 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of
282 `~lsst.daf.butler.DatasetRef`, with both
283 `~lsst.daf.butler.DatasetType` instances and string names usable
284 as keys.
285 """
286 raise NotImplementedError()
289class _DatasetDict(_DatasetDictBase[dict[DataCoordinate, _RefHolder]]):
290 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested
291 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that
292 type.
293 """
295 @classmethod
296 def fromDatasetTypes(
297 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
298 ) -> _DatasetDict:
299 """Construct a dictionary from a flat iterable of
300 `~lsst.daf.butler.DatasetType` keys.
302 Parameters
303 ----------
304 datasetTypes : `~collections.abc.Iterable` of \
305 `~lsst.daf.butler.DatasetType`
306 DatasetTypes to use as keys for the dict. Values will be empty
307 dictionaries.
308 universe : `~lsst.daf.butler.DimensionUniverse`
309 Universe of all possible dimensions.
311 Returns
312 -------
313 dictionary : `_DatasetDict`
314 A new `_DatasetDict` instance.
315 """
316 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
318 @classmethod
319 def fromSubset(
320 cls,
321 datasetTypes: Collection[DatasetType],
322 first: _DatasetDict,
323 *rest: _DatasetDict,
324 ) -> _DatasetDict:
325 """Return a new dictionary by extracting items corresponding to the
326 given keys from one or more existing dictionaries.
328 Parameters
329 ----------
330 datasetTypes : `~collections.abc.Iterable` of \
331 `~lsst.daf.butler.DatasetType`
332 DatasetTypes to use as keys for the dict. Values will be obtained
333 by lookups against ``first`` and ``rest``.
334 first : `_DatasetDict`
335 Another dictionary from which to extract values.
336 rest
337 Additional dictionaries from which to extract values.
339 Returns
340 -------
341 dictionary : `_DatasetDict`
342 A new dictionary instance.
343 """
344 return cast(_DatasetDict, cls._fromSubset(datasetTypes, first, *rest))
346 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
347 # Docstring inherited.
348 result = {}
349 for dataset_type, holders in self.items():
350 if (
351 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
352 ) != dataset_type.storageClass_name:
353 dataset_type = dataset_type.overrideStorageClass(override)
354 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()]
355 else:
356 refs = [holder.resolved_ref for holder in holders.values()]
357 result[dataset_type] = refs
358 return NamedKeyDict(result)
360 def extract(
361 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
362 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]:
363 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances
364 that match the given `~lsst.daf.butler.DatasetType` and data IDs.
366 Parameters
367 ----------
368 datasetType : `~lsst.daf.butler.DatasetType`
369 Dataset type to match.
370 dataIds : `~collections.abc.Iterable` \
371 [ `~lsst.daf.butler.DataCoordinate` ]
372 Data IDs to match.
374 Returns
375 -------
376 refs : `~collections.abc.Iterator` [ `~lsst.daf.butler.DatasetRef` ]
377 DatasetRef instances for which ``ref.datasetType == datasetType``
378 and ``ref.dataId`` is in ``dataIds``.
379 """
380 refs = self[datasetType]
381 return ((dataId, refs[dataId].ref) for dataId in dataIds)
383 def isdisjoint(self, other: _DatasetDict) -> bool:
384 """Test whether ``self`` and ``other`` have any datasets in common.
386 Datasets are considered in common if they have the same *parent*
387 dataset type name and data ID; storage classes and components are not
388 considered.
389 """
390 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()}
391 for k, v in other.items():
392 parent_name, _ = k.nameAndComponent()
393 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()):
394 return False
395 return True
397 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
398 """Iterate over all DatasetRef instances held by this data structure,
399 assuming that each `_RefHolder` already carries are resolved ref.
400 """
401 for holders_by_data_id in self.values():
402 for holder in holders_by_data_id.values():
403 yield holder.resolved_ref
406class _DatasetDictMulti(_DatasetDictBase[defaultdict[DataCoordinate, list[_RefHolder]]]):
407 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested
408 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that
409 type. Nexted dictionary can contain multiple refs for the same data ID,
410 suitable for use with calibration datasets.
411 """
413 @classmethod
414 def fromDatasetTypes(
415 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
416 ) -> _DatasetDictMulti:
417 """Construct a dictionary from a flat iterable of
418 `~lsst.daf.butler.DatasetType` keys.
420 Parameters
421 ----------
422 datasetTypes : `~collections.abc.Iterable` of \
423 `~lsst.daf.butler.DatasetType`
424 DatasetTypes to use as keys for the dict. Values will be empty
425 dictionaries.
426 universe : `~lsst.daf.butler.DimensionUniverse`
427 Universe of all possible dimensions.
429 Returns
430 -------
431 dictionary : `_DatasetDictMulti`
432 A new `_DatasetDictMulti` instance.
433 """
434 return cls({datasetType: defaultdict(list) for datasetType in datasetTypes}, universe=universe)
436 @classmethod
437 def fromSubset(
438 cls,
439 datasetTypes: Collection[DatasetType],
440 first: _DatasetDictMulti,
441 *rest: _DatasetDictMulti,
442 ) -> _DatasetDictMulti:
443 """Return a new dictionary by extracting items corresponding to the
444 given keys from one or more existing dictionaries.
446 Parameters
447 ----------
448 datasetTypes : `~collections.abc.Iterable` of \
449 `~lsst.daf.butler.DatasetType`
450 DatasetTypes to use as keys for the dict. Values will be obtained
451 by lookups against ``first`` and ``rest``.
452 first : `_DatasetDictMulti`
453 Another dictionary from which to extract values.
454 rest
455 Additional dictionaries from which to extract values.
457 Returns
458 -------
459 dictionary : `_DatasetDictMulti`
460 A new dictionary instance.
461 """
462 return cast(_DatasetDictMulti, cls._fromSubset(datasetTypes, first, *rest))
464 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
465 # Docstring inherited.
466 result = {}
467 for dataset_type, holder_map in self.items():
468 if (
469 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
470 ) != dataset_type.storageClass_name:
471 dataset_type = dataset_type.overrideStorageClass(override)
472 refs = []
473 for holder_list in holder_map.values():
474 refs += [holder.resolved_ref.overrideStorageClass(override) for holder in holder_list]
475 else:
476 refs = []
477 for holder_list in holder_map.values():
478 refs += [holder.resolved_ref for holder in holder_list]
479 result[dataset_type] = refs
480 return NamedKeyDict(result)
482 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
483 """Iterate over all DatasetRef instances held by this data structure,
484 assuming that each `_RefHolder` already carries are resolved ref.
485 """
486 for holders_by_data_id in self.values():
487 for holder_list in holders_by_data_id.values():
488 for holder in holder_list:
489 yield holder.resolved_ref
492class _QuantumScaffolding:
493 """Helper class aggregating information about a `Quantum`, used when
494 constructing a `QuantumGraph`.
496 See `_PipelineScaffolding` for a top-down description of the full
497 scaffolding data structure.
499 Parameters
500 ----------
501 task : _TaskScaffolding
502 Back-reference to the helper object for the `PipelineTask` this quantum
503 represents an execution of.
504 dataId : `~lsst.daf.butler.DataCoordinate`
505 Data ID for this quantum.
506 """
508 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
509 self.task = task
510 self.dataId = dataId
511 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
512 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
513 self.prerequisites = _DatasetDict.fromDatasetTypes(
514 task.prerequisites.keys(), universe=dataId.universe
515 )
517 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
519 def __repr__(self) -> str:
520 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
522 task: _TaskScaffolding
523 """Back-reference to the helper object for the `PipelineTask` this quantum
524 represents an execution of.
525 """
527 dataId: DataCoordinate
528 """Data ID for this quantum.
529 """
531 inputs: _DatasetDict
532 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to
533 this quantum.
535 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty
536 dictionary at construction. Those nested dictionaries are populated
537 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef`
538 instances in `_PipelineScaffolding.connectDataIds`.
539 """
541 outputs: _DatasetDict
542 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this
543 quantum.
544 """
546 prerequisites: _DatasetDict
547 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite
548 inputs to this quantum.
549 """
551 def computeSpatialExtent(self, pixelization: PixelizationABC) -> RangeSet:
552 """Return the spatial extent of this quantum's inputs and outputs in
553 a skypix system.
555 Parameters
556 ----------
557 pixelization : `lsst.sphgeom.PixelizationABC`
558 Pixelization system.
560 Returns
561 -------
562 extent : `lsst.sphgeom.RangeSet`
563 Ranges of sky pixels that touch this quantum's inputs and outputs.
564 """
565 result = RangeSet()
566 for dataset_type, datasets in itertools.chain(self.inputs.items(), self.outputs.items()):
567 if dataset_type.dimensions.spatial:
568 for data_id in datasets:
569 result |= pixelization.envelope(data_id.region)
570 return result
572 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum:
573 """Transform the scaffolding object into a true `Quantum` instance.
575 Parameters
576 ----------
577 datastore_records : `~collections.abc.Mapping` [ `str`, \
578 `~lsst.daf.butler.DatastoreRecordData` ], optional
579 If not `None` then fill datastore records in each generated Quantum
580 using the records from this structure.
582 Returns
583 -------
584 quantum : `Quantum`
585 An actual `Quantum` instance.
586 """
587 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes)
588 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes))
589 # Give the task's Connections class an opportunity to remove some
590 # inputs, or complain if they are unacceptable.
591 # This will raise if one of the check conditions is not met, which is
592 # the intended behavior.
593 # If it raises NotWorkFound, there is a bug in the QG algorithm
594 # or the adjustQuantum is incorrectly trying to make a prerequisite
595 # input behave like a regular input; adjustQuantum should only raise
596 # NoWorkFound if a regular input is missing, and it shouldn't be
597 # possible for us to have generated ``self`` if that's true.
598 helper = AdjustQuantumHelper(
599 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes)
600 )
601 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
602 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes)
603 quantum_records: Mapping[str, DatastoreRecordData] | None = None
604 if datastore_records is not None:
605 quantum_records = {}
606 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
607 input_refs += list(initInputs.values())
608 input_ids = {ref.id for ref in input_refs}
609 for datastore_name, records in datastore_records.items():
610 matching_records = records.subset(input_ids)
611 if matching_records is not None:
612 quantum_records[datastore_name] = matching_records
613 # ignore the types because quantum really can take a sequence of inputs
614 return Quantum(
615 taskName=self.task.taskDef.taskName,
616 taskClass=self.task.taskDef.taskClass,
617 dataId=self.dataId,
618 initInputs=initInputs,
619 inputs=helper.inputs,
620 outputs=helper.outputs,
621 datastore_records=quantum_records,
622 )
625@dataclass
626class _TaskScaffolding:
627 """Helper class aggregating information about a `PipelineTask`, used when
628 constructing a `QuantumGraph`.
630 See `_PipelineScaffolding` for a top-down description of the full
631 scaffolding data structure.
633 Parameters
634 ----------
635 taskDef : `TaskDef`
636 Data structure that identifies the task class and its config.
637 parent : `_PipelineScaffolding`
638 The parent data structure that will hold the instance being
639 constructed.
640 datasetTypes : `TaskDatasetTypes`
641 Data structure that categorizes the dataset types used by this task.
642 """
644 def __init__(
645 self,
646 taskDef: TaskDef,
647 parent: _PipelineScaffolding,
648 datasetTypes: TaskDatasetTypes,
649 ):
650 universe = parent.dimensions.universe
651 self.taskDef = taskDef
652 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
653 assert self.dimensions.issubset(parent.dimensions)
654 # Initialize _DatasetDicts as subsets of the one or two
655 # corresponding dicts in the parent _PipelineScaffolding.
656 self.initInputs = _DatasetDict.fromSubset(
657 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
658 )
659 self.initOutputs = _DatasetDict.fromSubset(
660 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
661 )
662 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
663 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
664 self.prerequisites = _DatasetDictMulti.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
665 self.dataIds: set[DataCoordinate] = set()
666 self.quanta = {}
667 self.storage_classes = {
668 connection.name: connection.storageClass
669 for connection in self.taskDef.connections.allConnections.values()
670 }
671 self.storage_classes[
672 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
673 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS
674 self.storage_classes[
675 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
676 ] = acc.LOG_OUTPUT_STORAGE_CLASS
677 self.storage_classes[
678 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
679 ] = acc.METADATA_OUTPUT_STORAGE_CLASS
681 def __repr__(self) -> str:
682 # Default dataclass-injected __repr__ gets caught in an infinite loop
683 # because of back-references.
684 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
686 taskDef: TaskDef
687 """Data structure that identifies the task class and its config
688 (`TaskDef`).
689 """
691 dimensions: DimensionGraph
692 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
693 """
695 initInputs: _DatasetDict
696 """Dictionary containing information about datasets used to construct this
697 task (`_DatasetDict`).
698 """
700 initOutputs: _DatasetDict
701 """Dictionary containing information about datasets produced as a
702 side-effect of constructing this task (`_DatasetDict`).
703 """
705 inputs: _DatasetDict
706 """Dictionary containing information about datasets used as regular,
707 graph-constraining inputs to this task (`_DatasetDict`).
708 """
710 outputs: _DatasetDict
711 """Dictionary containing information about datasets produced by this task
712 (`_DatasetDict`).
713 """
715 prerequisites: _DatasetDictMulti
716 """Dictionary containing information about input datasets that must be
717 present in the repository before any Pipeline containing this task is run
718 (`_DatasetDictMulti`).
719 """
721 quanta: dict[DataCoordinate, _QuantumScaffolding]
722 """Dictionary mapping data ID to a scaffolding object for the Quantum of
723 this task with that data ID.
724 """
726 storage_classes: dict[str, str]
727 """Mapping from dataset type name to storage class declared by this task.
728 """
730 def makeQuantumSet(
731 self,
732 missing: _DatasetDict,
733 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
734 ) -> set[Quantum]:
735 """Create a `set` of `Quantum` from the information in ``self``.
737 Parameters
738 ----------
739 missing : `_DatasetDict`
740 Input datasets that have not been found.
741 datastore_records : `dict`
742 Record from the datastore to export with quanta.
744 Returns
745 -------
746 nodes : `set` of `Quantum`
747 The `Quantum` elements corresponding to this task.
748 """
749 outputs = set()
750 for q in self.quanta.values():
751 try:
752 tmpQuanta = q.makeQuantum(datastore_records)
753 outputs.add(tmpQuanta)
754 except (NoWorkFound, FileNotFoundError) as exc:
755 if not missing.isdisjoint(q.inputs):
756 # This is a node that is known to be pruned later and
757 # should be left in even though some follow up queries
758 # fail. This allows the pruning to start from this quantum
759 # with known issues, and prune other nodes it touches.
760 inputs = q.inputs.unpackMultiRefs(self.storage_classes)
761 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes))
762 tmpQuantum = Quantum(
763 taskName=q.task.taskDef.taskName,
764 taskClass=q.task.taskDef.taskClass,
765 dataId=q.dataId,
766 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes),
767 inputs=inputs,
768 outputs=q.outputs.unpackMultiRefs(self.storage_classes),
769 )
770 outputs.add(tmpQuantum)
771 else:
772 raise exc
773 return outputs
776class _DatasetIdMaker:
777 """Helper class which generates random dataset UUIDs for unresolved
778 datasets.
779 """
781 def __init__(self, run: str):
782 self.run = run
783 # Cache of dataset refs generated so far.
784 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {}
786 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef:
787 # For components we need their parent dataset ID.
788 if dataset_type.isComponent():
789 parent_type = dataset_type.makeCompositeDatasetType()
790 # Parent should be resolved if this is an existing input, or it
791 # should be in the cache already if it is an intermediate.
792 key = parent_type, data_id
793 if key not in self.resolved:
794 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}")
795 parent_ref = self.resolved[key]
796 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False)
798 key = dataset_type, data_id
799 if (resolved := self.resolved.get(key)) is None:
800 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False)
801 self.resolved[key] = resolved
802 return resolved
804 def resolveDict(
805 self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder], is_output: bool
806 ) -> None:
807 """Resolve all unresolved references in the provided dictionary."""
808 for data_id, holder in refs.items():
809 if holder.ref is None or (is_output and holder.ref.run != self.run):
810 holder.ref = self.resolveRef(holder.dataset_type, data_id)
813@dataclass
814class _PipelineScaffolding:
815 """A helper data structure that organizes the information involved in
816 constructing a `QuantumGraph` for a `Pipeline`.
818 Parameters
819 ----------
820 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
821 Sequence of tasks from which a graph is to be constructed. Must
822 have nested task classes already imported.
823 universe : `~lsst.daf.butler.DimensionUniverse`
824 Universe of all possible dimensions.
826 Notes
827 -----
828 The scaffolding data structure contains nested data structures for both
829 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
830 data structures are shared between the pipeline-level structure (which
831 aggregates all datasets and categorizes them from the perspective of the
832 complete pipeline) and the individual tasks that use them as inputs and
833 outputs.
835 `QuantumGraph` construction proceeds in four steps, with each corresponding
836 to a different `_PipelineScaffolding` method:
838 1. When `_PipelineScaffolding` is constructed, we extract and categorize
839 the DatasetTypes used by the pipeline (delegating to
840 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
841 nested `_TaskScaffolding` and `_DatasetDict` objects.
843 2. In `connectDataIds`, we construct and run the "Big Join Query", which
844 returns related tuples of all dimensions used to identify any regular
845 input, output, and intermediate datasets (not prerequisites). We then
846 iterate over these tuples of related dimensions, identifying the subsets
847 that correspond to distinct data IDs for each task and dataset type,
848 and then create `_QuantumScaffolding` objects.
850 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
851 dataset data IDs previously identified, transforming unresolved
852 DatasetRefs into resolved DatasetRefs where appropriate. We then look
853 up prerequisite datasets for all quanta.
855 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
856 per-task `_QuantumScaffolding` objects.
857 """
859 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry):
860 _LOG.debug("Initializing data structures for QuantumGraph generation.")
861 self.tasks = []
862 # Aggregate and categorize the DatasetTypes in the Pipeline.
863 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
864 # Construct dictionaries that map those DatasetTypes to structures
865 # that will (later) hold additional information about them.
866 for attr in (
867 "initInputs",
868 "initIntermediates",
869 "initOutputs",
870 "inputs",
871 "intermediates",
872 "outputs",
873 ):
874 setattr(
875 self,
876 attr,
877 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
878 )
879 self.prerequisites = _DatasetDictMulti.fromDatasetTypes(
880 datasetTypes.prerequisites, universe=registry.dimensions
881 )
882 self.missing = _DatasetDict(universe=registry.dimensions)
883 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
884 # Aggregate all dimensions for all non-init, non-prerequisite
885 # DatasetTypes. These are the ones we'll include in the big join
886 # query.
887 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
888 # Construct scaffolding nodes for each Task, and add backreferences
889 # to the Task from each DatasetScaffolding node.
890 # Note that there's only one scaffolding node for each DatasetType,
891 # shared by _PipelineScaffolding and all _TaskScaffoldings that
892 # reference it.
893 if isinstance(pipeline, Pipeline):
894 pipeline = pipeline.toExpandedPipeline()
895 self.tasks = [
896 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
897 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values(), strict=True)
898 ]
900 def __repr__(self) -> str:
901 # Default dataclass-injected __repr__ gets caught in an infinite loop
902 # because of back-references.
903 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
905 tasks: list[_TaskScaffolding]
906 """Scaffolding data structures for each task in the pipeline
907 (`list` of `_TaskScaffolding`).
908 """
910 initInputs: _DatasetDict
911 """Datasets consumed but not produced when constructing the tasks in this
912 pipeline (`_DatasetDict`).
913 """
915 initIntermediates: _DatasetDict
916 """Datasets that are both consumed and produced when constructing the tasks
917 in this pipeline (`_DatasetDict`).
918 """
920 initOutputs: _DatasetDict
921 """Datasets produced but not consumed when constructing the tasks in this
922 pipeline (`_DatasetDict`).
923 """
925 inputs: _DatasetDict
926 """Datasets that are consumed but not produced when running this pipeline
927 (`_DatasetDict`).
928 """
930 intermediates: _DatasetDict
931 """Datasets that are both produced and consumed when running this pipeline
932 (`_DatasetDict`).
933 """
935 outputs: _DatasetDict
936 """Datasets produced but not consumed when when running this pipeline
937 (`_DatasetDict`).
938 """
940 prerequisites: _DatasetDictMulti
941 """Datasets that are consumed when running this pipeline and looked up
942 per-Quantum when generating the graph (`_DatasetDictMulti`).
943 """
945 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
946 """Datasets that should be used as constraints in the initial query,
947 according to tasks (`~lsst.daf.butler.NamedValueSet`).
948 """
950 dimensions: DimensionGraph
951 """All dimensions used by any regular input, intermediate, or output
952 (not prerequisite) dataset; the set of dimension used in the "Big Join
953 Query" (`~lsst.daf.butler.DimensionGraph`).
955 This is required to be a superset of all task quantum dimensions.
956 """
958 missing: _DatasetDict
959 """Datasets whose existence was originally predicted but were not
960 actually found.
962 Quanta that require these datasets as inputs will be pruned (recursively)
963 when actually constructing a `QuantumGraph` object.
965 These are currently populated only when the "initial dataset query
966 constraint" does not include all overall-input dataset types, and hence the
967 initial data ID query can include data IDs that it should not.
968 """
970 globalInitOutputs: _DatasetDict | None = None
971 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
972 """
974 @contextmanager
975 def connectDataIds(
976 self,
977 registry: Registry,
978 collections: Any,
979 userQuery: str | None,
980 externalDataId: DataCoordinate,
981 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
982 bind: Mapping[str, Any] | None = None,
983 ) -> Iterator[DataCoordinateQueryResults]:
984 """Query for the data IDs that connect nodes in the `QuantumGraph`.
986 This method populates `_TaskScaffolding.dataIds` and
987 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
989 Parameters
990 ----------
991 registry : `lsst.daf.butler.Registry`
992 Registry for the data repository; used for all data ID queries.
993 collections
994 Expressions representing the collections to search for input
995 datasets. See :ref:`daf_butler_ordered_collection_searches`.
996 userQuery : `str` or `None`
997 User-provided expression to limit the data IDs processed.
998 externalDataId : `~lsst.daf.butler.DataCoordinate`
999 Externally-provided data ID that should be used to restrict the
1000 results, just as if these constraints had been included via ``AND``
1001 in ``userQuery``. This includes (at least) any instrument named
1002 in the pipeline definition.
1003 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1004 The query constraint variant that should be used to constraint the
1005 query based on dataset existance, defaults to
1006 `DatasetQueryConstraintVariant.ALL`.
1007 bind : `~collections.abc.Mapping`, optional
1008 Mapping containing literal values that should be injected into the
1009 ``userQuery`` expression, keyed by the identifiers they replace.
1011 Returns
1012 -------
1013 commonDataIds : \
1014 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
1015 An interface to a database temporary table containing all data IDs
1016 that will appear in this `QuantumGraph`. Returned inside a
1017 context manager, which will drop the temporary table at the end of
1018 the `with` block in which this method is called.
1019 """
1020 _LOG.debug("Building query for data IDs.")
1021 # Initialization datasets always have empty data IDs.
1022 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
1023 for datasetType, refs in itertools.chain(
1024 self.initInputs.items(),
1025 self.initIntermediates.items(),
1026 self.initOutputs.items(),
1027 ):
1028 refs[emptyDataId] = _RefHolder(datasetType)
1029 # Run one big query for the data IDs for task dimensions and regular
1030 # inputs and outputs. We limit the query to only dimensions that are
1031 # associated with the input dataset types, but don't (yet) try to
1032 # obtain the dataset_ids for those inputs.
1033 _LOG.debug(
1034 "Submitting data ID query over dimensions %s and materializing results.",
1035 list(self.dimensions.names),
1036 )
1037 queryArgs: dict[str, Any] = {
1038 "dimensions": self.dimensions,
1039 "where": userQuery,
1040 "dataId": externalDataId,
1041 "bind": bind,
1042 }
1043 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
1044 _LOG.debug(
1045 "Constraining graph query using default of %s.",
1046 list(self.defaultDatasetQueryConstraints.names),
1047 )
1048 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
1049 queryArgs["collections"] = collections
1050 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
1051 _LOG.debug("Not using dataset existence to constrain query.")
1052 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
1053 constraint = set(datasetQueryConstraint)
1054 inputs = {k.name: k for k in self.inputs}
1055 if remainder := constraint.difference(inputs.keys()):
1056 raise ValueError(
1057 f"{remainder} dataset type(s) specified as a graph constraint, but"
1058 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
1059 )
1060 _LOG.debug(f"Constraining graph query using {constraint}")
1061 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
1062 queryArgs["collections"] = collections
1063 else:
1064 raise ValueError(
1065 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
1066 )
1068 if "datasets" in queryArgs:
1069 for i, dataset_type in enumerate(queryArgs["datasets"]):
1070 if dataset_type.isComponent():
1071 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
1073 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
1074 _LOG.debug("Expanding data IDs.")
1075 commonDataIds = commonDataIds.expanded()
1076 _LOG.debug("Iterating over query results to associate quanta with datasets.")
1077 # Iterate over query results, populating data IDs for datasets and
1078 # quanta and then connecting them to each other.
1079 n = -1
1080 for commonDataId in commonDataIds:
1081 # Create DatasetRefs for all DatasetTypes from this result row,
1082 # noting that we might have created some already.
1083 # We remember both those that already existed and those that we
1084 # create now.
1085 refsForRow = {}
1086 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {}
1087 for datasetType, refs in itertools.chain(
1088 self.inputs.items(),
1089 self.intermediates.items(),
1090 self.outputs.items(),
1091 ):
1092 datasetDataId: DataCoordinate | None
1093 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
1094 datasetDataId = commonDataId.subset(datasetType.dimensions)
1095 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
1096 ref_holder = refs.get(datasetDataId)
1097 if ref_holder is None:
1098 ref_holder = _RefHolder(datasetType)
1099 refs[datasetDataId] = ref_holder
1100 refsForRow[datasetType.name] = ref_holder
1101 # Create _QuantumScaffolding objects for all tasks from this
1102 # result row, noting that we might have created some already.
1103 for task in self.tasks:
1104 quantumDataId = commonDataId.subset(task.dimensions)
1105 quantum = task.quanta.get(quantumDataId)
1106 if quantum is None:
1107 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
1108 task.quanta[quantumDataId] = quantum
1109 # Whether this is a new quantum or an existing one, we can
1110 # now associate the DatasetRefs for this row with it. The
1111 # fact that a Quantum data ID and a dataset data ID both
1112 # came from the same result row is what tells us they
1113 # should be associated.
1114 # Many of these associates will be duplicates (because
1115 # another query row that differed from this one only in
1116 # irrelevant dimensions already added them), and we use
1117 # sets to skip.
1118 for datasetType in task.inputs:
1119 dataId = dataIdCacheForRow[datasetType.dimensions]
1120 ref_holder = refsForRow[datasetType.name]
1121 quantum.inputs[datasetType.name][dataId] = ref_holder
1122 for datasetType in task.outputs:
1123 dataId = dataIdCacheForRow[datasetType.dimensions]
1124 ref_holder = refsForRow[datasetType.name]
1125 quantum.outputs[datasetType.name][dataId] = ref_holder
1126 if n < 0:
1127 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
1128 emptiness_explained = False
1129 for message in commonDataIds.explain_no_results():
1130 _LOG.critical(message)
1131 emptiness_explained = True
1132 if not emptiness_explained:
1133 _LOG.critical(
1134 "To reproduce this query for debugging purposes, run "
1135 "Registry.queryDataIds with these arguments:"
1136 )
1137 # We could just repr() the queryArgs dict to get something
1138 # the user could make sense of, but it's friendlier to
1139 # put these args in an easier-to-construct equivalent form
1140 # so they can read it more easily and copy and paste into
1141 # a Python terminal.
1142 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
1143 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
1144 if queryArgs["where"]:
1145 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
1146 if "datasets" in queryArgs:
1147 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
1148 if "collections" in queryArgs:
1149 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
1150 _LOG.debug("Finished processing %d rows from data ID query.", n)
1151 yield commonDataIds
1153 def resolveDatasetRefs(
1154 self,
1155 registry: Registry,
1156 collections: Any,
1157 run: str,
1158 commonDataIds: DataCoordinateQueryResults,
1159 *,
1160 skipExistingIn: Any = None,
1161 clobberOutputs: bool = True,
1162 constrainedByAllDatasets: bool = True,
1163 ) -> None:
1164 """Perform follow up queries for each dataset data ID produced in
1165 `fillDataIds`.
1167 This method populates `_DatasetScaffolding.refs` (except for those in
1168 `prerequisites`).
1170 Parameters
1171 ----------
1172 registry : `lsst.daf.butler.Registry`
1173 Registry for the data repository; used for all data ID queries.
1174 collections
1175 Expressions representing the collections to search for input
1176 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1177 run : `str`
1178 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1179 output datasets, if it already exists.
1180 commonDataIds : \
1181 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
1182 Result of a previous call to `connectDataIds`.
1183 skipExistingIn
1184 Expressions representing the collections to search for existing
1185 output datasets that should be skipped. See
1186 :ref:`daf_butler_ordered_collection_searches` for allowed types.
1187 `None` or empty string/sequence disables skipping.
1188 clobberOutputs : `bool`, optional
1189 If `True` (default), allow quanta to created even if outputs exist;
1190 this requires the same behavior behavior to be enabled when
1191 executing. If ``skipExistingIn`` is not `None`, completed quanta
1192 (those with metadata, or all outputs if there is no metadata
1193 dataset configured) will be skipped rather than clobbered.
1194 constrainedByAllDatasets : `bool`, optional
1195 Indicates if the commonDataIds were generated with a constraint on
1196 all dataset types.
1198 Raises
1199 ------
1200 OutputExistsError
1201 Raised if an output dataset already exists in the output run
1202 and ``skipExistingIn`` does not include output run, or if only
1203 some outputs are present and ``clobberOutputs`` is `False`.
1204 """
1205 # Run may be provided but it does not have to exist, in that case we
1206 # use it for resolving references but don't check it for existing refs.
1207 run_exists = False
1208 if run:
1209 with contextlib.suppress(MissingCollectionError):
1210 run_exists = bool(registry.queryCollections(run))
1212 skip_collections_wildcard: CollectionWildcard | None = None
1213 skipExistingInRun = False
1214 if skipExistingIn:
1215 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
1216 if run_exists:
1217 # as optimization check in the explicit list of names first
1218 skipExistingInRun = run in skip_collections_wildcard.strings
1219 if not skipExistingInRun:
1220 # need to flatten it and check again
1221 skipExistingInRun = run in registry.queryCollections(
1222 skipExistingIn,
1223 collectionTypes=CollectionType.RUN,
1224 )
1226 idMaker = _DatasetIdMaker(run)
1228 resolvedRefQueryResults: Iterable[DatasetRef]
1230 # Updating constrainedByAllDatasets here is not ideal, but we have a
1231 # few different code paths that each transfer different pieces of
1232 # information about what dataset query constraints were applied here,
1233 # and none of them has the complete picture until we get here. We're
1234 # long overdue for a QG generation rewrite that will make this go away
1235 # entirely anyway.
1236 constrainedByAllDatasets = (
1237 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
1238 )
1240 # Look up [init] intermediate and output datasets in the output
1241 # collection, if there is an output collection.
1242 if run_exists or skip_collections_wildcard is not None:
1243 for datasetType, refs in itertools.chain(
1244 self.initIntermediates.items(),
1245 self.initOutputs.items(),
1246 self.intermediates.items(),
1247 self.outputs.items(),
1248 ):
1249 _LOG.debug(
1250 "Resolving %d datasets for intermediate and/or output dataset %s.",
1251 len(refs),
1252 datasetType.name,
1253 )
1254 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
1255 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
1256 # TODO: this assert incorrectly bans component inputs;
1257 # investigate on DM-33027.
1258 # assert not datasetType.isComponent(), \
1259 # "Output datasets cannot be components."
1260 #
1261 # Instead we have to handle them manually to avoid a
1262 # deprecation warning, but it is at least confusing and
1263 # possibly a bug for components to appear here at all.
1264 if datasetType.isComponent():
1265 parent_dataset_type = datasetType.makeCompositeDatasetType()
1266 component = datasetType.component()
1267 else:
1268 parent_dataset_type = datasetType
1269 component = None
1271 # look at RUN collection first
1272 if run_exists:
1273 try:
1274 resolvedRefQueryResults = subset.findDatasets(
1275 parent_dataset_type, collections=run, findFirst=True
1276 )
1277 except MissingDatasetTypeError:
1278 resolvedRefQueryResults = []
1279 for resolvedRef in resolvedRefQueryResults:
1280 # TODO: we could easily support per-DatasetType
1281 # skipExisting and I could imagine that being useful -
1282 # it's probably required in order to support writing
1283 # initOutputs before QuantumGraph generation.
1284 assert resolvedRef.dataId in refs
1285 if not (skipExistingInRun or isInit or clobberOutputs):
1286 raise OutputExistsError(
1287 f"Output dataset {datasetType.name} already exists in "
1288 f"output RUN collection '{run}' with data ID"
1289 f" {resolvedRef.dataId}."
1290 )
1291 # To resolve all outputs we have to remember existing
1292 # ones to avoid generating new dataset IDs for them.
1293 refs[resolvedRef.dataId].ref = (
1294 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1295 )
1297 # And check skipExistingIn too, if RUN collection is in
1298 # it is handled above
1299 if skip_collections_wildcard is not None:
1300 try:
1301 resolvedRefQueryResults = subset.findDatasets(
1302 parent_dataset_type,
1303 collections=skip_collections_wildcard,
1304 findFirst=True,
1305 )
1306 except MissingDatasetTypeError:
1307 resolvedRefQueryResults = []
1308 for resolvedRef in resolvedRefQueryResults:
1309 if resolvedRef.dataId not in refs:
1310 continue
1311 refs[resolvedRef.dataId].ref = (
1312 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1313 )
1315 # Look up input and initInput datasets in the input collection(s). We
1316 # accumulate datasets in self.missing, if the common data IDs were not
1317 # constrained on dataset type existence.
1318 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1319 _LOG.debug(
1320 "Resolving %d datasets for input dataset %s.",
1321 len(refs),
1322 datasetType.name,
1323 )
1324 if datasetType.isComponent():
1325 parent_dataset_type = datasetType.makeCompositeDatasetType()
1326 component = datasetType.component()
1327 else:
1328 parent_dataset_type = datasetType
1329 component = None
1330 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {}
1331 try:
1332 resolvedRefQueryResults = commonDataIds.subset(
1333 datasetType.dimensions, unique=True
1334 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1335 except MissingDatasetTypeError:
1336 resolvedRefQueryResults = []
1337 dataIdsNotFoundYet = set(refs.keys())
1338 for resolvedRef in resolvedRefQueryResults:
1339 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1340 if resolvedRef.dataId not in refs:
1341 continue
1342 refs[resolvedRef.dataId].ref = (
1343 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1344 )
1345 if dataIdsNotFoundYet:
1346 if constrainedByAllDatasets:
1347 raise RuntimeError(
1348 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1349 f"'{datasetType.name}' was/were present in a previous "
1350 "query, but could not be found now. "
1351 "This is either a logic bug in QuantumGraph generation "
1352 "or the input collections have been modified since "
1353 "QuantumGraph generation began."
1354 )
1355 elif not datasetType.dimensions:
1356 raise RuntimeError(
1357 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1358 f"collections {collections}."
1359 )
1360 else:
1361 # If the common dataIds were not constrained using all the
1362 # input dataset types, it is possible that some data ids
1363 # found don't correspond to existing datasets. Mark these
1364 # for later pruning from the quantum graph.
1365 for k in dataIdsNotFoundYet:
1366 missing_for_dataset_type[k] = refs[k]
1367 if missing_for_dataset_type:
1368 self.missing[datasetType] = missing_for_dataset_type
1370 # Resolve the missing refs, just so they look like all of the others;
1371 # in the end other code will make sure they never appear in the QG.
1372 for dataset_type, refDict in self.missing.items():
1373 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1375 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1376 # replacing the unresolved refs there, and then look up prerequisites.
1377 for task in self.tasks:
1378 _LOG.debug(
1379 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1380 len(task.quanta),
1381 task.taskDef.label,
1382 )
1383 # The way iterConnections is designed makes it impossible to
1384 # annotate precisely enough to satisfy MyPy here.
1385 lookupFunctions = {
1386 c.name: c.lookupFunction # type: ignore
1387 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1388 if c.lookupFunction is not None # type: ignore
1389 }
1390 dataIdsFailed = []
1391 dataIdsSucceeded = []
1392 for quantum in task.quanta.values():
1393 # Process outputs datasets only if skipExistingIn is not None
1394 # or there is a run to look for outputs in and clobberOutputs
1395 # is True. Note that if skipExistingIn is None, any output
1396 # datasets that already exist would have already caused an
1397 # exception to be raised.
1398 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1399 resolvedRefs = []
1400 unresolvedDataIds = []
1401 haveMetadata = False
1402 for datasetType, originalRefs in quantum.outputs.items():
1403 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()):
1404 if ref is not None:
1405 resolvedRefs.append(ref)
1406 originalRefs[dataId].ref = ref
1407 if datasetType.name == task.taskDef.metadataDatasetName:
1408 haveMetadata = True
1409 else:
1410 unresolvedDataIds.append((datasetType, dataId))
1411 if resolvedRefs:
1412 if haveMetadata or not unresolvedDataIds:
1413 dataIdsSucceeded.append(quantum.dataId)
1414 if skip_collections_wildcard is not None:
1415 continue
1416 else:
1417 dataIdsFailed.append(quantum.dataId)
1418 if not clobberOutputs and run_exists:
1419 raise OutputExistsError(
1420 f"Quantum {quantum.dataId} of task with label "
1421 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1422 f"({resolvedRefs}) "
1423 f"and others that don't ({unresolvedDataIds}), with no metadata output, "
1424 "and clobbering outputs was not enabled."
1425 )
1426 # Update the input DatasetRefs to the resolved ones we already
1427 # searched for.
1428 for datasetType, input_refs in quantum.inputs.items():
1429 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()):
1430 input_refs[data_id].ref = ref
1431 # Look up prerequisite datasets in the input collection(s).
1432 # These may have dimensions that extend beyond those we queried
1433 # for originally, because we want to permit those data ID
1434 # values to differ across quanta and dataset types.
1435 for datasetType in task.prerequisites:
1436 if datasetType.isComponent():
1437 parent_dataset_type = datasetType.makeCompositeDatasetType()
1438 component = datasetType.component()
1439 else:
1440 parent_dataset_type = datasetType
1441 component = None
1442 lookupFunction = lookupFunctions.get(datasetType.name)
1443 if lookupFunction is not None:
1444 # PipelineTask has provided its own function to do the
1445 # lookup. This always takes precedence.
1446 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1447 elif (
1448 datasetType.isCalibration()
1449 and datasetType.dimensions <= quantum.dataId.graph
1450 and quantum.dataId.graph.temporal
1451 ):
1452 # This is a master calibration lookup, which we have to
1453 # handle specially because the query system can't do a
1454 # temporal join on a non-dimension-based timespan yet.
1455 timespan = quantum.dataId.timespan
1456 try:
1457 prereq_ref = registry.findDataset(
1458 parent_dataset_type,
1459 quantum.dataId,
1460 collections=collections,
1461 timespan=timespan,
1462 )
1463 if prereq_ref is not None:
1464 if component is not None:
1465 prereq_ref = prereq_ref.makeComponentRef(component)
1466 prereq_refs = [prereq_ref]
1467 else:
1468 prereq_refs = []
1469 except (KeyError, MissingDatasetTypeError):
1470 # This dataset type is not present in the registry,
1471 # which just means there are no datasets here.
1472 prereq_refs = []
1473 else:
1474 where = ""
1475 bind: dict[str, Any] = {}
1476 if not quantum.dataId.graph.spatial:
1477 # This has skypix dimensions (probably a reference
1478 # catalog), but the quantum's data is not spatial
1479 # (it's probably a full-survey sequence point).
1480 # Try to limit the spatial extent to the union of
1481 # the spatial extent of the inputs and outputs.
1482 for dimension in datasetType.dimensions:
1483 if isinstance(dimension, SkyPixDimension):
1484 extent = quantum.computeSpatialExtent(dimension.pixelization)
1485 pixels: list[int] = []
1486 for begin, end in extent:
1487 pixels.extend(range(begin, end))
1488 if not pixels:
1489 _LOG.warning(
1490 "Prerequisite input %r to task %r may be unbounded.",
1491 datasetType.name,
1492 quantum.task.taskDef.label,
1493 )
1494 else:
1495 bind["quantum_extent"] = pixels
1496 where = f"{dimension.name} IN (quantum_extent)"
1497 break
1498 # Most general case.
1499 prereq_refs = [
1500 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1501 for prereq_ref in registry.queryDatasets(
1502 parent_dataset_type,
1503 collections=collections,
1504 dataId=quantum.dataId,
1505 findFirst=True,
1506 where=where,
1507 bind=bind,
1508 ).expanded()
1509 ]
1511 for ref in prereq_refs:
1512 if ref is not None:
1513 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1514 task.prerequisites[datasetType][ref.dataId].append(_RefHolder(datasetType, ref))
1516 # Resolve all quantum inputs and outputs.
1517 for dataset_type, refDict in quantum.inputs.items():
1518 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1519 for dataset_type, refDict in quantum.outputs.items():
1520 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1522 # Resolve task initInputs and initOutputs.
1523 for dataset_type, refDict in task.initInputs.items():
1524 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1525 for dataset_type, refDict in task.initOutputs.items():
1526 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1528 # Actually remove any quanta that we decided to skip above.
1529 if dataIdsSucceeded:
1530 if skip_collections_wildcard is not None:
1531 _LOG.debug(
1532 "Pruning successful %d quanta for task with label '%s' because all of their "
1533 "outputs exist or metadata was written successfully.",
1534 len(dataIdsSucceeded),
1535 task.taskDef.label,
1536 )
1537 for dataId in dataIdsSucceeded:
1538 del task.quanta[dataId]
1539 elif clobberOutputs and run_exists:
1540 _LOG.info(
1541 "Found %d successful quanta for task with label '%s' "
1542 "that will need to be clobbered during execution.",
1543 len(dataIdsSucceeded),
1544 task.taskDef.label,
1545 )
1546 if dataIdsFailed:
1547 if clobberOutputs and run_exists:
1548 _LOG.info(
1549 "Found %d failed/incomplete quanta for task with label '%s' "
1550 "that will need to be clobbered during execution.",
1551 len(dataIdsFailed),
1552 task.taskDef.label,
1553 )
1555 # Collect initOutputs that do not belong to any task.
1556 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1557 for task in self.tasks:
1558 global_dataset_types -= set(task.initOutputs)
1559 if global_dataset_types:
1560 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1561 for dataset_type, refDict in self.globalInitOutputs.items():
1562 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1564 def makeQuantumGraph(
1565 self,
1566 registry: Registry,
1567 metadata: Mapping[str, Any] | None = None,
1568 datastore: Datastore | None = None,
1569 ) -> QuantumGraph:
1570 """Create a `QuantumGraph` from the quanta already present in
1571 the scaffolding data structure.
1573 Parameters
1574 ----------
1575 registry : `lsst.daf.butler.Registry`
1576 Registry for the data repository; used for all data ID queries.
1577 metadata : `~collections.abc.Mapping` of `str` to primitives, optional
1578 This is an optional parameter of extra data to carry with the
1579 graph. Entries in this mapping should be able to be serialized in
1580 JSON.
1581 datastore : `~lsst.daf.butler.Datastore`, optional
1582 If not `None` then fill datastore records in each generated
1583 Quantum.
1585 Returns
1586 -------
1587 graph : `QuantumGraph`
1588 The full `QuantumGraph`.
1589 """
1590 datastore_records: Mapping[str, DatastoreRecordData] | None = None
1591 if datastore is not None:
1592 datastore_records = datastore.export_records(
1593 itertools.chain(
1594 self.inputs.iter_resolved_refs(),
1595 self.initInputs.iter_resolved_refs(),
1596 self.prerequisites.iter_resolved_refs(),
1597 )
1598 )
1600 graphInput: dict[TaskDef, set[Quantum]] = {}
1601 for task in self.tasks:
1602 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records)
1603 graphInput[task.taskDef] = qset
1605 taskInitInputs = {
1606 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values()
1607 for task in self.tasks
1608 }
1609 taskInitOutputs = {
1610 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values()
1611 for task in self.tasks
1612 }
1614 globalInitOutputs: list[DatasetRef] = []
1615 if self.globalInitOutputs is not None:
1616 for refs_dict in self.globalInitOutputs.values():
1617 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values())
1619 graph = QuantumGraph(
1620 graphInput,
1621 metadata=metadata,
1622 pruneRefs=list(self.missing.iter_resolved_refs()),
1623 universe=self.dimensions.universe,
1624 initInputs=taskInitInputs,
1625 initOutputs=taskInitOutputs,
1626 globalInitOutputs=globalInitOutputs,
1627 registryDatasetTypes=self._get_registry_dataset_types(registry),
1628 )
1629 return graph
1631 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1632 """Make a list of all dataset types used by a graph as defined in
1633 registry.
1634 """
1635 chain: list[_DatasetDict | _DatasetDictMulti] = [
1636 self.initInputs,
1637 self.initIntermediates,
1638 self.initOutputs,
1639 self.inputs,
1640 self.intermediates,
1641 self.outputs,
1642 self.prerequisites,
1643 ]
1644 if self.globalInitOutputs is not None:
1645 chain.append(self.globalInitOutputs)
1647 # Collect names of all dataset types.
1648 all_names: set[str] = {dstype.name for dstype in itertools.chain(*chain)}
1649 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1651 # Check for types that do not exist in registry yet:
1652 # - inputs must exist
1653 # - intermediates and outputs may not exist, but there must not be
1654 # more than one definition (e.g. differing in storage class)
1655 # - prerequisites may not exist, treat it the same as outputs here
1656 for dstype in itertools.chain(self.initInputs, self.inputs):
1657 if dstype.name not in dataset_types:
1658 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1660 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1661 chain = [
1662 self.initIntermediates,
1663 self.initOutputs,
1664 self.intermediates,
1665 self.outputs,
1666 self.prerequisites,
1667 ]
1668 if self.globalInitOutputs is not None:
1669 chain.append(self.globalInitOutputs)
1670 for dstype in itertools.chain(*chain):
1671 if dstype.name not in dataset_types:
1672 new_outputs[dstype.name].add(dstype)
1673 for name, dstypes in new_outputs.items():
1674 if len(dstypes) > 1:
1675 raise ValueError(
1676 "Pipeline contains multiple definitions for a dataset type "
1677 f"which is not defined in registry yet: {dstypes}"
1678 )
1679 elif len(dstypes) == 1:
1680 dataset_types[name] = dstypes.pop()
1682 return dataset_types.values()
1685# ------------------------
1686# Exported definitions --
1687# ------------------------
1690class GraphBuilderError(Exception):
1691 """Base class for exceptions generated by graph builder."""
1693 pass
1696class OutputExistsError(GraphBuilderError):
1697 """Exception generated when output datasets already exist."""
1699 pass
1702class PrerequisiteMissingError(GraphBuilderError):
1703 """Exception generated when a prerequisite dataset does not exist."""
1705 pass
1708class GraphBuilder:
1709 """GraphBuilder class is responsible for building task execution graph from
1710 a Pipeline.
1712 Parameters
1713 ----------
1714 registry : `~lsst.daf.butler.Registry`
1715 Data butler instance.
1716 skipExistingIn
1717 Expressions representing the collections to search for existing
1718 output datasets that should be skipped. See
1719 :ref:`daf_butler_ordered_collection_searches`.
1720 clobberOutputs : `bool`, optional
1721 If `True` (default), allow quanta to created even if partial outputs
1722 exist; this requires the same behavior behavior to be enabled when
1723 executing.
1724 datastore : `~lsst.daf.butler.Datastore`, optional
1725 If not `None` then fill datastore records in each generated Quantum.
1726 """
1728 def __init__(
1729 self,
1730 registry: Registry,
1731 skipExistingIn: Any = None,
1732 clobberOutputs: bool = True,
1733 datastore: Datastore | None = None,
1734 ):
1735 self.registry = registry
1736 self.dimensions = registry.dimensions
1737 self.skipExistingIn = skipExistingIn
1738 self.clobberOutputs = clobberOutputs
1739 self.datastore = datastore
1741 def makeGraph(
1742 self,
1743 pipeline: Pipeline | Iterable[TaskDef],
1744 collections: Any,
1745 run: str,
1746 userQuery: str | None,
1747 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1748 metadata: Mapping[str, Any] | None = None,
1749 bind: Mapping[str, Any] | None = None,
1750 dataId: DataCoordinate | None = None,
1751 ) -> QuantumGraph:
1752 """Create execution graph for a pipeline.
1754 Parameters
1755 ----------
1756 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
1757 Pipeline definition, task names/classes and their configs.
1758 collections
1759 Expressions representing the collections to search for input
1760 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1761 run : `str`
1762 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1763 output datasets. Collection does not have to exist and it will be
1764 created when graph is executed.
1765 userQuery : `str`
1766 String which defines user-defined selection for registry, should be
1767 empty or `None` if there is no restrictions on data selection.
1768 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1769 The query constraint variant that should be used to constraint the
1770 query based on dataset existance, defaults to
1771 `DatasetQueryConstraintVariant.ALL`.
1772 metadata : Optional Mapping of `str` to primitives
1773 This is an optional parameter of extra data to carry with the
1774 graph. Entries in this mapping should be able to be serialized in
1775 JSON.
1776 bind : `~collections.abc.Mapping`, optional
1777 Mapping containing literal values that should be injected into the
1778 ``userQuery`` expression, keyed by the identifiers they replace.
1779 dataId : `lsst.daf.butler.DataCoordinate`, optional
1780 Data ID that should also be included in the query constraint.
1782 Returns
1783 -------
1784 graph : `QuantumGraph`
1786 Raises
1787 ------
1788 UserExpressionError
1789 Raised when user expression cannot be parsed.
1790 OutputExistsError
1791 Raised when output datasets already exist.
1792 Exception
1793 Other exceptions types may be raised by underlying registry
1794 classes.
1795 """
1796 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1797 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1798 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1799 if dataId is None:
1800 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1801 if isinstance(pipeline, Pipeline):
1802 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId)
1803 with scaffolding.connectDataIds(
1804 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1805 ) as commonDataIds:
1806 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1807 scaffolding.resolveDatasetRefs(
1808 self.registry,
1809 collections,
1810 run,
1811 commonDataIds,
1812 skipExistingIn=self.skipExistingIn,
1813 clobberOutputs=self.clobberOutputs,
1814 constrainedByAllDatasets=condition,
1815 )
1816 return scaffolding.makeQuantumGraph(
1817 registry=self.registry, metadata=metadata, datastore=self.datastore
1818 )