Coverage for python/lsst/pipe/base/graphBuilder.py: 16%
546 statements
coverage.py v7.2.7, created at 2023-06-15 02:49 -0700
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap, defaultdict
34from collections.abc import Collection, Iterable, Iterator, Mapping
35from contextlib import contextmanager
36from dataclasses import dataclass
37from typing import Any
39from lsst.daf.butler import (
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
57# -----------------------------
58# Imports for other modules --
59# -----------------------------
60from . import automatic_connection_constants as acc
61from ._datasetQueryConstraints import DatasetQueryConstraintVariant
62from ._status import NoWorkFound
63from .connections import AdjustQuantumHelper, iterConnections
64from .graph import QuantumGraph
65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
67# ----------------------------------
68# Local non-exported definitions --
69# ----------------------------------
71_LOG = logging.getLogger(__name__)
74@dataclass
75class _RefHolder:
76 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future
77 resolved reference.
79 As we have eliminated unresolved `~lsst.daf.butler.DatasetRef`\s, we now use
80 `None` to represent a reference that is yet to be resolved. Information
81 about the corresponding dataset type and data coordinate is stored in
82 the `_DatasetDict` mapping.
83 """
85 dataset_type: DatasetType
86 """Dataset type of the dataset to be created later. I need to store it here
87 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared
88 between different compatible dataset types."""
90 ref: DatasetRef | None = None
91 """Dataset reference, initially `None`, created when all datasets are
92 resolved.
93 """
95 @property
96 def resolved_ref(self) -> DatasetRef:
97 """Access resolved reference, should only be called after the
98 reference is set (`~lsst.daf.butler.DatasetRef`).
99 """
100 assert self.ref is not None, "Dataset reference is not set."
101 return self.ref
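# Illustrative sketch of the `_RefHolder` lifecycle (commented out; the dataset
# type, data ID, and run name below are hypothetical):
#
#     holder = _RefHolder(calexp_type)          # placeholder; holder.ref is None
#     holder.ref = DatasetRef(calexp_type, data_id, run="u/example/run")
#     ref = holder.resolved_ref                 # now safe to access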
104class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]):
105 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested
106 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that
107 type.
109 Parameters
110 ----------
111 args
112 Positional arguments are forwarded to the `dict` constructor.
113 universe : `~lsst.daf.butler.DimensionUniverse`
114 Universe of all possible dimensions.
115 """
117 def __init__(self, *args: Any, universe: DimensionUniverse):
118 super().__init__(*args)
119 self.universe = universe
121 @classmethod
122 def fromDatasetTypes(
123 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
124 ) -> _DatasetDict:
125 """Construct a dictionary from a flat iterable of
126 `~lsst.daf.butler.DatasetType` keys.
128 Parameters
129 ----------
130 datasetTypes : `~collections.abc.Iterable` of \
131 `~lsst.daf.butler.DatasetType`
132 DatasetTypes to use as keys for the dict. Values will be empty
133 dictionaries.
134 universe : `~lsst.daf.butler.DimensionUniverse`
135 Universe of all possible dimensions.
137 Returns
138 -------
139 dictionary : `_DatasetDict`
140 A new `_DatasetDict` instance.
141 """
142 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
144 @classmethod
145 def fromSubset(
146 cls,
147 datasetTypes: Collection[DatasetType],
148 first: _DatasetDict,
149 *rest: _DatasetDict,
150 ) -> _DatasetDict:
151 """Return a new dictionary by extracting items corresponding to the
152 given keys from one or more existing dictionaries.
154 Parameters
155 ----------
156 datasetTypes : `~collections.abc.Iterable` of \
157 `~lsst.daf.butler.DatasetType`
158 DatasetTypes to use as keys for the dict. Values will be obtained
159 by lookups against ``first`` and ``rest``.
160 first : `_DatasetDict`
161 Another dictionary from which to extract values.
162 rest
163 Additional dictionaries from which to extract values.
165 Returns
166 -------
167 dictionary : `_DatasetDict`
168 A new dictionary instance.
169 """
170 combined = ChainMap(first, *rest)
172 # Dataset types known to match immediately can be processed
173 # without checks.
174 matches = combined.keys() & set(datasetTypes)
175 _dict = {k: combined[k] for k in matches}
177 if len(_dict) < len(datasetTypes):
178 # Work out which ones are missing.
179 missing_datasetTypes = set(datasetTypes) - _dict.keys()
181 # Get the known names for comparison.
182 combined_by_name = {k.name: k for k in combined}
184 missing = set()
185 incompatible = {}
186 for datasetType in missing_datasetTypes:
187 # The dataset type is not found. It may not be listed
188 # at all, or it may be present with the same name
189 # but a different definition.
190 if datasetType.name in combined_by_name:
191 # This implies some inconsistency in definitions
192 # for connections. If there is support for storage
193 # class conversion we can let it slide.
194 # At this point we do not know
195 # where the inconsistency is, but trust that
196 # downstream code will be more explicit about input
197 # vs. output incompatibilities.
198 existing = combined_by_name[datasetType.name]
199 convertible_to_existing = existing.is_compatible_with(datasetType)
200 convertible_from_existing = datasetType.is_compatible_with(existing)
201 if convertible_to_existing and convertible_from_existing:
202 _LOG.debug(
203 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
204 datasetType.name,
205 datasetType.storageClass_name,
206 existing.storageClass_name,
207 )
208 _dict[datasetType] = combined[existing]
209 elif convertible_to_existing or convertible_from_existing:
210 # We'd need to refactor a fair amount to recognize
211 # whether this is an error or not, so I'm not going to
212 # bother until we need to do that for other reasons
213 # (it won't be too long).
214 _LOG.info(
215 "Dataset type %s is present with multiple only partially-compatible storage "
216 "classes %s and %s.",
217 datasetType.name,
218 datasetType.storageClass_name,
219 existing.storageClass_name,
220 )
221 _dict[datasetType] = combined[existing]
222 else:
223 incompatible[datasetType] = existing
224 else:
225 missing.add(datasetType)
227 if missing or incompatible:
228 reasons = []
229 if missing:
230 reasons.append(
231 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known "
232 f"types: [{', '.join(d.name for d in combined)}]."
233 )
234 if incompatible:
235 for x, y in incompatible.items():
236 reasons.append(f"{x} incompatible with {y}")
237 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
239 return cls(_dict, universe=first.universe)
241 @property
242 def dimensions(self) -> DimensionGraph:
243 """The union of all dimensions used by all dataset types in this
244 dictionary, including implied dependencies (`DimensionGraph`).
245 """
246 base = self.universe.empty
247 if len(self) == 0:
248 return base
249 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
251 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]:
252 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts
253 into a new mapping with `~lsst.daf.butler.DatasetType` keys and
254 `~lsst.daf.butler.DatasetRef` values.
256 This method assumes that each nested dict contains exactly one item, as is the
257 case for all "init" datasets.
259 Parameters
260 ----------
261 storage_classes : `dict` [ `str`, `str` ]
262 Mapping from dataset type name to the storage class to use for that
263 dataset type. These are typically the storage classes declared
264 for a particular task, which may differ from the data repository
265 definitions.
267 Returns
268 -------
269 dictionary : `~lsst.daf.butler.NamedKeyDict`
270 Dictionary mapping `~lsst.daf.butler.DatasetType` to
271 `~lsst.daf.butler.DatasetRef`, with both
272 `~lsst.daf.butler.DatasetType` instances and string names usable
273 as keys.
274 """
275 return NamedKeyDict(
276 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()}
277 )
279 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
280 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into
281 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of
282 `~lsst.daf.butler.DatasetRef` values.
284 Parameters
285 ----------
286 storage_classes : `dict` [ `str`, `str` ]
287 Mapping from dataset type name to the storage class to use for that
288 dataset type. These are typically the storage classes declared
289 for a particular task, which may differ from the data repository
290 definitions.
292 Returns
293 -------
294 dictionary : `~lsst.daf.butler.NamedKeyDict`
295 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of
296 `~lsst.daf.butler.DatasetRef`, with both
297 `~lsst.daf.butler.DatasetType` instances and string names usable
298 as keys.
299 """
300 result = {}
301 for dataset_type, holders in self.items():
302 if (
303 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
304 ) != dataset_type.storageClass_name:
305 dataset_type = dataset_type.overrideStorageClass(override)
306 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()]
307 else:
308 refs = [holder.resolved_ref for holder in holders.values()]
309 result[dataset_type] = refs
310 return NamedKeyDict(result)
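# Commented usage sketch for `unpackMultiRefs` (the dataset type name and
# storage class override below are hypothetical): a task that declares "src"
# with a different storage class than the repository gets refs converted
# accordingly.
#
#     refs_by_type = quantum_scaffolding.inputs.unpackMultiRefs({"src": "DataFrame"})
#     for dataset_type, refs in refs_by_type.items():
#         print(dataset_type.name, dataset_type.storageClass_name, len(refs))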
312 def extract(
313 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
314 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]:
315 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances
316 that match the given `~lsst.daf.butler.DatasetType` and data IDs.
318 Parameters
319 ----------
320 datasetType : `~lsst.daf.butler.DatasetType`
321 Dataset type to match.
322 dataIds : `~collections.abc.Iterable` \
323 [ `~lsst.daf.butler.DataCoordinate` ]
324 Data IDs to match.
326 Returns
327 -------
328 refs : `~collections.abc.Iterator` [ `tuple` ]
329 Tuples of ``(dataId, ref)`` for which ``ref.datasetType == datasetType``
330 and ``ref.dataId`` is in ``dataIds``; ``ref`` may be `None` if unresolved.
331 """
332 refs = self[datasetType]
333 return ((dataId, refs[dataId].ref) for dataId in dataIds)
335 def isdisjoint(self, other: _DatasetDict) -> bool:
336 """Test whether ``self`` and ``other`` have any datasets in common.
338 Datasets are considered in common if they have the same *parent*
339 dataset type name and data ID; storage classes and components are not
340 considered.
341 """
342 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()}
343 for k, v in other.items():
344 parent_name, _ = k.nameAndComponent()
345 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()):
346 return False
347 return True
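# Commented example with hypothetical names: a dict keyed by "calexp.wcs" and
# one keyed by "calexp" are *not* disjoint if they share a data ID, because
# both keys map to the parent dataset type name "calexp".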
349 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
350 """Iterate over all DatasetRef instances held by this data structure,
351 assuming that each `_RefHolder` already carries a resolved ref.
352 """
353 for holders_by_data_id in self.values():
354 for holder in holders_by_data_id.values():
355 yield holder.resolved_ref
358class _QuantumScaffolding:
359 """Helper class aggregating information about a `Quantum`, used when
360 constructing a `QuantumGraph`.
362 See `_PipelineScaffolding` for a top-down description of the full
363 scaffolding data structure.
365 Parameters
366 ----------
367 task : _TaskScaffolding
368 Back-reference to the helper object for the `PipelineTask` this quantum
369 represents an execution of.
370 dataId : `~lsst.daf.butler.DataCoordinate`
371 Data ID for this quantum.
372 """
374 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
375 self.task = task
376 self.dataId = dataId
377 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
378 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
379 self.prerequisites = _DatasetDict.fromDatasetTypes(
380 task.prerequisites.keys(), universe=dataId.universe
381 )
383 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
385 def __repr__(self) -> str:
386 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
388 task: _TaskScaffolding
389 """Back-reference to the helper object for the `PipelineTask` this quantum
390 represents an execution of.
391 """
393 dataId: DataCoordinate
394 """Data ID for this quantum.
395 """
397 inputs: _DatasetDict
398 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to
399 this quantum.
401 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty
402 dictionary at construction. Those nested dictionaries are populated
403 (with data IDs as keys) with `_RefHolder` placeholders for the
404 `~lsst.daf.butler.DatasetRef` instances in `_PipelineScaffolding.connectDataIds`.
405 """
407 outputs: _DatasetDict
408 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this
409 quantum.
410 """
412 prerequisites: _DatasetDict
413 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite
414 inputs to this quantum.
415 """
417 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum:
418 """Transform the scaffolding object into a true `Quantum` instance.
420 Parameters
421 ----------
422 datastore_records : `~collections.abc.Mapping` [ `str`, \
423 `~lsst.daf.butler.DatastoreRecordData` ], optional
424 If not `None` then fill datastore records in each generated Quantum
425 using the records from this structure.
427 Returns
428 -------
429 quantum : `Quantum`
430 An actual `Quantum` instance.
431 """
432 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes)
433 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes))
434 # Give the task's Connections class an opportunity to remove some
435 # inputs, or complain if they are unacceptable.
436 # This will raise if one of the check conditions is not met, which is
437 # the intended behavior.
438 # If it raises NoWorkFound, there is a bug in the QG algorithm
439 # or adjustQuantum is incorrectly trying to make a prerequisite
440 # input behave like a regular input; adjustQuantum should only raise
441 # NoWorkFound if a regular input is missing, and it shouldn't be
442 # possible for us to have generated ``self`` if that's true.
443 helper = AdjustQuantumHelper(
444 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes)
445 )
446 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
447 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes)
448 quantum_records: Mapping[str, DatastoreRecordData] | None = None
449 if datastore_records is not None:
450 quantum_records = {}
451 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
452 input_refs += list(initInputs.values())
453 input_ids = set(ref.id for ref in input_refs)
454 for datastore_name, records in datastore_records.items():
455 matching_records = records.subset(input_ids)
456 if matching_records is not None:
457 quantum_records[datastore_name] = matching_records
458 return Quantum(
459 taskName=self.task.taskDef.taskName,
460 taskClass=self.task.taskDef.taskClass,
461 dataId=self.dataId,
462 initInputs=initInputs,
463 inputs=helper.inputs,
464 outputs=helper.outputs,
465 datastore_records=quantum_records,
466 )
469@dataclass
470class _TaskScaffolding:
471 """Helper class aggregating information about a `PipelineTask`, used when
472 constructing a `QuantumGraph`.
474 See `_PipelineScaffolding` for a top-down description of the full
475 scaffolding data structure.
477 Parameters
478 ----------
479 taskDef : `TaskDef`
480 Data structure that identifies the task class and its config.
481 parent : `_PipelineScaffolding`
482 The parent data structure that will hold the instance being
483 constructed.
484 datasetTypes : `TaskDatasetTypes`
485 Data structure that categorizes the dataset types used by this task.
486 """
488 def __init__(
489 self,
490 taskDef: TaskDef,
491 parent: _PipelineScaffolding,
492 datasetTypes: TaskDatasetTypes,
493 ):
494 universe = parent.dimensions.universe
495 self.taskDef = taskDef
496 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
497 assert self.dimensions.issubset(parent.dimensions)
498 # Initialize _DatasetDicts as subsets of the one or two
499 # corresponding dicts in the parent _PipelineScaffolding.
500 self.initInputs = _DatasetDict.fromSubset(
501 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
502 )
503 self.initOutputs = _DatasetDict.fromSubset(
504 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
505 )
506 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
507 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
508 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
509 self.dataIds: set[DataCoordinate] = set()
510 self.quanta = {}
511 self.storage_classes = {
512 connection.name: connection.storageClass
513 for connection in self.taskDef.connections.allConnections.values()
514 }
515 self.storage_classes[
516 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
517 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS
518 self.storage_classes[
519 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
520 ] = acc.LOG_OUTPUT_STORAGE_CLASS
521 self.storage_classes[
522 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
523 ] = acc.METADATA_OUTPUT_STORAGE_CLASS
525 def __repr__(self) -> str:
526 # Default dataclass-injected __repr__ gets caught in an infinite loop
527 # because of back-references.
528 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
530 taskDef: TaskDef
531 """Data structure that identifies the task class and its config
532 (`TaskDef`).
533 """
535 dimensions: DimensionGraph
536 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
537 """
539 initInputs: _DatasetDict
540 """Dictionary containing information about datasets used to construct this
541 task (`_DatasetDict`).
542 """
544 initOutputs: _DatasetDict
545 """Dictionary containing information about datasets produced as a
546 side-effect of constructing this task (`_DatasetDict`).
547 """
549 inputs: _DatasetDict
550 """Dictionary containing information about datasets used as regular,
551 graph-constraining inputs to this task (`_DatasetDict`).
552 """
554 outputs: _DatasetDict
555 """Dictionary containing information about datasets produced by this task
556 (`_DatasetDict`).
557 """
559 prerequisites: _DatasetDict
560 """Dictionary containing information about input datasets that must be
561 present in the repository before any Pipeline containing this task is run
562 (`_DatasetDict`).
563 """
565 quanta: dict[DataCoordinate, _QuantumScaffolding]
566 """Dictionary mapping data ID to a scaffolding object for the Quantum of
567 this task with that data ID.
568 """
570 storage_classes: dict[str, str]
571 """Mapping from dataset type name to storage class declared by this task.
572 """
574 def makeQuantumSet(
575 self,
576 missing: _DatasetDict,
577 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
578 ) -> set[Quantum]:
579 """Create a `set` of `Quantum` from the information in ``self``.
581 Parameters
582 ----------
583 missing : `_DatasetDict`
584 Input datasets that have not been found.
585 datastore_records : `dict`, optional
586 Records from the datastore to export with the quanta.
588 Returns
589 -------
590 nodes : `set` of `Quantum`
591 The `Quantum` elements corresponding to this task.
592 """
593 outputs = set()
594 for q in self.quanta.values():
595 try:
596 tmpQuantum = q.makeQuantum(datastore_records)
597 outputs.add(tmpQuantum)
598 except (NoWorkFound, FileNotFoundError) as exc:
599 if not missing.isdisjoint(q.inputs):
600 # This is a node that is known to be pruned later and
601 # should be left in even though some follow up queries
602 # fail. This allows the pruning to start from this quantum
603 # with known issues, and prune other nodes it touches.
604 inputs = q.inputs.unpackMultiRefs(self.storage_classes)
605 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes))
606 tmpQuantum = Quantum(
607 taskName=q.task.taskDef.taskName,
608 taskClass=q.task.taskDef.taskClass,
609 dataId=q.dataId,
610 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes),
611 inputs=inputs,
612 outputs=q.outputs.unpackMultiRefs(self.storage_classes),
613 )
614 outputs.add(tmpQuantum)
615 else:
616 raise exc
617 return outputs
620class _DatasetIdMaker:
621 """Helper class which generates random dataset UUIDs for unresolved
622 datasets.
623 """
625 def __init__(self, run: str):
626 self.run = run
627 # Cache of dataset refs generated so far.
628 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {}
630 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef:
631 # For components we need their parent dataset ID.
632 if dataset_type.isComponent():
633 parent_type = dataset_type.makeCompositeDatasetType()
634 # Parent should be resolved if this is an existing input, or it
635 # should be in the cache already if it is an intermediate.
636 key = parent_type, data_id
637 if key not in self.resolved:
638 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}")
639 parent_ref = self.resolved[key]
640 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False)
642 key = dataset_type, data_id
643 if (resolved := self.resolved.get(key)) is None:
644 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False)
645 self.resolved[key] = resolved
646 return resolved
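# Commented sketch (hypothetical types and IDs): a component ref produced by
# resolveRef reuses the dataset ID and run of its parent composite, so both
# point at the same stored dataset.
#
#     maker = _DatasetIdMaker(run="u/example/run")
#     parent_ref = maker.resolveRef(composite_type, data_id)
#     wcs_type = composite_type.makeComponentDatasetType("wcs")
#     assert maker.resolveRef(wcs_type, data_id).id == parent_ref.id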
648 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None:
649 """Resolve all unresolved references in the provided dictionary."""
650 for data_id, holder in refs.items():
651 if holder.ref is None:
652 holder.ref = self.resolveRef(holder.dataset_type, data_id)
655@dataclass
656class _PipelineScaffolding:
657 """A helper data structure that organizes the information involved in
658 constructing a `QuantumGraph` for a `Pipeline`.
660 Parameters
661 ----------
662 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
663 Sequence of tasks from which a graph is to be constructed. Must
664 have nested task classes already imported.
665 universe : `~lsst.daf.butler.DimensionUniverse`
666 Universe of all possible dimensions.
668 Notes
669 -----
670 The scaffolding data structure contains nested data structures for both
671 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
672 data structures are shared between the pipeline-level structure (which
673 aggregates all datasets and categorizes them from the perspective of the
674 complete pipeline) and the individual tasks that use them as inputs and
675 outputs.
677 `QuantumGraph` construction proceeds in four steps, with each corresponding
678 to a different `_PipelineScaffolding` method:
680 1. When `_PipelineScaffolding` is constructed, we extract and categorize
681 the DatasetTypes used by the pipeline (delegating to
682 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
683 nested `_TaskScaffolding` and `_DatasetDict` objects.
685 2. In `connectDataIds`, we construct and run the "Big Join Query", which
686 returns related tuples of all dimensions used to identify any regular
687 input, output, and intermediate datasets (not prerequisites). We then
688 iterate over these tuples of related dimensions, identifying the subsets
689 that correspond to distinct data IDs for each task and dataset type,
690 and then create `_QuantumScaffolding` objects.
692 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
693 dataset data IDs previously identified, transforming unresolved
694 DatasetRefs into resolved DatasetRefs where appropriate. We then look
695 up prerequisite datasets for all quanta.
697 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
698 per-task `_QuantumScaffolding` objects.
699 """
701 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry):
702 _LOG.debug("Initializing data structures for QuantumGraph generation.")
703 self.tasks = []
704 # Aggregate and categorize the DatasetTypes in the Pipeline.
705 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
706 # Construct dictionaries that map those DatasetTypes to structures
707 # that will (later) hold additional information about them.
708 for attr in (
709 "initInputs",
710 "initIntermediates",
711 "initOutputs",
712 "inputs",
713 "intermediates",
714 "outputs",
715 "prerequisites",
716 ):
717 setattr(
718 self,
719 attr,
720 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
721 )
722 self.missing = _DatasetDict(universe=registry.dimensions)
723 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
724 # Aggregate all dimensions for all non-init, non-prerequisite
725 # DatasetTypes. These are the ones we'll include in the big join
726 # query.
727 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
728 # Construct scaffolding nodes for each Task, and add backreferences
729 # to the Task from each DatasetScaffolding node.
730 # Note that there's only one scaffolding node for each DatasetType,
731 # shared by _PipelineScaffolding and all _TaskScaffoldings that
732 # reference it.
733 if isinstance(pipeline, Pipeline):
734 pipeline = pipeline.toExpandedPipeline()
735 self.tasks = [
736 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
737 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
738 ]
740 def __repr__(self) -> str:
741 # Default dataclass-injected __repr__ gets caught in an infinite loop
742 # because of back-references.
743 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
745 tasks: list[_TaskScaffolding]
746 """Scaffolding data structures for each task in the pipeline
747 (`list` of `_TaskScaffolding`).
748 """
750 initInputs: _DatasetDict
751 """Datasets consumed but not produced when constructing the tasks in this
752 pipeline (`_DatasetDict`).
753 """
755 initIntermediates: _DatasetDict
756 """Datasets that are both consumed and produced when constructing the tasks
757 in this pipeline (`_DatasetDict`).
758 """
760 initOutputs: _DatasetDict
761 """Datasets produced but not consumed when constructing the tasks in this
762 pipeline (`_DatasetDict`).
763 """
765 inputs: _DatasetDict
766 """Datasets that are consumed but not produced when running this pipeline
767 (`_DatasetDict`).
768 """
770 intermediates: _DatasetDict
771 """Datasets that are both produced and consumed when running this pipeline
772 (`_DatasetDict`).
773 """
775 outputs: _DatasetDict
776 """Datasets produced but not consumed when when running this pipeline
777 (`_DatasetDict`).
778 """
780 prerequisites: _DatasetDict
781 """Datasets that are consumed when running this pipeline and looked up
782 per-Quantum when generating the graph (`_DatasetDict`).
783 """
785 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
786 """Datasets that should be used as constraints in the initial query,
787 according to tasks (`~lsst.daf.butler.NamedValueSet`).
788 """
790 dimensions: DimensionGraph
791 """All dimensions used by any regular input, intermediate, or output
792 (not prerequisite) dataset; the set of dimensions used in the "Big Join
793 Query" (`~lsst.daf.butler.DimensionGraph`).
795 This is required to be a superset of all task quantum dimensions.
796 """
798 missing: _DatasetDict
799 """Datasets whose existence was originally predicted but were not
800 actually found.
802 Quanta that require these datasets as inputs will be pruned (recursively)
803 when actually constructing a `QuantumGraph` object.
805 These are currently populated only when the "initial dataset query
806 constraint" does not include all overall-input dataset types, and hence the
807 initial data ID query can include data IDs that it should not.
808 """
810 globalInitOutputs: _DatasetDict | None = None
811 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
812 """
814 @contextmanager
815 def connectDataIds(
816 self,
817 registry: Registry,
818 collections: Any,
819 userQuery: str | None,
820 externalDataId: DataCoordinate,
821 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
822 bind: Mapping[str, Any] | None = None,
823 ) -> Iterator[DataCoordinateQueryResults]:
824 """Query for the data IDs that connect nodes in the `QuantumGraph`.
826 This method populates `_TaskScaffolding.dataIds` and
827 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
829 Parameters
830 ----------
831 registry : `lsst.daf.butler.Registry`
832 Registry for the data repository; used for all data ID queries.
833 collections
834 Expressions representing the collections to search for input
835 datasets. See :ref:`daf_butler_ordered_collection_searches`.
836 userQuery : `str` or `None`
837 User-provided expression to limit the data IDs processed.
838 externalDataId : `~lsst.daf.butler.DataCoordinate`
839 Externally-provided data ID that should be used to restrict the
840 results, just as if these constraints had been included via ``AND``
841 in ``userQuery``. This includes (at least) any instrument named
842 in the pipeline definition.
843 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
844 The query constraint variant that should be used to constrain the
845 query based on dataset existence, defaults to
846 `DatasetQueryConstraintVariant.ALL`.
847 bind : `~collections.abc.Mapping`, optional
848 Mapping containing literal values that should be injected into the
849 ``userQuery`` expression, keyed by the identifiers they replace.
851 Returns
852 -------
853 commonDataIds : \
854 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
855 An interface to a database temporary table containing all data IDs
856 that will appear in this `QuantumGraph`. Returned inside a
857 context manager, which will drop the temporary table at the end of
858 the `with` block in which this method is called.
859 """
860 _LOG.debug("Building query for data IDs.")
861 # Initialization datasets always have empty data IDs.
862 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
863 for datasetType, refs in itertools.chain(
864 self.initInputs.items(),
865 self.initIntermediates.items(),
866 self.initOutputs.items(),
867 ):
868 refs[emptyDataId] = _RefHolder(datasetType)
869 # Run one big query for the data IDs for task dimensions and regular
870 # inputs and outputs. We limit the query to only dimensions that are
871 # associated with the input dataset types, but don't (yet) try to
872 # obtain the dataset_ids for those inputs.
873 _LOG.debug(
874 "Submitting data ID query over dimensions %s and materializing results.",
875 list(self.dimensions.names),
876 )
877 queryArgs: dict[str, Any] = {
878 "dimensions": self.dimensions,
879 "where": userQuery,
880 "dataId": externalDataId,
881 "bind": bind,
882 }
883 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
884 _LOG.debug(
885 "Constraining graph query using default of %s.",
886 list(self.defaultDatasetQueryConstraints.names),
887 )
888 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
889 queryArgs["collections"] = collections
890 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
891 _LOG.debug("Not using dataset existence to constrain query.")
892 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
893 constraint = set(datasetQueryConstraint)
894 inputs = {k.name: k for k in self.inputs.keys()}
895 if remainder := constraint.difference(inputs.keys()):
896 raise ValueError(
897 f"{remainder} dataset type(s) specified as a graph constraint, but"
898 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
899 )
900 _LOG.debug(f"Constraining graph query using {constraint}")
901 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
902 queryArgs["collections"] = collections
903 else:
904 raise ValueError(
905 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
906 )
908 if "datasets" in queryArgs:
909 for i, dataset_type in enumerate(queryArgs["datasets"]):
910 if dataset_type.isComponent():
911 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
913 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
914 _LOG.debug("Expanding data IDs.")
915 commonDataIds = commonDataIds.expanded()
916 _LOG.debug("Iterating over query results to associate quanta with datasets.")
917 # Iterate over query results, populating data IDs for datasets and
918 # quanta and then connecting them to each other.
919 n = -1
920 for n, commonDataId in enumerate(commonDataIds):
921 # Create DatasetRefs for all DatasetTypes from this result row,
922 # noting that we might have created some already.
923 # We remember both those that already existed and those that we
924 # create now.
925 refsForRow = {}
926 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {}
927 for datasetType, refs in itertools.chain(
928 self.inputs.items(),
929 self.intermediates.items(),
930 self.outputs.items(),
931 ):
932 datasetDataId: DataCoordinate | None
933 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
934 datasetDataId = commonDataId.subset(datasetType.dimensions)
935 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
936 ref_holder = refs.get(datasetDataId)
937 if ref_holder is None:
938 ref_holder = _RefHolder(datasetType)
939 refs[datasetDataId] = ref_holder
940 refsForRow[datasetType.name] = ref_holder
941 # Create _QuantumScaffolding objects for all tasks from this
942 # result row, noting that we might have created some already.
943 for task in self.tasks:
944 quantumDataId = commonDataId.subset(task.dimensions)
945 quantum = task.quanta.get(quantumDataId)
946 if quantum is None:
947 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
948 task.quanta[quantumDataId] = quantum
949 # Whether this is a new quantum or an existing one, we can
950 # now associate the DatasetRefs for this row with it. The
951 # fact that a Quantum data ID and a dataset data ID both
952 # came from the same result row is what tells us they
953 # should be associated.
954 # Many of these associations will be duplicates (because
955 # another query row that differed from this one only in
956 # irrelevant dimensions already added them); the dict
957 # assignments below simply overwrite them harmlessly.
958 for datasetType in task.inputs:
959 dataId = dataIdCacheForRow[datasetType.dimensions]
960 ref_holder = refsForRow[datasetType.name]
961 quantum.inputs[datasetType.name][dataId] = ref_holder
962 for datasetType in task.outputs:
963 dataId = dataIdCacheForRow[datasetType.dimensions]
964 ref_holder = refsForRow[datasetType.name]
965 quantum.outputs[datasetType.name][dataId] = ref_holder
966 if n < 0:
967 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
968 emptiness_explained = False
969 for message in commonDataIds.explain_no_results():
970 _LOG.critical(message)
971 emptiness_explained = True
972 if not emptiness_explained:
973 _LOG.critical(
974 "To reproduce this query for debugging purposes, run "
975 "Registry.queryDataIds with these arguments:"
976 )
977 # We could just repr() the queryArgs dict to get something
978 # the user could make sense of, but it's friendlier to
979 # put these args in an easier-to-construct equivalent form
980 # so they can read it more easily and copy and paste into
981 # a Python terminal.
982 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
983 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
984 if queryArgs["where"]:
985 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
986 if "datasets" in queryArgs:
987 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
988 if "collections" in queryArgs:
989 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
990 _LOG.debug("Finished processing %d rows from data ID query.", n)
991 yield commonDataIds
993 def resolveDatasetRefs(
994 self,
995 registry: Registry,
996 collections: Any,
997 run: str,
998 commonDataIds: DataCoordinateQueryResults,
999 *,
1000 skipExistingIn: Any = None,
1001 clobberOutputs: bool = True,
1002 constrainedByAllDatasets: bool = True,
1003 ) -> None:
1004 """Perform follow up queries for each dataset data ID produced in
1005 `fillDataIds`.
1007 This method populates `_DatasetScaffolding.refs` (except for those in
1008 `prerequisites`).
1010 Parameters
1011 ----------
1012 registry : `lsst.daf.butler.Registry`
1013 Registry for the data repository; used for all data ID queries.
1014 collections
1015 Expressions representing the collections to search for input
1016 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1017 run : `str`
1018 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1019 output datasets, if it already exists.
1020 commonDataIds : \
1021 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
1022 Result of a previous call to `connectDataIds`.
1023 skipExistingIn
1024 Expressions representing the collections to search for existing
1025 output datasets that should be skipped. See
1026 :ref:`daf_butler_ordered_collection_searches` for allowed types.
1027 `None` or empty string/sequence disables skipping.
1028 clobberOutputs : `bool`, optional
1029 If `True` (default), allow quanta to be created even if outputs exist;
1030 this requires the same behavior to be enabled when
1031 executing. If ``skipExistingIn`` is not `None`, completed quanta
1032 (those with metadata, or all outputs if there is no metadata
1033 dataset configured) will be skipped rather than clobbered.
1034 constrainedByAllDatasets : `bool`, optional
1035 Indicates if the commonDataIds were generated with a constraint on
1036 all dataset types.
1038 Raises
1039 ------
1040 OutputExistsError
1041 Raised if an output dataset already exists in the output run
1042 and ``skipExistingIn`` does not include output run, or if only
1043 some outputs are present and ``clobberOutputs`` is `False`.
1044 """
1045 # The run may be provided but does not have to exist; in that case we
1046 # use it for resolving references but don't check it for existing refs.
1047 run_exists = False
1048 if run:
1049 try:
1050 run_exists = bool(registry.queryCollections(run))
1051 except MissingCollectionError:
1052 # An undocumented exception is raised if the collection does not exist.
1053 pass
1055 skip_collections_wildcard: CollectionWildcard | None = None
1056 skipExistingInRun = False
1057 if skipExistingIn:
1058 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
1059 if run_exists:
1060 # As an optimization, check the explicit list of names first.
1061 skipExistingInRun = run in skip_collections_wildcard.strings
1062 if not skipExistingInRun:
1063 # need to flatten it and check again
1064 skipExistingInRun = run in registry.queryCollections(
1065 skipExistingIn,
1066 collectionTypes=CollectionType.RUN,
1067 )
1069 idMaker = _DatasetIdMaker(run)
1071 resolvedRefQueryResults: Iterable[DatasetRef]
1073 # Updating constrainedByAllDatasets here is not ideal, but we have a
1074 # few different code paths that each transfer different pieces of
1075 # information about what dataset query constraints were applied here,
1076 # and none of them has the complete picture until we get here. We're
1077 # long overdue for a QG generation rewrite that will make this go away
1078 # entirely anyway.
1079 constrainedByAllDatasets = (
1080 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
1081 )
1083 # Look up [init] intermediate and output datasets in the output
1084 # collection, if there is an output collection.
1085 if run_exists or skip_collections_wildcard is not None:
1086 for datasetType, refs in itertools.chain(
1087 self.initIntermediates.items(),
1088 self.initOutputs.items(),
1089 self.intermediates.items(),
1090 self.outputs.items(),
1091 ):
1092 _LOG.debug(
1093 "Resolving %d datasets for intermediate and/or output dataset %s.",
1094 len(refs),
1095 datasetType.name,
1096 )
1097 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
1098 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
1099 # TODO: this assert incorrectly bans component inputs;
1100 # investigate on DM-33027.
1101 # assert not datasetType.isComponent(), \
1102 # "Output datasets cannot be components."
1103 #
1104 # Instead we have to handle them manually to avoid a
1105 # deprecation warning, but it is at least confusing and
1106 # possibly a bug for components to appear here at all.
1107 if datasetType.isComponent():
1108 parent_dataset_type = datasetType.makeCompositeDatasetType()
1109 component = datasetType.component()
1110 else:
1111 parent_dataset_type = datasetType
1112 component = None
1114 # look at RUN collection first
1115 if run_exists:
1116 try:
1117 resolvedRefQueryResults = subset.findDatasets(
1118 parent_dataset_type, collections=run, findFirst=True
1119 )
1120 except MissingDatasetTypeError:
1121 resolvedRefQueryResults = []
1122 for resolvedRef in resolvedRefQueryResults:
1123 # TODO: we could easily support per-DatasetType
1124 # skipExisting and I could imagine that being useful -
1125 # it's probably required in order to support writing
1126 # initOutputs before QuantumGraph generation.
1127 assert resolvedRef.dataId in refs
1128 if not (skipExistingInRun or isInit or clobberOutputs):
1129 raise OutputExistsError(
1130 f"Output dataset {datasetType.name} already exists in "
1131 f"output RUN collection '{run}' with data ID"
1132 f" {resolvedRef.dataId}."
1133 )
1134 # To resolve all outputs we have to remember existing
1135 # ones to avoid generating new dataset IDs for them.
1136 refs[resolvedRef.dataId].ref = (
1137 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1138 )
1140 # Also check skipExistingIn; the case where the RUN collection is
1141 # part of it has already been handled above.
1142 if skip_collections_wildcard is not None:
1143 try:
1144 resolvedRefQueryResults = subset.findDatasets(
1145 parent_dataset_type,
1146 collections=skip_collections_wildcard,
1147 findFirst=True,
1148 )
1149 except MissingDatasetTypeError:
1150 resolvedRefQueryResults = []
1151 for resolvedRef in resolvedRefQueryResults:
1152 if resolvedRef.dataId not in refs:
1153 continue
1154 refs[resolvedRef.dataId].ref = (
1155 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1156 )
1158 # Look up input and initInput datasets in the input collection(s). We
1159 # accumulate datasets in self.missing, if the common data IDs were not
1160 # constrained on dataset type existence.
1161 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1162 _LOG.debug(
1163 "Resolving %d datasets for input dataset %s.",
1164 len(refs),
1165 datasetType.name,
1166 )
1167 if datasetType.isComponent():
1168 parent_dataset_type = datasetType.makeCompositeDatasetType()
1169 component = datasetType.component()
1170 else:
1171 parent_dataset_type = datasetType
1172 component = None
1173 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {}
1174 try:
1175 resolvedRefQueryResults = commonDataIds.subset(
1176 datasetType.dimensions, unique=True
1177 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1178 except MissingDatasetTypeError:
1179 resolvedRefQueryResults = []
1180 dataIdsNotFoundYet = set(refs.keys())
1181 for resolvedRef in resolvedRefQueryResults:
1182 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1183 if resolvedRef.dataId not in refs:
1184 continue
1185 refs[resolvedRef.dataId].ref = (
1186 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1187 )
1188 if dataIdsNotFoundYet:
1189 if constrainedByAllDatasets:
1190 raise RuntimeError(
1191 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1192 f"'{datasetType.name}' was/were present in a previous "
1193 "query, but could not be found now. "
1194 "This is either a logic bug in QuantumGraph generation "
1195 "or the input collections have been modified since "
1196 "QuantumGraph generation began."
1197 )
1198 elif not datasetType.dimensions:
1199 raise RuntimeError(
1200 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1201 f"collections {collections}."
1202 )
1203 else:
1204 # If the common dataIds were not constrained using all the
1205 # input dataset types, it is possible that some data ids
1206 # found don't correspond to existing datasets. Mark these
1207 # for later pruning from the quantum graph.
1208 for k in dataIdsNotFoundYet:
1209 missing_for_dataset_type[k] = refs[k]
1210 if missing_for_dataset_type:
1211 self.missing[datasetType] = missing_for_dataset_type
1213 # Resolve the missing refs, just so they look like all of the others;
1214 # in the end other code will make sure they never appear in the QG.
1215 for dataset_type, refDict in self.missing.items():
1216 idMaker.resolveDict(dataset_type, refDict)
1218 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1219 # replacing the unresolved refs there, and then look up prerequisites.
1220 for task in self.tasks:
1221 _LOG.debug(
1222 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1223 len(task.quanta),
1224 task.taskDef.label,
1225 )
1226 # The way iterConnections is designed makes it impossible to
1227 # annotate precisely enough to satisfy MyPy here.
1228 lookupFunctions = {
1229 c.name: c.lookupFunction # type: ignore
1230 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1231 if c.lookupFunction is not None # type: ignore
1232 }
1233 dataIdsFailed = []
1234 dataIdsSucceeded = []
1235 for quantum in task.quanta.values():
1236 # Process output datasets only if skipExistingIn is not None
1237 # or there is a run to look for outputs in and clobberOutputs
1238 # is True. Note that if skipExistingIn is None, any output
1239 # datasets that already exist would have already caused an
1240 # exception to be raised.
1241 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1242 resolvedRefs = []
1243 unresolvedDataIds = []
1244 haveMetadata = False
1245 for datasetType, originalRefs in quantum.outputs.items():
1246 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()):
1247 if ref is not None:
1248 resolvedRefs.append(ref)
1249 originalRefs[dataId].ref = ref
1250 if datasetType.name == task.taskDef.metadataDatasetName:
1251 haveMetadata = True
1252 else:
1253 unresolvedDataIds.append((datasetType, dataId))
1254 if resolvedRefs:
1255 if haveMetadata or not unresolvedDataIds:
1256 dataIdsSucceeded.append(quantum.dataId)
1257 if skip_collections_wildcard is not None:
1258 continue
1259 else:
1260 dataIdsFailed.append(quantum.dataId)
1261 if not clobberOutputs:
1262 raise OutputExistsError(
1263 f"Quantum {quantum.dataId} of task with label "
1264 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1265 f"({resolvedRefs}) "
1266 f"and others that don't ({unresolvedDataIds}), with no metadata output, "
1267 "and clobbering outputs was not enabled."
1268 )
1269 # Update the input DatasetRefs to the resolved ones we already
1270 # searched for.
1271 for datasetType, input_refs in quantum.inputs.items():
1272 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()):
1273 input_refs[data_id].ref = ref
1274 # Look up prerequisite datasets in the input collection(s).
1275 # These may have dimensions that extend beyond those we queried
1276 # for originally, because we want to permit those data ID
1277 # values to differ across quanta and dataset types.
1278 for datasetType in task.prerequisites:
1279 if datasetType.isComponent():
1280 parent_dataset_type = datasetType.makeCompositeDatasetType()
1281 component = datasetType.component()
1282 else:
1283 parent_dataset_type = datasetType
1284 component = None
1285 lookupFunction = lookupFunctions.get(datasetType.name)
1286 if lookupFunction is not None:
1287 # PipelineTask has provided its own function to do the
1288 # lookup. This always takes precedence.
1289 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1290 elif (
1291 datasetType.isCalibration()
1292 and datasetType.dimensions <= quantum.dataId.graph
1293 and quantum.dataId.graph.temporal
1294 ):
1295 # This is a master calibration lookup, which we have to
1296 # handle specially because the query system can't do a
1297 # temporal join on a non-dimension-based timespan yet.
1298 timespan = quantum.dataId.timespan
1299 try:
1300 prereq_ref = registry.findDataset(
1301 parent_dataset_type,
1302 quantum.dataId,
1303 collections=collections,
1304 timespan=timespan,
1305 )
1306 if prereq_ref is not None:
1307 if component is not None:
1308 prereq_ref = prereq_ref.makeComponentRef(component)
1309 prereq_refs = [prereq_ref]
1310 else:
1311 prereq_refs = []
1312 except (KeyError, MissingDatasetTypeError):
1313 # This dataset type is not present in the registry,
1314 # which just means there are no datasets here.
1315 prereq_refs = []
1316 else:
1317 # Most general case.
1318 prereq_refs = [
1319 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1320 for prereq_ref in registry.queryDatasets(
1321 parent_dataset_type,
1322 collections=collections,
1323 dataId=quantum.dataId,
1324 findFirst=True,
1325 ).expanded()
1326 ]
1328 for ref in prereq_refs:
1329 if ref is not None:
1330 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1331 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1333 # Resolve all quantum inputs and outputs.
1334 for datasetDict in (quantum.inputs, quantum.outputs):
1335 for dataset_type, refDict in datasetDict.items():
1336 idMaker.resolveDict(dataset_type, refDict)
1338 # Resolve task initInputs and initOutputs.
1339 for datasetDict in (task.initInputs, task.initOutputs):
1340 for dataset_type, refDict in datasetDict.items():
1341 idMaker.resolveDict(dataset_type, refDict)
1343 # Actually remove any quanta that we decided to skip above.
1344 if dataIdsSucceeded:
1345 if skip_collections_wildcard is not None:
1346 _LOG.debug(
1347 "Pruning successful %d quanta for task with label '%s' because all of their "
1348 "outputs exist or metadata was written successfully.",
1349 len(dataIdsSucceeded),
1350 task.taskDef.label,
1351 )
1352 for dataId in dataIdsSucceeded:
1353 del task.quanta[dataId]
1354 elif clobberOutputs:
1355 _LOG.info(
1356 "Found %d successful quanta for task with label '%s' "
1357 "that will need to be clobbered during execution.",
1358 len(dataIdsSucceeded),
1359 task.taskDef.label,
1360 )
1361 else:
1362 raise AssertionError("OutputExistsError should have already been raised.")
1363 if dataIdsFailed:
1364 if clobberOutputs:
1365 _LOG.info(
1366 "Found %d failed/incomplete quanta for task with label '%s' "
1367 "that will need to be clobbered during execution.",
1368 len(dataIdsFailed),
1369 task.taskDef.label,
1370 )
1371 else:
1372 raise AssertionError("OutputExistsError should have already been raised.")
1374 # Collect initOutputs that do not belong to any task.
1375 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1376 for task in self.tasks:
1377 global_dataset_types -= set(task.initOutputs)
1378 if global_dataset_types:
1379 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1380 for dataset_type, refDict in self.globalInitOutputs.items():
1381 idMaker.resolveDict(dataset_type, refDict)
1383 def makeQuantumGraph(
1384 self,
1385 registry: Registry,
1386 metadata: Mapping[str, Any] | None = None,
1387 datastore: Datastore | None = None,
1388 ) -> QuantumGraph:
1389 """Create a `QuantumGraph` from the quanta already present in
1390 the scaffolding data structure.
1392 Parameters
1393 ----------
1394 registry : `lsst.daf.butler.Registry`
1395 Registry for the data repository; used for all data ID queries.
1396 metadata : `~collections.abc.Mapping` of `str` to primitives, optional
1397 This is an optional parameter of extra data to carry with the
1398 graph. Entries in this mapping should be able to be serialized in
1399 JSON.
1400 datastore : `~lsst.daf.butler.Datastore`, optional
1401 If not `None` then fill datastore records in each generated
1402 Quantum.
1404 Returns
1405 -------
1406 graph : `QuantumGraph`
1407 The full `QuantumGraph`.
1408 """
1410 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1411 """Extract all DatasetRefs from the dictionaries"""
1412 for ref_dict in dataset_dict.values():
1413 for holder in ref_dict.values():
1414 yield holder.resolved_ref
1416 datastore_records: Mapping[str, DatastoreRecordData] | None = None
1417 if datastore is not None:
1418 datastore_records = datastore.export_records(
1419 itertools.chain(
1420 _make_refs(self.inputs),
1421 _make_refs(self.initInputs),
1422 _make_refs(self.prerequisites),
1423 )
1424 )
1426 graphInput: dict[TaskDef, set[Quantum]] = {}
1427 for task in self.tasks:
1428 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records)
1429 graphInput[task.taskDef] = qset
1431 taskInitInputs = {
1432 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values()
1433 for task in self.tasks
1434 }
1435 taskInitOutputs = {
1436 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values()
1437 for task in self.tasks
1438 }
1440 globalInitOutputs: list[DatasetRef] = []
1441 if self.globalInitOutputs is not None:
1442 for refs_dict in self.globalInitOutputs.values():
1443 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values())
1445 graph = QuantumGraph(
1446 graphInput,
1447 metadata=metadata,
1448 pruneRefs=list(self.missing.iter_resolved_refs()),
1449 universe=self.dimensions.universe,
1450 initInputs=taskInitInputs,
1451 initOutputs=taskInitOutputs,
1452 globalInitOutputs=globalInitOutputs,
1453 registryDatasetTypes=self._get_registry_dataset_types(registry),
1454 )
1455 return graph
1457 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1458 """Make a list of all dataset types used by a graph as defined in
1459 registry.
1460 """
1461 chain = [
1462 self.initInputs,
1463 self.initIntermediates,
1464 self.initOutputs,
1465 self.inputs,
1466 self.intermediates,
1467 self.outputs,
1468 self.prerequisites,
1469 ]
1470 if self.globalInitOutputs is not None:
1471 chain.append(self.globalInitOutputs)
1473 # Collect names of all dataset types.
1474 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1475 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1477 # Check for types that do not exist in registry yet:
1478 # - inputs must exist
1479 # - intermediates and outputs may not exist, but there must not be
1480 # more than one definition (e.g. differing in storage class)
1481 # - prerequisites may not exist, treat it the same as outputs here
1482 for dstype in itertools.chain(self.initInputs, self.inputs):
1483 if dstype.name not in dataset_types:
1484 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1486 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1487 chain = [
1488 self.initIntermediates,
1489 self.initOutputs,
1490 self.intermediates,
1491 self.outputs,
1492 self.prerequisites,
1493 ]
1494 if self.globalInitOutputs is not None:
1495 chain.append(self.globalInitOutputs)
1496 for dstype in itertools.chain(*chain):
1497 if dstype.name not in dataset_types:
1498 new_outputs[dstype.name].add(dstype)
1499 for name, dstypes in new_outputs.items():
1500 if len(dstypes) > 1:
1501 raise ValueError(
1502 "Pipeline contains multiple definitions for a dataset type "
1503 f"which is not defined in registry yet: {dstypes}"
1504 )
1505 elif len(dstypes) == 1:
1506 dataset_types[name] = dstypes.pop()
1508 return dataset_types.values()
1511# ------------------------
1512# Exported definitions --
1513# ------------------------
1516class GraphBuilderError(Exception):
1517 """Base class for exceptions generated by graph builder."""
1519 pass
1522class OutputExistsError(GraphBuilderError):
1523 """Exception generated when output datasets already exist."""
1525 pass
1528class PrerequisiteMissingError(GraphBuilderError):
1529 """Exception generated when a prerequisite dataset does not exist."""
1531 pass
1534class GraphBuilder:
1535 """GraphBuilder class is responsible for building task execution graph from
1536 a Pipeline.
1538 Parameters
1539 ----------
1540 registry : `~lsst.daf.butler.Registry`
1541 Registry for the data repository; used for all data ID queries.
1542 skipExistingIn
1543 Expressions representing the collections to search for existing
1544 output datasets that should be skipped. See
1545 :ref:`daf_butler_ordered_collection_searches`.
1546 clobberOutputs : `bool`, optional
1547 If `True` (default), allow quanta to be created even if partial outputs
1548 exist; this requires the same behavior to be enabled when
1549 executing.
1550 datastore : `~lsst.daf.butler.Datastore`, optional
1551 If not `None` then fill datastore records in each generated Quantum.
1552 """
1554 def __init__(
1555 self,
1556 registry: Registry,
1557 skipExistingIn: Any = None,
1558 clobberOutputs: bool = True,
1559 datastore: Datastore | None = None,
1560 ):
1561 self.registry = registry
1562 self.dimensions = registry.dimensions
1563 self.skipExistingIn = skipExistingIn
1564 self.clobberOutputs = clobberOutputs
1565 self.datastore = datastore
1567 def makeGraph(
1568 self,
1569 pipeline: Pipeline | Iterable[TaskDef],
1570 collections: Any,
1571 run: str,
1572 userQuery: str | None,
1573 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1574 metadata: Mapping[str, Any] | None = None,
1575 bind: Mapping[str, Any] | None = None,
1576 dataId: DataCoordinate | None = None,
1577 ) -> QuantumGraph:
1578 """Create execution graph for a pipeline.
1580 Parameters
1581 ----------
1582 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
1583 Pipeline definition, task names/classes and their configs.
1584 collections
1585 Expressions representing the collections to search for input
1586 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1587 run : `str`
1588 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1589 output datasets. The collection does not have to exist; it will be
1590 created when the graph is executed.
1591 userQuery : `str` or `None`
1592 String that defines a user-provided selection for the registry; should
1593 be empty or `None` if there are no restrictions on data selection.
1594 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1595 The query constraint variant that should be used to constrain the
1596 query based on dataset existence, defaults to
1597 `DatasetQueryConstraintVariant.ALL`.
1598 metadata : `~collections.abc.Mapping` of `str` to primitives, optional
1599 This is an optional parameter of extra data to carry with the
1600 graph. Entries in this mapping should be able to be serialized in
1601 JSON.
1602 bind : `~collections.abc.Mapping`, optional
1603 Mapping containing literal values that should be injected into the
1604 ``userQuery`` expression, keyed by the identifiers they replace.
1605 dataId : `lsst.daf.butler.DataCoordinate`, optional
1606 Data ID that should also be included in the query constraint.
1608 Returns
1609 -------
1610 graph : `QuantumGraph`
1612 Raises
1613 ------
1614 UserExpressionError
1615 Raised when user expression cannot be parsed.
1616 OutputExistsError
1617 Raised when output datasets already exist.
1618 Exception
1619 Other exception types may be raised by underlying registry
1620 classes.
1621 """
1622 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1623 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1624 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1625 if dataId is None:
1626 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1627 if isinstance(pipeline, Pipeline):
1628 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId)
1629 with scaffolding.connectDataIds(
1630 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1631 ) as commonDataIds:
1632 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1633 scaffolding.resolveDatasetRefs(
1634 self.registry,
1635 collections,
1636 run,
1637 commonDataIds,
1638 skipExistingIn=self.skipExistingIn,
1639 clobberOutputs=self.clobberOutputs,
1640 constrainedByAllDatasets=condition,
1641 )
1642 return scaffolding.makeQuantumGraph(
1643 registry=self.registry, metadata=metadata, datastore=self.datastore
1644 )
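# Commented usage sketch for GraphBuilder (the collection name, run name, and
# query expression below are hypothetical):
#
#     builder = GraphBuilder(registry=butler.registry, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/example/run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )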