Coverage for python/lsst/pipe/base/graphBuilder.py: 15%
548 statements
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap, defaultdict
34from collections.abc import Collection, Iterable, Iterator, Mapping
35from contextlib import contextmanager
36from dataclasses import dataclass
37from typing import Any
39from lsst.daf.butler import (
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
57# -----------------------------
58# Imports for other modules --
59# -----------------------------
60from . import automatic_connection_constants as acc
61from ._datasetQueryConstraints import DatasetQueryConstraintVariant
62from ._status import NoWorkFound
63from .connections import AdjustQuantumHelper, iterConnections
64from .graph import QuantumGraph
65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
67# ----------------------------------
68# Local non-exported definitions --
69# ----------------------------------
71_LOG = logging.getLogger(__name__)
74@dataclass
75class _RefHolder:
76 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future
77 resolved reference.
79 As we have eliminated unresolved `~lsst.daf.butler.DatasetRef`\s, we now use
80 `None` to represent a reference that is yet to be resolved. Information
81 about its corresponding dataset type and coordinate is stored in
82 `_DatasetDict` mapping.
83 """
85 dataset_type: DatasetType
86 """Dataset type of the dataset to be created later. I need to store it here
87 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared
88 between different compatible dataset types."""
90 ref: DatasetRef | None = None
91 """Dataset reference, initially `None`, created when all datasets are
92 resolved.
93 """
95 @property
96 def resolved_ref(self) -> DatasetRef:
97 """Access resolved reference, should only be called after the
98 reference is set (`~lsst.daf.butler.DatasetRef`).
99 """
100 assert self.ref is not None, "Dataset reference is not set."
101 return self.ref
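# Illustrative sketch (added commentary, not part of the original module): the
# typical _RefHolder life cycle. ``dataset_type`` and ``data_id`` are
# hypothetical stand-ins for objects produced elsewhere in this module.
#
#     holder = _RefHolder(dataset_type)            # starts with ref=None
#     # ... later, once the dataset is resolved against a RUN collection:
#     holder.ref = DatasetRef(dataset_type, data_id, run="some_run")
#     ref = holder.resolved_ref                    # safe only after ref is set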
104class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]):
105 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested
106 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that
107 type.
109 Parameters
110 ----------
111 args
112 Positional arguments are forwarded to the `dict` constructor.
113 universe : `~lsst.daf.butler.DimensionUniverse`
114 Universe of all possible dimensions.
115 """
117 def __init__(self, *args: Any, universe: DimensionUniverse):
118 super().__init__(*args)
119 self.universe = universe
121 @classmethod
122 def fromDatasetTypes(
123 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
124 ) -> _DatasetDict:
125 """Construct a dictionary from a flat iterable of
126 `~lsst.daf.butler.DatasetType` keys.
128 Parameters
129 ----------
130 datasetTypes : `~collections.abc.Iterable` of \
131 `~lsst.daf.butler.DatasetType`
132 DatasetTypes to use as keys for the dict. Values will be empty
133 dictionaries.
134 universe : `~lsst.daf.butler.DimensionUniverse`
135 Universe of all possible dimensions.
137 Returns
138 -------
139 dictionary : `_DatasetDict`
140 A new `_DatasetDict` instance.
141 """
142 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
144 @classmethod
145 def fromSubset(
146 cls,
147 datasetTypes: Collection[DatasetType],
148 first: _DatasetDict,
149 *rest: _DatasetDict,
150 ) -> _DatasetDict:
151 """Return a new dictionary by extracting items corresponding to the
152 given keys from one or more existing dictionaries.
154 Parameters
155 ----------
156 datasetTypes : `~collections.abc.Iterable` of \
157 `~lsst.daf.butler.DatasetType`
158 DatasetTypes to use as keys for the dict. Values will be obtained
159 by lookups against ``first`` and ``rest``.
160 first : `_DatasetDict`
161 Another dictionary from which to extract values.
162 rest
163 Additional dictionaries from which to extract values.
165 Returns
166 -------
167 dictionary : `_DatasetDict`
168 A new dictionary instance.
169 """
170 combined = ChainMap(first, *rest)
172 # Dataset types known to match immediately can be processed
173 # without checks.
174 matches = combined.keys() & set(datasetTypes)
175 _dict = {k: combined[k] for k in matches}
177 if len(_dict) < len(datasetTypes):
178 # Work out which ones are missing.
179 missing_datasetTypes = set(datasetTypes) - _dict.keys()
181 # Get the known names for comparison.
182 combined_by_name = {k.name: k for k in combined}
184 missing = set()
185 incompatible = {}
186 for datasetType in missing_datasetTypes:
187 # The dataset type is not found. It may not be listed
188 # or it may be that it is there with the same name
189 # but a different definition.
190 if datasetType.name in combined_by_name:
191 # This implies some inconsistency in definitions
192 # for connections. If there is support for storage
193 # class conversion we can let it slide.
194 # At this point we do not know
195 # where the inconsistency is, but trust that
196 # downstream code will be more explicit about input
197 # vs output incompatibilities.
198 existing = combined_by_name[datasetType.name]
199 convertible_to_existing = existing.is_compatible_with(datasetType)
200 convertible_from_existing = datasetType.is_compatible_with(existing)
201 if convertible_to_existing and convertible_from_existing:
202 _LOG.debug(
203 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
204 datasetType.name,
205 datasetType.storageClass_name,
206 existing.storageClass_name,
207 )
208 _dict[datasetType] = combined[existing]
209 elif convertible_to_existing or convertible_from_existing:
210 # We'd need to refactor a fair amount to recognize
211 # whether this is an error or not, so I'm not going to
212 # bother until we need to do that for other reasons
213 # (it won't be too long).
214 _LOG.info(
215 "Dataset type %s is present with multiple only partially-compatible storage "
216 "classes %s and %s.",
217 datasetType.name,
218 datasetType.storageClass_name,
219 existing.storageClass_name,
220 )
221 _dict[datasetType] = combined[existing]
222 else:
223 incompatible[datasetType] = existing
224 else:
225 missing.add(datasetType)
227 if missing or incompatible:
228 reasons = []
229 if missing:
230 reasons.append(
231 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known "
232 f"types: [{', '.join(d.name for d in combined)}]."
233 )
234 if incompatible:
235 for x, y in incompatible.items():
236 reasons.append(f"{x} incompatible with {y}")
237 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
239 return cls(_dict, universe=first.universe)
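# Summary sketch (added commentary, not part of the original module) of the
# storage-class compatibility decision made above, using the same
# ``DatasetType.is_compatible_with`` calls; ``existing`` and ``datasetType``
# are the two same-named definitions being compared.
#
#     to_existing = existing.is_compatible_with(datasetType)
#     from_existing = datasetType.is_compatible_with(existing)
#     if to_existing and from_existing:
#         pass  # fully compatible: silently reuse the existing entry
#     elif to_existing or from_existing:
#         pass  # partially compatible: reuse the entry, but log at INFO level
#     else:
#         pass  # incompatible: reported via the KeyError raised above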
241 @property
242 def dimensions(self) -> DimensionGraph:
243 """The union of all dimensions used by all dataset types in this
244 dictionary, including implied dependencies (`DimensionGraph`).
245 """
246 base = self.universe.empty
247 if len(self) == 0:
248 return base
249 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
251 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]:
252 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts
253 into a new mapping with `~lsst.daf.butler.DatasetType` keys and
254 `~lsst.daf.butler.DatasetRef` values.
256 This method assumes that each nested dictionary contains exactly one
257 item, as is the case for all "init" datasets.
259 Parameters
260 ----------
261 storage_classes : `dict` [ `str`, `str` ]
262 Mapping from dataset type name to the storage class to use for that
263 dataset type. These are typically the storage classes declared
264 for a particular task, which may differ from the data repository
265 definitions.
267 Returns
268 -------
269 dictionary : `~lsst.daf.butler.NamedKeyDict`
270 Dictionary mapping `~lsst.daf.butler.DatasetType` to
271 `~lsst.daf.butler.DatasetRef`, with both
272 `~lsst.daf.butler.DatasetType` instances and string names usable
273 as keys.
274 """
275 return NamedKeyDict(
276 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()}
277 )
279 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
280 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into
281 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of
282 `~lsst.daf.butler.DatasetRef` values.
284 Parameters
285 ----------
286 storage_classes : `dict` [ `str`, `str` ]
287 Mapping from dataset type name to the storage class to use for that
288 dataset type. These are typically the storage classes declared
289 for a particular task, which may differ from the data repository
290 definitions.
292 Returns
293 -------
294 dictionary : `~lsst.daf.butler.NamedKeyDict`
295 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of
296 `~lsst.daf.butler.DatasetRef`, with both
297 `~lsst.daf.butler.DatasetType` instances and string names usable
298 as keys.
299 """
300 result = {}
301 for dataset_type, holders in self.items():
302 if (
303 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
304 ) != dataset_type.storageClass_name:
305 dataset_type = dataset_type.overrideStorageClass(override)
306 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()]
307 else:
308 refs = [holder.resolved_ref for holder in holders.values()]
309 result[dataset_type] = refs
310 return NamedKeyDict(result)
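# Illustrative sketch (added commentary, not part of the original module): how
# a task-declared storage class override is applied during unpacking; the
# names below are hypothetical.
#
#     declared = storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
#     if declared != dataset_type.storageClass_name:
#         dataset_type = dataset_type.overrideStorageClass(declared)
#         refs = [holder.resolved_ref.overrideStorageClass(declared) for holder in holders.values()]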
312 def extract(
313 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
314 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]:
315 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances
316 that match the given `~lsst.daf.butler.DatasetType` and data IDs.
318 Parameters
319 ----------
320 datasetType : `~lsst.daf.butler.DatasetType`
321 Dataset type to match.
322 dataIds : `~collections.abc.Iterable` \
323 [ `~lsst.daf.butler.DataCoordinate` ]
324 Data IDs to match.
326 Returns
327 -------
328 refs : `~collections.abc.Iterator` [ `~lsst.daf.butler.DatasetRef` ]
329 DatasetRef instances for which ``ref.datasetType == datasetType``
330 and ``ref.dataId`` is in ``dataIds``.
331 """
332 refs = self[datasetType]
333 return ((dataId, refs[dataId].ref) for dataId in dataIds)
335 def isdisjoint(self, other: _DatasetDict) -> bool:
336 """Test whether ``self`` and ``other`` have any datasets in common.
338 Datasets are considered in common if they have the same *parent*
339 dataset type name and data ID; storage classes and components are not
340 considered.
341 """
342 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()}
343 for k, v in other.items():
344 parent_name, _ = k.nameAndComponent()
345 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()):
346 return False
347 return True
349 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
350 """Iterate over all DatasetRef instances held by this data structure,
351 assuming that each `_RefHolder` already carries a resolved ref.
352 """
353 for holders_by_data_id in self.values():
354 for holder in holders_by_data_id.values():
355 yield holder.resolved_ref
358class _QuantumScaffolding:
359 """Helper class aggregating information about a `Quantum`, used when
360 constructing a `QuantumGraph`.
362 See `_PipelineScaffolding` for a top-down description of the full
363 scaffolding data structure.
365 Parameters
366 ----------
367 task : _TaskScaffolding
368 Back-reference to the helper object for the `PipelineTask` this quantum
369 represents an execution of.
370 dataId : `~lsst.daf.butler.DataCoordinate`
371 Data ID for this quantum.
372 """
374 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
375 self.task = task
376 self.dataId = dataId
377 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
378 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
379 self.prerequisites = _DatasetDict.fromDatasetTypes(
380 task.prerequisites.keys(), universe=dataId.universe
381 )
383 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
385 def __repr__(self) -> str:
386 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
388 task: _TaskScaffolding
389 """Back-reference to the helper object for the `PipelineTask` this quantum
390 represents an execution of.
391 """
393 dataId: DataCoordinate
394 """Data ID for this quantum.
395 """
397 inputs: _DatasetDict
398 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to
399 this quantum.
401 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty
402 dictionary at construction. Those nested dictionaries are populated
403 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef`
404 instances in `_PipelineScaffolding.connectDataIds`.
405 """
407 outputs: _DatasetDict
408 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this
409 quantum.
410 """
412 prerequisites: _DatasetDict
413 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite
414 inputs to this quantum.
415 """
417 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum:
418 """Transform the scaffolding object into a true `Quantum` instance.
420 Parameters
421 ----------
422 datastore_records : `~collections.abc.Mapping` [ `str`, \
423 `~lsst.daf.butler.DatastoreRecordData` ], optional
424 If not `None` then fill datastore records in each generated Quantum
425 using the records from this structure.
427 Returns
428 -------
429 quantum : `Quantum`
430 An actual `Quantum` instance.
431 """
432 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes)
433 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes))
434 # Give the task's Connections class an opportunity to remove some
435 # inputs, or complain if they are unacceptable.
436 # This will raise if one of the check conditions is not met, which is
437 # the intended behavior.
438 # If it raises NoWorkFound, there is a bug in the QG algorithm
439 # or the adjustQuantum is incorrectly trying to make a prerequisite
440 # input behave like a regular input; adjustQuantum should only raise
441 # NoWorkFound if a regular input is missing, and it shouldn't be
442 # possible for us to have generated ``self`` if that's true.
443 helper = AdjustQuantumHelper(
444 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes)
445 )
446 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
447 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes)
448 quantum_records: Mapping[str, DatastoreRecordData] | None = None
449 if datastore_records is not None:
450 quantum_records = {}
451 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
452 input_refs += list(initInputs.values())
453 input_ids = set(ref.id for ref in input_refs)
454 for datastore_name, records in datastore_records.items():
455 matching_records = records.subset(input_ids)
456 if matching_records is not None:
457 quantum_records[datastore_name] = matching_records
458 return Quantum(
459 taskName=self.task.taskDef.taskName,
460 taskClass=self.task.taskDef.taskClass,
461 dataId=self.dataId,
462 initInputs=initInputs,
463 inputs=helper.inputs,
464 outputs=helper.outputs,
465 datastore_records=quantum_records,
466 )
469@dataclass
470class _TaskScaffolding:
471 """Helper class aggregating information about a `PipelineTask`, used when
472 constructing a `QuantumGraph`.
474 See `_PipelineScaffolding` for a top-down description of the full
475 scaffolding data structure.
477 Parameters
478 ----------
479 taskDef : `TaskDef`
480 Data structure that identifies the task class and its config.
481 parent : `_PipelineScaffolding`
482 The parent data structure that will hold the instance being
483 constructed.
484 datasetTypes : `TaskDatasetTypes`
485 Data structure that categorizes the dataset types used by this task.
486 """
488 def __init__(
489 self,
490 taskDef: TaskDef,
491 parent: _PipelineScaffolding,
492 datasetTypes: TaskDatasetTypes,
493 ):
494 universe = parent.dimensions.universe
495 self.taskDef = taskDef
496 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
497 assert self.dimensions.issubset(parent.dimensions)
498 # Initialize _DatasetDicts as subsets of the one or two
499 # corresponding dicts in the parent _PipelineScaffolding.
500 self.initInputs = _DatasetDict.fromSubset(
501 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
502 )
503 self.initOutputs = _DatasetDict.fromSubset(
504 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
505 )
506 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
507 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
508 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
509 self.dataIds: set[DataCoordinate] = set()
510 self.quanta = {}
511 self.storage_classes = {
512 connection.name: connection.storageClass
513 for connection in self.taskDef.connections.allConnections.values()
514 }
515 self.storage_classes[
516 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
517 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS
518 self.storage_classes[
519 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
520 ] = acc.LOG_OUTPUT_STORAGE_CLASS
521 self.storage_classes[
522 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
523 ] = acc.METADATA_OUTPUT_STORAGE_CLASS
525 def __repr__(self) -> str:
526 # Default dataclass-injected __repr__ gets caught in an infinite loop
527 # because of back-references.
528 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
530 taskDef: TaskDef
531 """Data structure that identifies the task class and its config
532 (`TaskDef`).
533 """
535 dimensions: DimensionGraph
536 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
537 """
539 initInputs: _DatasetDict
540 """Dictionary containing information about datasets used to construct this
541 task (`_DatasetDict`).
542 """
544 initOutputs: _DatasetDict
545 """Dictionary containing information about datasets produced as a
546 side-effect of constructing this task (`_DatasetDict`).
547 """
549 inputs: _DatasetDict
550 """Dictionary containing information about datasets used as regular,
551 graph-constraining inputs to this task (`_DatasetDict`).
552 """
554 outputs: _DatasetDict
555 """Dictionary containing information about datasets produced by this task
556 (`_DatasetDict`).
557 """
559 prerequisites: _DatasetDict
560 """Dictionary containing information about input datasets that must be
561 present in the repository before any Pipeline containing this task is run
562 (`_DatasetDict`).
563 """
565 quanta: dict[DataCoordinate, _QuantumScaffolding]
566 """Dictionary mapping data ID to a scaffolding object for the Quantum of
567 this task with that data ID.
568 """
570 storage_classes: dict[str, str]
571 """Mapping from dataset type name to storage class declared by this task.
572 """
574 def makeQuantumSet(
575 self,
576 missing: _DatasetDict,
577 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
578 ) -> set[Quantum]:
579 """Create a `set` of `Quantum` from the information in ``self``.
581 Parameters
582 ----------
583 missing : `_DatasetDict`
584 Input datasets that have not been found.
585 datastore_records : `dict`, optional
586 Records from the datastore to export with quanta.
588 Returns
589 -------
590 nodes : `set` of `Quantum`
591 The `Quantum` elements corresponding to this task.
592 """
593 outputs = set()
594 for q in self.quanta.values():
595 try:
596 tmpQuanta = q.makeQuantum(datastore_records)
597 outputs.add(tmpQuanta)
598 except (NoWorkFound, FileNotFoundError) as exc:
599 if not missing.isdisjoint(q.inputs):
600 # This is a node that is known to be pruned later and
601 # should be left in even though some follow up queries
602 # fail. This allows the pruning to start from this quantum
603 # with known issues, and prune other nodes it touches.
604 inputs = q.inputs.unpackMultiRefs(self.storage_classes)
605 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes))
606 tmpQuantum = Quantum(
607 taskName=q.task.taskDef.taskName,
608 taskClass=q.task.taskDef.taskClass,
609 dataId=q.dataId,
610 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes),
611 inputs=inputs,
612 outputs=q.outputs.unpackMultiRefs(self.storage_classes),
613 )
614 outputs.add(tmpQuantum)
615 else:
616 raise exc
617 return outputs
620class _DatasetIdMaker:
621 """Helper class which generates random dataset UUIDs for unresolved
622 datasets.
623 """
625 def __init__(self, run: str):
626 self.run = run
627 # Cache of dataset refs generated so far.
628 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {}
630 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef:
631 # For components we need their parent dataset ID.
632 if dataset_type.isComponent():
633 parent_type = dataset_type.makeCompositeDatasetType()
634 # Parent should be resolved if this is an existing input, or it
635 # should be in the cache already if it is an intermediate.
636 key = parent_type, data_id
637 if key not in self.resolved:
638 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}")
639 parent_ref = self.resolved[key]
640 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False)
642 key = dataset_type, data_id
643 if (resolved := self.resolved.get(key)) is None:
644 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False)
645 self.resolved[key] = resolved
646 return resolved
648 def resolveDict(
649 self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder], is_output: bool
650 ) -> None:
651 """Resolve all unresolved references in the provided dictionary."""
652 for data_id, holder in refs.items():
653 if holder.ref is None or (is_output and holder.ref.run != self.run):
654 holder.ref = self.resolveRef(holder.dataset_type, data_id)
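# Illustrative sketch (added commentary, not part of the original module):
# _DatasetIdMaker caches one resolved DatasetRef per (dataset type, data ID)
# pair, so an intermediate produced by one task and consumed by another is
# assigned a single dataset ID; component refs reuse the ID of their
# composite parent. ``dataset_type`` and ``data_id`` are hypothetical.
#
#     maker = _DatasetIdMaker(run="u/example/run")
#     ref1 = maker.resolveRef(dataset_type, data_id)
#     ref2 = maker.resolveRef(dataset_type, data_id)
#     assert ref1 is ref2        # second call returns the cached resolution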
657@dataclass
658class _PipelineScaffolding:
659 """A helper data structure that organizes the information involved in
660 constructing a `QuantumGraph` for a `Pipeline`.
662 Parameters
663 ----------
664 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
665 Sequence of tasks from which a graph is to be constructed. Must
666 have nested task classes already imported.
667 universe : `~lsst.daf.butler.DimensionUniverse`
668 Universe of all possible dimensions.
670 Notes
671 -----
672 The scaffolding data structure contains nested data structures for both
673 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
674 data structures are shared between the pipeline-level structure (which
675 aggregates all datasets and categorizes them from the perspective of the
676 complete pipeline) and the individual tasks that use them as inputs and
677 outputs.
679 `QuantumGraph` construction proceeds in four steps, with each corresponding
680 to a different `_PipelineScaffolding` method:
682 1. When `_PipelineScaffolding` is constructed, we extract and categorize
683 the DatasetTypes used by the pipeline (delegating to
684 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
685 nested `_TaskScaffolding` and `_DatasetDict` objects.
687 2. In `connectDataIds`, we construct and run the "Big Join Query", which
688 returns related tuples of all dimensions used to identify any regular
689 input, output, and intermediate datasets (not prerequisites). We then
690 iterate over these tuples of related dimensions, identifying the subsets
691 that correspond to distinct data IDs for each task and dataset type,
692 and then create `_QuantumScaffolding` objects.
694 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
695 dataset data IDs previously identified, transforming unresolved
696 DatasetRefs into resolved DatasetRefs where appropriate. We then look
697 up prerequisite datasets for all quanta.
699 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
700 per-task `_QuantumScaffolding` objects.
701 """
703 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry):
704 _LOG.debug("Initializing data structures for QuantumGraph generation.")
705 self.tasks = []
706 # Aggregate and categorize the DatasetTypes in the Pipeline.
707 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
708 # Construct dictionaries that map those DatasetTypes to structures
709 # that will (later) hold additional information about them.
710 for attr in (
711 "initInputs",
712 "initIntermediates",
713 "initOutputs",
714 "inputs",
715 "intermediates",
716 "outputs",
717 "prerequisites",
718 ):
719 setattr(
720 self,
721 attr,
722 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
723 )
724 self.missing = _DatasetDict(universe=registry.dimensions)
725 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
726 # Aggregate all dimensions for all non-init, non-prerequisite
727 # DatasetTypes. These are the ones we'll include in the big join
728 # query.
729 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
730 # Construct scaffolding nodes for each Task, and add backreferences
731 # to the Task from each DatasetScaffolding node.
732 # Note that there's only one scaffolding node for each DatasetType,
733 # shared by _PipelineScaffolding and all _TaskScaffoldings that
734 # reference it.
735 if isinstance(pipeline, Pipeline):
736 pipeline = pipeline.toExpandedPipeline()
737 self.tasks = [
738 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
739 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
740 ]
742 def __repr__(self) -> str:
743 # Default dataclass-injected __repr__ gets caught in an infinite loop
744 # because of back-references.
745 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
747 tasks: list[_TaskScaffolding]
748 """Scaffolding data structures for each task in the pipeline
749 (`list` of `_TaskScaffolding`).
750 """
752 initInputs: _DatasetDict
753 """Datasets consumed but not produced when constructing the tasks in this
754 pipeline (`_DatasetDict`).
755 """
757 initIntermediates: _DatasetDict
758 """Datasets that are both consumed and produced when constructing the tasks
759 in this pipeline (`_DatasetDict`).
760 """
762 initOutputs: _DatasetDict
763 """Datasets produced but not consumed when constructing the tasks in this
764 pipeline (`_DatasetDict`).
765 """
767 inputs: _DatasetDict
768 """Datasets that are consumed but not produced when running this pipeline
769 (`_DatasetDict`).
770 """
772 intermediates: _DatasetDict
773 """Datasets that are both produced and consumed when running this pipeline
774 (`_DatasetDict`).
775 """
777 outputs: _DatasetDict
778 """Datasets produced but not consumed when when running this pipeline
779 (`_DatasetDict`).
780 """
782 prerequisites: _DatasetDict
783 """Datasets that are consumed when running this pipeline and looked up
784 per-Quantum when generating the graph (`_DatasetDict`).
785 """
787 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
788 """Datasets that should be used as constraints in the initial query,
789 according to tasks (`~lsst.daf.butler.NamedValueSet`).
790 """
792 dimensions: DimensionGraph
793 """All dimensions used by any regular input, intermediate, or output
794 (not prerequisite) dataset; the set of dimensions used in the "Big Join
795 Query" (`~lsst.daf.butler.DimensionGraph`).
797 This is required to be a superset of all task quantum dimensions.
798 """
800 missing: _DatasetDict
801 """Datasets whose existence was originally predicted but were not
802 actually found.
804 Quanta that require these datasets as inputs will be pruned (recursively)
805 when actually constructing a `QuantumGraph` object.
807 These are currently populated only when the "initial dataset query
808 constraint" does not include all overall-input dataset types, and hence the
809 initial data ID query can include data IDs that it should not.
810 """
812 globalInitOutputs: _DatasetDict | None = None
813 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
814 """
816 @contextmanager
817 def connectDataIds(
818 self,
819 registry: Registry,
820 collections: Any,
821 userQuery: str | None,
822 externalDataId: DataCoordinate,
823 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
824 bind: Mapping[str, Any] | None = None,
825 ) -> Iterator[DataCoordinateQueryResults]:
826 """Query for the data IDs that connect nodes in the `QuantumGraph`.
828 This method populates `_TaskScaffolding.dataIds` and
829 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
831 Parameters
832 ----------
833 registry : `lsst.daf.butler.Registry`
834 Registry for the data repository; used for all data ID queries.
835 collections
836 Expressions representing the collections to search for input
837 datasets. See :ref:`daf_butler_ordered_collection_searches`.
838 userQuery : `str` or `None`
839 User-provided expression to limit the data IDs processed.
840 externalDataId : `~lsst.daf.butler.DataCoordinate`
841 Externally-provided data ID that should be used to restrict the
842 results, just as if these constraints had been included via ``AND``
843 in ``userQuery``. This includes (at least) any instrument named
844 in the pipeline definition.
845 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
846 The query constraint variant that should be used to constrain the
847 query based on dataset existence; defaults to
848 `DatasetQueryConstraintVariant.ALL`.
849 bind : `~collections.abc.Mapping`, optional
850 Mapping containing literal values that should be injected into the
851 ``userQuery`` expression, keyed by the identifiers they replace.
853 Returns
854 -------
855 commonDataIds : \
856 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
857 An interface to a database temporary table containing all data IDs
858 that will appear in this `QuantumGraph`. Returned inside a
859 context manager, which will drop the temporary table at the end of
860 the `with` block in which this method is called.
861 """
862 _LOG.debug("Building query for data IDs.")
863 # Initialization datasets always have empty data IDs.
864 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
865 for datasetType, refs in itertools.chain(
866 self.initInputs.items(),
867 self.initIntermediates.items(),
868 self.initOutputs.items(),
869 ):
870 refs[emptyDataId] = _RefHolder(datasetType)
871 # Run one big query for the data IDs for task dimensions and regular
872 # inputs and outputs. We limit the query to only dimensions that are
873 # associated with the input dataset types, but don't (yet) try to
874 # obtain the dataset_ids for those inputs.
875 _LOG.debug(
876 "Submitting data ID query over dimensions %s and materializing results.",
877 list(self.dimensions.names),
878 )
879 queryArgs: dict[str, Any] = {
880 "dimensions": self.dimensions,
881 "where": userQuery,
882 "dataId": externalDataId,
883 "bind": bind,
884 }
885 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
886 _LOG.debug(
887 "Constraining graph query using default of %s.",
888 list(self.defaultDatasetQueryConstraints.names),
889 )
890 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
891 queryArgs["collections"] = collections
892 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
893 _LOG.debug("Not using dataset existence to constrain query.")
894 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
895 constraint = set(datasetQueryConstraint)
896 inputs = {k.name: k for k in self.inputs.keys()}
897 if remainder := constraint.difference(inputs.keys()):
898 raise ValueError(
899 f"{remainder} dataset type(s) specified as a graph constraint, but"
900 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
901 )
902 _LOG.debug(f"Constraining graph query using {constraint}")
903 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
904 queryArgs["collections"] = collections
905 else:
906 raise ValueError(
907 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
908 )
910 if "datasets" in queryArgs:
911 for i, dataset_type in enumerate(queryArgs["datasets"]):
912 if dataset_type.isComponent():
913 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
915 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
916 _LOG.debug("Expanding data IDs.")
917 commonDataIds = commonDataIds.expanded()
918 _LOG.debug("Iterating over query results to associate quanta with datasets.")
919 # Iterate over query results, populating data IDs for datasets and
920 # quanta and then connecting them to each other.
921 n = -1
922 for n, commonDataId in enumerate(commonDataIds):
923 # Create DatasetRefs for all DatasetTypes from this result row,
924 # noting that we might have created some already.
925 # We remember both those that already existed and those that we
926 # create now.
927 refsForRow = {}
928 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {}
929 for datasetType, refs in itertools.chain(
930 self.inputs.items(),
931 self.intermediates.items(),
932 self.outputs.items(),
933 ):
934 datasetDataId: DataCoordinate | None
935 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
936 datasetDataId = commonDataId.subset(datasetType.dimensions)
937 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
938 ref_holder = refs.get(datasetDataId)
939 if ref_holder is None:
940 ref_holder = _RefHolder(datasetType)
941 refs[datasetDataId] = ref_holder
942 refsForRow[datasetType.name] = ref_holder
943 # Create _QuantumScaffolding objects for all tasks from this
944 # result row, noting that we might have created some already.
945 for task in self.tasks:
946 quantumDataId = commonDataId.subset(task.dimensions)
947 quantum = task.quanta.get(quantumDataId)
948 if quantum is None:
949 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
950 task.quanta[quantumDataId] = quantum
951 # Whether this is a new quantum or an existing one, we can
952 # now associate the DatasetRefs for this row with it. The
953 # fact that a Quantum data ID and a dataset data ID both
954 # came from the same result row is what tells us they
955 # should be associated.
956 # Many of these associations will be duplicates (because
957 # another query row that differed from this one only in
958 # irrelevant dimensions already added them), and we use
959 # sets to skip them.
960 for datasetType in task.inputs:
961 dataId = dataIdCacheForRow[datasetType.dimensions]
962 ref_holder = refsForRow[datasetType.name]
963 quantum.inputs[datasetType.name][dataId] = ref_holder
964 for datasetType in task.outputs:
965 dataId = dataIdCacheForRow[datasetType.dimensions]
966 ref_holder = refsForRow[datasetType.name]
967 quantum.outputs[datasetType.name][dataId] = ref_holder
968 if n < 0:
969 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
970 emptiness_explained = False
971 for message in commonDataIds.explain_no_results():
972 _LOG.critical(message)
973 emptiness_explained = True
974 if not emptiness_explained:
975 _LOG.critical(
976 "To reproduce this query for debugging purposes, run "
977 "Registry.queryDataIds with these arguments:"
978 )
979 # We could just repr() the queryArgs dict to get something
980 # the user could make sense of, but it's friendlier to
981 # put these args in an easier-to-construct equivalent form
982 # so they can read it more easily and copy and paste into
983 # a Python terminal.
984 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
985 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
986 if queryArgs["where"]:
987 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
988 if "datasets" in queryArgs:
989 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
990 if "collections" in queryArgs:
991 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
992 _LOG.debug("Finished processing %d rows from data ID query.", n)
993 yield commonDataIds
995 def resolveDatasetRefs(
996 self,
997 registry: Registry,
998 collections: Any,
999 run: str,
1000 commonDataIds: DataCoordinateQueryResults,
1001 *,
1002 skipExistingIn: Any = None,
1003 clobberOutputs: bool = True,
1004 constrainedByAllDatasets: bool = True,
1005 ) -> None:
1006 """Perform follow up queries for each dataset data ID produced in
1007 `fillDataIds`.
1009 This method populates `_DatasetScaffolding.refs` (except for those in
1010 `prerequisites`).
1012 Parameters
1013 ----------
1014 registry : `lsst.daf.butler.Registry`
1015 Registry for the data repository; used for all data ID queries.
1016 collections
1017 Expressions representing the collections to search for input
1018 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1019 run : `str`
1020 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1021 output datasets, if it already exists.
1022 commonDataIds : \
1023 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
1024 Result of a previous call to `connectDataIds`.
1025 skipExistingIn
1026 Expressions representing the collections to search for existing
1027 output datasets that should be skipped. See
1028 :ref:`daf_butler_ordered_collection_searches` for allowed types.
1029 `None` or empty string/sequence disables skipping.
1030 clobberOutputs : `bool`, optional
1031 If `True` (default), allow quanta to be created even if outputs exist;
1032 this requires the same behavior to be enabled when
1033 executing. If ``skipExistingIn`` is not `None`, completed quanta
1034 (those with metadata, or all outputs if there is no metadata
1035 dataset configured) will be skipped rather than clobbered.
1036 constrainedByAllDatasets : `bool`, optional
1037 Indicates if the commonDataIds were generated with a constraint on
1038 all dataset types.
1040 Raises
1041 ------
1042 OutputExistsError
1043 Raised if an output dataset already exists in the output run
1044 and ``skipExistingIn`` does not include output run, or if only
1045 some outputs are present and ``clobberOutputs`` is `False`.
1046 """
1047 # Run may be provided but it does not have to exist; in that case we
1048 # use it for resolving references but don't check it for existing refs.
1049 run_exists = False
1050 if run:
1051 try:
1052 run_exists = bool(registry.queryCollections(run))
1053 except MissingCollectionError:
1054 # An undocumented exception is raised if it does not exist.
1055 pass
1057 skip_collections_wildcard: CollectionWildcard | None = None
1058 skipExistingInRun = False
1059 if skipExistingIn:
1060 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
1061 if run_exists:
1062 # As an optimization, check the explicit list of names first.
1063 skipExistingInRun = run in skip_collections_wildcard.strings
1064 if not skipExistingInRun:
1065 # need to flatten it and check again
1066 skipExistingInRun = run in registry.queryCollections(
1067 skipExistingIn,
1068 collectionTypes=CollectionType.RUN,
1069 )
1071 idMaker = _DatasetIdMaker(run)
1073 resolvedRefQueryResults: Iterable[DatasetRef]
1075 # Updating constrainedByAllDatasets here is not ideal, but we have a
1076 # few different code paths that each transfer different pieces of
1077 # information about what dataset query constraints were applied here,
1078 # and none of them has the complete picture until we get here. We're
1079 # long overdue for a QG generation rewrite that will make this go away
1080 # entirely anyway.
1081 constrainedByAllDatasets = (
1082 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
1083 )
1085 # Look up [init] intermediate and output datasets in the output
1086 # collection, if there is an output collection.
1087 if run_exists or skip_collections_wildcard is not None:
1088 for datasetType, refs in itertools.chain(
1089 self.initIntermediates.items(),
1090 self.initOutputs.items(),
1091 self.intermediates.items(),
1092 self.outputs.items(),
1093 ):
1094 _LOG.debug(
1095 "Resolving %d datasets for intermediate and/or output dataset %s.",
1096 len(refs),
1097 datasetType.name,
1098 )
1099 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
1100 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
1101 # TODO: this assert incorrectly bans component inputs;
1102 # investigate on DM-33027.
1103 # assert not datasetType.isComponent(), \
1104 # "Output datasets cannot be components."
1105 #
1106 # Instead we have to handle them manually to avoid a
1107 # deprecation warning, but it is at least confusing and
1108 # possibly a bug for components to appear here at all.
1109 if datasetType.isComponent():
1110 parent_dataset_type = datasetType.makeCompositeDatasetType()
1111 component = datasetType.component()
1112 else:
1113 parent_dataset_type = datasetType
1114 component = None
1116 # look at RUN collection first
1117 if run_exists:
1118 try:
1119 resolvedRefQueryResults = subset.findDatasets(
1120 parent_dataset_type, collections=run, findFirst=True
1121 )
1122 except MissingDatasetTypeError:
1123 resolvedRefQueryResults = []
1124 for resolvedRef in resolvedRefQueryResults:
1125 # TODO: we could easily support per-DatasetType
1126 # skipExisting and I could imagine that being useful -
1127 # it's probably required in order to support writing
1128 # initOutputs before QuantumGraph generation.
1129 assert resolvedRef.dataId in refs
1130 if not (skipExistingInRun or isInit or clobberOutputs):
1131 raise OutputExistsError(
1132 f"Output dataset {datasetType.name} already exists in "
1133 f"output RUN collection '{run}' with data ID"
1134 f" {resolvedRef.dataId}."
1135 )
1136 # To resolve all outputs we have to remember existing
1137 # ones to avoid generating new dataset IDs for them.
1138 refs[resolvedRef.dataId].ref = (
1139 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1140 )
1142 # Also check skipExistingIn; the case where the RUN collection
1143 # is included in it is handled above.
1144 if skip_collections_wildcard is not None:
1145 try:
1146 resolvedRefQueryResults = subset.findDatasets(
1147 parent_dataset_type,
1148 collections=skip_collections_wildcard,
1149 findFirst=True,
1150 )
1151 except MissingDatasetTypeError:
1152 resolvedRefQueryResults = []
1153 for resolvedRef in resolvedRefQueryResults:
1154 if resolvedRef.dataId not in refs:
1155 continue
1156 refs[resolvedRef.dataId].ref = (
1157 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1158 )
1160 # Look up input and initInput datasets in the input collection(s). We
1161 # accumulate datasets in self.missing, if the common data IDs were not
1162 # constrained on dataset type existence.
1163 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1164 _LOG.debug(
1165 "Resolving %d datasets for input dataset %s.",
1166 len(refs),
1167 datasetType.name,
1168 )
1169 if datasetType.isComponent():
1170 parent_dataset_type = datasetType.makeCompositeDatasetType()
1171 component = datasetType.component()
1172 else:
1173 parent_dataset_type = datasetType
1174 component = None
1175 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {}
1176 try:
1177 resolvedRefQueryResults = commonDataIds.subset(
1178 datasetType.dimensions, unique=True
1179 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1180 except MissingDatasetTypeError:
1181 resolvedRefQueryResults = []
1182 dataIdsNotFoundYet = set(refs.keys())
1183 for resolvedRef in resolvedRefQueryResults:
1184 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1185 if resolvedRef.dataId not in refs:
1186 continue
1187 refs[resolvedRef.dataId].ref = (
1188 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1189 )
1190 if dataIdsNotFoundYet:
1191 if constrainedByAllDatasets:
1192 raise RuntimeError(
1193 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1194 f"'{datasetType.name}' was/were present in a previous "
1195 "query, but could not be found now. "
1196 "This is either a logic bug in QuantumGraph generation "
1197 "or the input collections have been modified since "
1198 "QuantumGraph generation began."
1199 )
1200 elif not datasetType.dimensions:
1201 raise RuntimeError(
1202 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1203 f"collections {collections}."
1204 )
1205 else:
1206 # If the common dataIds were not constrained using all the
1207 # input dataset types, it is possible that some data ids
1208 # found don't correspond to existing datasets. Mark these
1209 # for later pruning from the quantum graph.
1210 for k in dataIdsNotFoundYet:
1211 missing_for_dataset_type[k] = refs[k]
1212 if missing_for_dataset_type:
1213 self.missing[datasetType] = missing_for_dataset_type
1215 # Resolve the missing refs, just so they look like all of the others;
1216 # in the end other code will make sure they never appear in the QG.
1217 for dataset_type, refDict in self.missing.items():
1218 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1220 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1221 # replacing the unresolved refs there, and then look up prerequisites.
1222 for task in self.tasks:
1223 _LOG.debug(
1224 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1225 len(task.quanta),
1226 task.taskDef.label,
1227 )
1228 # The way iterConnections is designed makes it impossible to
1229 # annotate precisely enough to satisfy MyPy here.
1230 lookupFunctions = {
1231 c.name: c.lookupFunction # type: ignore
1232 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1233 if c.lookupFunction is not None # type: ignore
1234 }
1235 dataIdsFailed = []
1236 dataIdsSucceeded = []
1237 for quantum in task.quanta.values():
1238 # Process output datasets only if skipExistingIn is not None
1239 # or there is a run to look for outputs in and clobberOutputs
1240 # is True. Note that if skipExistingIn is None, any output
1241 # datasets that already exist would have already caused an
1242 # exception to be raised.
1243 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1244 resolvedRefs = []
1245 unresolvedDataIds = []
1246 haveMetadata = False
1247 for datasetType, originalRefs in quantum.outputs.items():
1248 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()):
1249 if ref is not None:
1250 resolvedRefs.append(ref)
1251 originalRefs[dataId].ref = ref
1252 if datasetType.name == task.taskDef.metadataDatasetName:
1253 haveMetadata = True
1254 else:
1255 unresolvedDataIds.append((datasetType, dataId))
1256 if resolvedRefs:
1257 if haveMetadata or not unresolvedDataIds:
1258 dataIdsSucceeded.append(quantum.dataId)
1259 if skip_collections_wildcard is not None:
1260 continue
1261 else:
1262 dataIdsFailed.append(quantum.dataId)
1263 if not clobberOutputs and run_exists:
1264 raise OutputExistsError(
1265 f"Quantum {quantum.dataId} of task with label "
1266 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1267 f"({resolvedRefs}) "
1268 f"and others that don't ({unresolvedDataIds}), with no metadata output, "
1269 "and clobbering outputs was not enabled."
1270 )
1271 # Update the input DatasetRefs to the resolved ones we already
1272 # searched for.
1273 for datasetType, input_refs in quantum.inputs.items():
1274 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()):
1275 input_refs[data_id].ref = ref
1276 # Look up prerequisite datasets in the input collection(s).
1277 # These may have dimensions that extend beyond those we queried
1278 # for originally, because we want to permit those data ID
1279 # values to differ across quanta and dataset types.
1280 for datasetType in task.prerequisites:
1281 if datasetType.isComponent():
1282 parent_dataset_type = datasetType.makeCompositeDatasetType()
1283 component = datasetType.component()
1284 else:
1285 parent_dataset_type = datasetType
1286 component = None
1287 lookupFunction = lookupFunctions.get(datasetType.name)
1288 if lookupFunction is not None:
1289 # PipelineTask has provided its own function to do the
1290 # lookup. This always takes precedence.
1291 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1292 elif (
1293 datasetType.isCalibration()
1294 and datasetType.dimensions <= quantum.dataId.graph
1295 and quantum.dataId.graph.temporal
1296 ):
1297 # This is a master calibration lookup, which we have to
1298 # handle specially because the query system can't do a
1299 # temporal join on a non-dimension-based timespan yet.
1300 timespan = quantum.dataId.timespan
1301 try:
1302 prereq_ref = registry.findDataset(
1303 parent_dataset_type,
1304 quantum.dataId,
1305 collections=collections,
1306 timespan=timespan,
1307 )
1308 if prereq_ref is not None:
1309 if component is not None:
1310 prereq_ref = prereq_ref.makeComponentRef(component)
1311 prereq_refs = [prereq_ref]
1312 else:
1313 prereq_refs = []
1314 except (KeyError, MissingDatasetTypeError):
1315 # This dataset type is not present in the registry,
1316 # which just means there are no datasets here.
1317 prereq_refs = []
1318 else:
1319 # Most general case.
1320 prereq_refs = [
1321 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1322 for prereq_ref in registry.queryDatasets(
1323 parent_dataset_type,
1324 collections=collections,
1325 dataId=quantum.dataId,
1326 findFirst=True,
1327 ).expanded()
1328 ]
1330 for ref in prereq_refs:
1331 if ref is not None:
1332 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1333 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1335 # Resolve all quantum inputs and outputs.
1336 for dataset_type, refDict in quantum.inputs.items():
1337 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1338 for dataset_type, refDict in quantum.outputs.items():
1339 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1341 # Resolve task initInputs and initOutputs.
1342 for dataset_type, refDict in task.initInputs.items():
1343 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1344 for dataset_type, refDict in task.initOutputs.items():
1345 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1347 # Actually remove any quanta that we decided to skip above.
1348 if dataIdsSucceeded:
1349 if skip_collections_wildcard is not None:
1350 _LOG.debug(
1351 "Pruning successful %d quanta for task with label '%s' because all of their "
1352 "outputs exist or metadata was written successfully.",
1353 len(dataIdsSucceeded),
1354 task.taskDef.label,
1355 )
1356 for dataId in dataIdsSucceeded:
1357 del task.quanta[dataId]
1358 elif clobberOutputs and run_exists:
1359 _LOG.info(
1360 "Found %d successful quanta for task with label '%s' "
1361 "that will need to be clobbered during execution.",
1362 len(dataIdsSucceeded),
1363 task.taskDef.label,
1364 )
1365 if dataIdsFailed:
1366 if clobberOutputs and run_exists:
1367 _LOG.info(
1368 "Found %d failed/incomplete quanta for task with label '%s' "
1369 "that will need to be clobbered during execution.",
1370 len(dataIdsFailed),
1371 task.taskDef.label,
1372 )
1374 # Collect initOutputs that do not belong to any task.
1375 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1376 for task in self.tasks:
1377 global_dataset_types -= set(task.initOutputs)
1378 if global_dataset_types:
1379 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1380 for dataset_type, refDict in self.globalInitOutputs.items():
1381 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1383 def makeQuantumGraph(
1384 self,
1385 registry: Registry,
1386 metadata: Mapping[str, Any] | None = None,
1387 datastore: Datastore | None = None,
1388 ) -> QuantumGraph:
1389 """Create a `QuantumGraph` from the quanta already present in
1390 the scaffolding data structure.
1392 Parameters
1393 ----------
1394 registry : `lsst.daf.butler.Registry`
1395 Registry for the data repository; used for all data ID queries.
1396 metadata : `~collections.abc.Mapping` of `str` to primitives, optional
1397 This is an optional parameter of extra data to carry with the
1398 graph. Entries in this mapping should be able to be serialized in
1399 JSON.
1400 datastore : `~lsst.daf.butler.Datastore`, optional
1401 If not `None` then fill datastore records in each generated
1402 Quantum.
1404 Returns
1405 -------
1406 graph : `QuantumGraph`
1407 The full `QuantumGraph`.
1408 """
1410 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1411 """Extract all DatasetRefs from the dictionaries"""
1412 for ref_dict in dataset_dict.values():
1413 for holder in ref_dict.values():
1414 yield holder.resolved_ref
1416 datastore_records: Mapping[str, DatastoreRecordData] | None = None
1417 if datastore is not None:
1418 datastore_records = datastore.export_records(
1419 itertools.chain(
1420 _make_refs(self.inputs),
1421 _make_refs(self.initInputs),
1422 _make_refs(self.prerequisites),
1423 )
1424 )
1426 graphInput: dict[TaskDef, set[Quantum]] = {}
1427 for task in self.tasks:
1428 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records)
1429 graphInput[task.taskDef] = qset
1431 taskInitInputs = {
1432 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values()
1433 for task in self.tasks
1434 }
1435 taskInitOutputs = {
1436 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values()
1437 for task in self.tasks
1438 }
1440 globalInitOutputs: list[DatasetRef] = []
1441 if self.globalInitOutputs is not None:
1442 for refs_dict in self.globalInitOutputs.values():
1443 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values())
1445 graph = QuantumGraph(
1446 graphInput,
1447 metadata=metadata,
1448 pruneRefs=list(self.missing.iter_resolved_refs()),
1449 universe=self.dimensions.universe,
1450 initInputs=taskInitInputs,
1451 initOutputs=taskInitOutputs,
1452 globalInitOutputs=globalInitOutputs,
1453 registryDatasetTypes=self._get_registry_dataset_types(registry),
1454 )
1455 return graph
1457 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1458 """Make a list of all dataset types used by a graph as defined in
1459 registry.
1460 """
1461 chain = [
1462 self.initInputs,
1463 self.initIntermediates,
1464 self.initOutputs,
1465 self.inputs,
1466 self.intermediates,
1467 self.outputs,
1468 self.prerequisites,
1469 ]
1470 if self.globalInitOutputs is not None:
1471 chain.append(self.globalInitOutputs)
1473 # Collect names of all dataset types.
1474 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1475 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1477 # Check for types that do not exist in registry yet:
1478 # - inputs must exist
1479 # - intermediates and outputs may not exist, but there must not be
1480 # more than one definition (e.g. differing in storage class)
1481 # - prerequisites may not exist; treat them the same as outputs here
1482 for dstype in itertools.chain(self.initInputs, self.inputs):
1483 if dstype.name not in dataset_types:
1484 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1486 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1487 chain = [
1488 self.initIntermediates,
1489 self.initOutputs,
1490 self.intermediates,
1491 self.outputs,
1492 self.prerequisites,
1493 ]
1494 if self.globalInitOutputs is not None:
1495 chain.append(self.globalInitOutputs)
1496 for dstype in itertools.chain(*chain):
1497 if dstype.name not in dataset_types:
1498 new_outputs[dstype.name].add(dstype)
1499 for name, dstypes in new_outputs.items():
1500 if len(dstypes) > 1:
1501 raise ValueError(
1502 "Pipeline contains multiple definitions for a dataset type "
1503 f"which is not defined in registry yet: {dstypes}"
1504 )
1505 elif len(dstypes) == 1:
1506 dataset_types[name] = dstypes.pop()
1508 return dataset_types.values()
1511# ------------------------
1512# Exported definitions --
1513# ------------------------
1516class GraphBuilderError(Exception):
1517 """Base class for exceptions generated by graph builder."""
1519 pass
1522class OutputExistsError(GraphBuilderError):
1523 """Exception generated when output datasets already exist."""
1525 pass
1528class PrerequisiteMissingError(GraphBuilderError):
1529 """Exception generated when a prerequisite dataset does not exist."""
1531 pass
1534class GraphBuilder:
1535 """GraphBuilder class is responsible for building task execution graph from
1536 a Pipeline.
1538 Parameters
1539 ----------
1540 registry : `~lsst.daf.butler.Registry`
1541 Registry for the data repository.
1542 skipExistingIn
1543 Expressions representing the collections to search for existing
1544 output datasets that should be skipped. See
1545 :ref:`daf_butler_ordered_collection_searches`.
1546 clobberOutputs : `bool`, optional
1547 If `True` (default), allow quanta to be created even if partial outputs
1548 exist; this requires the same behavior to be enabled when
1549 executing.
1550 datastore : `~lsst.daf.butler.Datastore`, optional
1551 If not `None` then fill datastore records in each generated Quantum.
1552 """
1554 def __init__(
1555 self,
1556 registry: Registry,
1557 skipExistingIn: Any = None,
1558 clobberOutputs: bool = True,
1559 datastore: Datastore | None = None,
1560 ):
1561 self.registry = registry
1562 self.dimensions = registry.dimensions
1563 self.skipExistingIn = skipExistingIn
1564 self.clobberOutputs = clobberOutputs
1565 self.datastore = datastore
1567 def makeGraph(
1568 self,
1569 pipeline: Pipeline | Iterable[TaskDef],
1570 collections: Any,
1571 run: str,
1572 userQuery: str | None,
1573 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1574 metadata: Mapping[str, Any] | None = None,
1575 bind: Mapping[str, Any] | None = None,
1576 dataId: DataCoordinate | None = None,
1577 ) -> QuantumGraph:
1578 """Create execution graph for a pipeline.
1580 Parameters
1581 ----------
1582 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
1583 Pipeline definition, task names/classes and their configs.
1584 collections
1585 Expressions representing the collections to search for input
1586 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1587 run : `str`
1588 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1589 output datasets. The collection does not have to exist; it will be
1590 created when the graph is executed.
1591 userQuery : `str` or `None`
1592 String that defines a user-provided selection for the registry; should
1593 be empty or `None` if there are no restrictions on data selection.
1594 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1595 The query constraint variant that should be used to constrain the
1596 query based on dataset existence; defaults to
1597 `DatasetQueryConstraintVariant.ALL`.
1598 metadata : `~collections.abc.Mapping` of `str` to primitives, optional
1599 This is an optional parameter of extra data to carry with the
1600 graph. Entries in this mapping should be able to be serialized in
1601 JSON.
1602 bind : `~collections.abc.Mapping`, optional
1603 Mapping containing literal values that should be injected into the
1604 ``userQuery`` expression, keyed by the identifiers they replace.
1605 dataId : `lsst.daf.butler.DataCoordinate`, optional
1606 Data ID that should also be included in the query constraint.
1608 Returns
1609 -------
1610 graph : `QuantumGraph`
    The constructed execution graph with resolved dataset references.
1612 Raises
1613 ------
1614 UserExpressionError
1615 Raised when user expression cannot be parsed.
1616 OutputExistsError
1617 Raised when output datasets already exist.
1618 Exception
1619 Other exceptions types may be raised by underlying registry
1620 classes.
1621 """
1622 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1623 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1624 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1625 if dataId is None:
1626 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1627 if isinstance(pipeline, Pipeline):
1628 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId)
1629 with scaffolding.connectDataIds(
1630 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1631 ) as commonDataIds:
1632 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1633 scaffolding.resolveDatasetRefs(
1634 self.registry,
1635 collections,
1636 run,
1637 commonDataIds,
1638 skipExistingIn=self.skipExistingIn,
1639 clobberOutputs=self.clobberOutputs,
1640 constrainedByAllDatasets=condition,
1641 )
1642 return scaffolding.makeQuantumGraph(
1643 registry=self.registry, metadata=metadata, datastore=self.datastore
1644 )