Coverage for python/lsst/pipe/base/graphBuilder.py: 16%
548 statements
coverage.py v7.2.7, created at 2023-07-23 08:14 +0000
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Module defining GraphBuilder class and related methods.
23"""
25from __future__ import annotations
27__all__ = ["GraphBuilder"]
29# -------------------------------
30# Imports of standard modules --
31# -------------------------------
32import itertools
33import logging
34from collections import ChainMap, defaultdict
35from collections.abc import Collection, Iterable, Iterator, Mapping
36from contextlib import contextmanager
37from dataclasses import dataclass
38from typing import Any
40from lsst.daf.butler import (
41 CollectionType,
42 DataCoordinate,
43 DatasetRef,
44 DatasetType,
45 Datastore,
46 DatastoreRecordData,
47 DimensionGraph,
48 DimensionUniverse,
49 NamedKeyDict,
50 NamedValueSet,
51 Quantum,
52 Registry,
53)
54from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
55from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
56from lsst.daf.butler.registry.wildcards import CollectionWildcard
58# -----------------------------
59# Imports for other modules --
60# -----------------------------
61from . import automatic_connection_constants as acc
62from ._datasetQueryConstraints import DatasetQueryConstraintVariant
63from ._status import NoWorkFound
64from .connections import AdjustQuantumHelper, iterConnections
65from .graph import QuantumGraph
66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
68# ----------------------------------
69# Local non-exported definitions --
70# ----------------------------------
72_LOG = logging.getLogger(__name__)
75@dataclass
76class _RefHolder:
77 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future
78 resolved reference.
80 As we have eliminated unresolved `~lsst.daf.butler.DatasetRef`\s, we now use
81 `None` to represent a reference that is yet to be resolved. Information
82 about its corresponding dataset type and data coordinate is stored in the
83 `_DatasetDict` mapping.
84 """
86 dataset_type: DatasetType
87 """Dataset type of the dataset to be created later. I need to store it here
88 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared
89 between different compatible dataset types."""
91 ref: DatasetRef | None = None
92 """Dataset reference, initially `None`, created when all datasets are
93 resolved.
94 """
96 @property
97 def resolved_ref(self) -> DatasetRef:
98 """Access resolved reference, should only be called after the
99 reference is set (`~lsst.daf.butler.DatasetRef`).
100 """
101 assert self.ref is not None, "Dataset reference is not set."
102 return self.ref
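# Illustrative sketch (not part of this module): the intended lifecycle of a
# _RefHolder.  A holder starts with only a dataset type; a resolved
# DatasetRef is attached later, and only then is ``resolved_ref`` safe to
# read.  The arguments here are assumptions supplied by a caller.
def _example_ref_holder_lifecycle(
    dataset_type: DatasetType, data_id: DataCoordinate, run: str
) -> DatasetRef:
    holder = _RefHolder(dataset_type)  # unresolved: holder.ref is None
    holder.ref = DatasetRef(dataset_type, data_id, run=run, conform=False)
    return holder.resolved_ref  # the assertion in the property now passes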
105class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]):
106 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested
107 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that
108 type.
110 Parameters
111 ----------
112 args
113 Positional arguments are forwarded to the `dict` constructor.
114 universe : `~lsst.daf.butler.DimensionUniverse`
115 Universe of all possible dimensions.
116 """
118 def __init__(self, *args: Any, universe: DimensionUniverse):
119 super().__init__(*args)
120 self.universe = universe
122 @classmethod
123 def fromDatasetTypes(
124 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
125 ) -> _DatasetDict:
126 """Construct a dictionary from a flat iterable of
127 `~lsst.daf.butler.DatasetType` keys.
129 Parameters
130 ----------
131 datasetTypes : `~collections.abc.Iterable` of \
132 `~lsst.daf.butler.DatasetType`
133 DatasetTypes to use as keys for the dict. Values will be empty
134 dictionaries.
135 universe : `~lsst.daf.butler.DimensionUniverse`
136 Universe of all possible dimensions.
138 Returns
139 -------
140 dictionary : `_DatasetDict`
141 A new `_DatasetDict` instance.
142 """
143 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
145 @classmethod
146 def fromSubset(
147 cls,
148 datasetTypes: Collection[DatasetType],
149 first: _DatasetDict,
150 *rest: _DatasetDict,
151 ) -> _DatasetDict:
152 """Return a new dictionary by extracting items corresponding to the
153 given keys from one or more existing dictionaries.
155 Parameters
156 ----------
157 datasetTypes : `~collections.abc.Iterable` of \
158 `~lsst.daf.butler.DatasetType`
159 DatasetTypes to use as keys for the dict. Values will be obtained
160 by lookups against ``first`` and ``rest``.
161 first : `_DatasetDict`
162 Another dictionary from which to extract values.
163 rest
164 Additional dictionaries from which to extract values.
166 Returns
167 -------
168 dictionary : `_DatasetDict`
169 A new dictionary instance.
170 """
171 combined = ChainMap(first, *rest)
173 # Dataset types known to match immediately can be processed
174 # without checks.
175 matches = combined.keys() & set(datasetTypes)
176 _dict = {k: combined[k] for k in matches}
178 if len(_dict) < len(datasetTypes):
179 # Work out which ones are missing.
180 missing_datasetTypes = set(datasetTypes) - _dict.keys()
182 # Get the known names for comparison.
183 combined_by_name = {k.name: k for k in combined}
185 missing = set()
186 incompatible = {}
187 for datasetType in missing_datasetTypes:
188 # The dataset type is not found. It may not be listed
189 # or it may be that it is there with the same name
190 # but different definition.
191 if datasetType.name in combined_by_name:
192 # This implies some inconsistency in definitions
193 # for connections. If there is support for storage
194 # class conversion we can let it slide.
195 # At this point we do not know
196 # where the inconsistency is but trust that down
197 # stream code will be more explicit about input
198 # vs output incompatibilities.
199 existing = combined_by_name[datasetType.name]
200 convertible_to_existing = existing.is_compatible_with(datasetType)
201 convertible_from_existing = datasetType.is_compatible_with(existing)
202 if convertible_to_existing and convertible_from_existing:
203 _LOG.debug(
204 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
205 datasetType.name,
206 datasetType.storageClass_name,
207 existing.storageClass_name,
208 )
209 _dict[datasetType] = combined[existing]
210 elif convertible_to_existing or convertible_from_existing:
211 # We'd need to refactor a fair amount to recognize
212 # whether this is an error or not, so I'm not going to
213 # bother until we need to do that for other reasons
214 # (it won't be too long).
215 _LOG.info(
216 "Dataset type %s is present with multiple only partially-compatible storage "
217 "classes %s and %s.",
218 datasetType.name,
219 datasetType.storageClass_name,
220 existing.storageClass_name,
221 )
222 _dict[datasetType] = combined[existing]
223 else:
224 incompatible[datasetType] = existing
225 else:
226 missing.add(datasetType)
228 if missing or incompatible:
229 reasons = []
230 if missing:
231 reasons.append(
232 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known "
233 f"types: [{', '.join(d.name for d in combined)}]."
234 )
235 if incompatible:
236 for x, y in incompatible.items():
237 reasons.append(f"{x} incompatible with {y}")
238 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
240 return cls(_dict, universe=first.universe)
242 @property
243 def dimensions(self) -> DimensionGraph:
244 """The union of all dimensions used by all dataset types in this
245 dictionary, including implied dependencies (`DimensionGraph`).
246 """
247 base = self.universe.empty
248 if len(self) == 0:
249 return base
250 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
252 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]:
253 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts
254 into a new mapping with `~lsst.daf.butler.DatasetType` keys and
255 `~lsst.daf.butler.DatasetRef` values.
257 This method assumes that each nested dictionary contains exactly one item,
258 as is the case for all "init" datasets.
260 Parameters
261 ----------
262 storage_classes : `dict` [ `str`, `str` ]
263 Mapping from dataset type name to the storage class to use for that
264 dataset type. These are typically the storage classes declared
265 for a particular task, which may differ from the data repository
266 definitions.
268 Returns
269 -------
270 dictionary : `~lsst.daf.butler.NamedKeyDict`
271 Dictionary mapping `~lsst.daf.butler.DatasetType` to
272 `~lsst.daf.butler.DatasetRef`, with both
273 `~lsst.daf.butler.DatasetType` instances and string names usable
274 as keys.
275 """
276 return NamedKeyDict(
277 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()}
278 )
280 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
281 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into
282 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of
283 `~lsst.daf.butler.DatasetRef` values.
285 Parameters
286 ----------
287 storage_classes : `dict` [ `str`, `str` ]
288 Mapping from dataset type name to the storage class to use for that
289 dataset type. These are typically the storage classes declared
290 for a particular task, which may differ from the data repository
291 definitions.
293 Returns
294 -------
295 dictionary : `~lsst.daf.butler.NamedKeyDict`
296 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of
297 `~lsst.daf.butler.DatasetRef`, with both
298 `~lsst.daf.butler.DatasetType` instances and string names usable
299 as keys.
300 """
301 result = {}
302 for dataset_type, holders in self.items():
303 if (
304 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name)
305 ) != dataset_type.storageClass_name:
306 dataset_type = dataset_type.overrideStorageClass(override)
307 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()]
308 else:
309 refs = [holder.resolved_ref for holder in holders.values()]
310 result[dataset_type] = refs
311 return NamedKeyDict(result)
313 def extract(
314 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
315 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]:
316 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances
317 that match the given `~lsst.daf.butler.DatasetType` and data IDs.
319 Parameters
320 ----------
321 datasetType : `~lsst.daf.butler.DatasetType`
322 Dataset type to match.
323 dataIds : `~collections.abc.Iterable` \
324 [ `~lsst.daf.butler.DataCoordinate` ]
325 Data IDs to match.
327 Returns
328 -------
329 refs : `~collections.abc.Iterator` of `tuple`
330 Pairs of `~lsst.daf.butler.DataCoordinate` and the matching
331 `~lsst.daf.butler.DatasetRef`, or `None` if it is not yet resolved.
332 """
333 refs = self[datasetType]
334 return ((dataId, refs[dataId].ref) for dataId in dataIds)
336 def isdisjoint(self, other: _DatasetDict) -> bool:
337 """Test whether ``self`` and ``other`` have any datasets in common.
339 Datasets are considered in common if they have the same *parent*
340 dataset type name and data ID; storage classes and components are not
341 considered.
342 """
343 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()}
344 for k, v in other.items():
345 parent_name, _ = k.nameAndComponent()
346 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()):
347 return False
348 return True
350 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
351 """Iterate over all DatasetRef instances held by this data structure,
352 assuming that each `_RefHolder` already carries a resolved ref.
353 """
354 for holders_by_data_id in self.values():
355 for holder in holders_by_data_id.values():
356 yield holder.resolved_ref
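# Illustrative sketch (not part of this module): filling a _DatasetDict with a
# resolved holder and unpacking it with a per-task storage class mapping.
# ``universe``, ``dataset_type``, ``data_id``, and ``run`` are assumptions
# supplied by a caller; here the mapping requests the repository-declared
# storage class, so no conversion happens.
def _example_fill_and_unpack(
    universe: DimensionUniverse,
    dataset_type: DatasetType,
    data_id: DataCoordinate,
    run: str,
) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
    holders = _DatasetDict.fromDatasetTypes([dataset_type], universe=universe)
    holders[dataset_type][data_id] = _RefHolder(
        dataset_type, DatasetRef(dataset_type, data_id, run=run, conform=False)
    )
    return holders.unpackMultiRefs({dataset_type.name: dataset_type.storageClass_name})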
359class _QuantumScaffolding:
360 """Helper class aggregating information about a `Quantum`, used when
361 constructing a `QuantumGraph`.
363 See `_PipelineScaffolding` for a top-down description of the full
364 scaffolding data structure.
366 Parameters
367 ----------
368 task : _TaskScaffolding
369 Back-reference to the helper object for the `PipelineTask` this quantum
370 represents an execution of.
371 dataId : `~lsst.daf.butler.DataCoordinate`
372 Data ID for this quantum.
373 """
375 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
376 self.task = task
377 self.dataId = dataId
378 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
379 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
380 self.prerequisites = _DatasetDict.fromDatasetTypes(
381 task.prerequisites.keys(), universe=dataId.universe
382 )
384 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
386 def __repr__(self) -> str:
387 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
389 task: _TaskScaffolding
390 """Back-reference to the helper object for the `PipelineTask` this quantum
391 represents an execution of.
392 """
394 dataId: DataCoordinate
395 """Data ID for this quantum.
396 """
398 inputs: _DatasetDict
399 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to
400 this quantum.
402 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty
403 dictionary at construction. Those nested dictionaries are populated
404 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef`
405 instances in `_PipelineScaffolding.connectDataIds`.
406 """
408 outputs: _DatasetDict
409 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this
410 quantum.
411 """
413 prerequisites: _DatasetDict
414 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite
415 inputs to this quantum.
416 """
418 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum:
419 """Transform the scaffolding object into a true `Quantum` instance.
421 Parameters
422 ----------
423 datastore_records : `~collections.abc.Mapping` [ `str`, \
424 `~lsst.daf.butler.DatastoreRecordData` ], optional
425 If not `None` then fill datastore records in each generated Quantum
426 using the records from this structure.
428 Returns
429 -------
430 quantum : `Quantum`
431 An actual `Quantum` instance.
432 """
433 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes)
434 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes))
435 # Give the task's Connections class an opportunity to remove some
436 # inputs, or complain if they are unacceptable.
437 # This will raise if one of the check conditions is not met, which is
438 # the intended behavior.
439 # If it raises NoWorkFound, there is a bug in the QG algorithm
440 # or adjustQuantum is incorrectly trying to make a prerequisite
441 # input behave like a regular input; adjustQuantum should only raise
442 # NoWorkFound if a regular input is missing, and it shouldn't be
443 # possible for us to have generated ``self`` if that's true.
444 helper = AdjustQuantumHelper(
445 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes)
446 )
447 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
448 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes)
449 quantum_records: Mapping[str, DatastoreRecordData] | None = None
450 if datastore_records is not None:
451 quantum_records = {}
452 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
453 input_refs += list(initInputs.values())
454 input_ids = set(ref.id for ref in input_refs)
455 for datastore_name, records in datastore_records.items():
456 matching_records = records.subset(input_ids)
457 if matching_records is not None:
458 quantum_records[datastore_name] = matching_records
459 # ignore the types because quantum really can take a sequence of inputs
460 return Quantum(
461 taskName=self.task.taskDef.taskName,
462 taskClass=self.task.taskDef.taskClass,
463 dataId=self.dataId,
464 initInputs=initInputs,
465 inputs=helper.inputs,
466 outputs=helper.outputs,
467 datastore_records=quantum_records,
468 )
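# Illustrative sketch (not part of this module): turning a single scaffolding
# node into a Quantum without exporting datastore records, which is the path
# taken by _TaskScaffolding.makeQuantumSet below when no datastore is given.
def _example_make_single_quantum(quantum_scaffolding: _QuantumScaffolding) -> Quantum:
    return quantum_scaffolding.makeQuantum(datastore_records=None)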
471@dataclass
472class _TaskScaffolding:
473 """Helper class aggregating information about a `PipelineTask`, used when
474 constructing a `QuantumGraph`.
476 See `_PipelineScaffolding` for a top-down description of the full
477 scaffolding data structure.
479 Parameters
480 ----------
481 taskDef : `TaskDef`
482 Data structure that identifies the task class and its config.
483 parent : `_PipelineScaffolding`
484 The parent data structure that will hold the instance being
485 constructed.
486 datasetTypes : `TaskDatasetTypes`
487 Data structure that categorizes the dataset types used by this task.
488 """
490 def __init__(
491 self,
492 taskDef: TaskDef,
493 parent: _PipelineScaffolding,
494 datasetTypes: TaskDatasetTypes,
495 ):
496 universe = parent.dimensions.universe
497 self.taskDef = taskDef
498 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
499 assert self.dimensions.issubset(parent.dimensions)
500 # Initialize _DatasetDicts as subsets of the one or two
501 # corresponding dicts in the parent _PipelineScaffolding.
502 self.initInputs = _DatasetDict.fromSubset(
503 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
504 )
505 self.initOutputs = _DatasetDict.fromSubset(
506 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
507 )
508 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
509 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
510 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
511 self.dataIds: set[DataCoordinate] = set()
512 self.quanta = {}
513 self.storage_classes = {
514 connection.name: connection.storageClass
515 for connection in self.taskDef.connections.allConnections.values()
516 }
517 self.storage_classes[
518 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
519 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS
520 self.storage_classes[
521 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
522 ] = acc.LOG_OUTPUT_STORAGE_CLASS
523 self.storage_classes[
524 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label)
525 ] = acc.METADATA_OUTPUT_STORAGE_CLASS
527 def __repr__(self) -> str:
528 # Default dataclass-injected __repr__ gets caught in an infinite loop
529 # because of back-references.
530 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
532 taskDef: TaskDef
533 """Data structure that identifies the task class and its config
534 (`TaskDef`).
535 """
537 dimensions: DimensionGraph
538 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
539 """
541 initInputs: _DatasetDict
542 """Dictionary containing information about datasets used to construct this
543 task (`_DatasetDict`).
544 """
546 initOutputs: _DatasetDict
547 """Dictionary containing information about datasets produced as a
548 side-effect of constructing this task (`_DatasetDict`).
549 """
551 inputs: _DatasetDict
552 """Dictionary containing information about datasets used as regular,
553 graph-constraining inputs to this task (`_DatasetDict`).
554 """
556 outputs: _DatasetDict
557 """Dictionary containing information about datasets produced by this task
558 (`_DatasetDict`).
559 """
561 prerequisites: _DatasetDict
562 """Dictionary containing information about input datasets that must be
563 present in the repository before any Pipeline containing this task is run
564 (`_DatasetDict`).
565 """
567 quanta: dict[DataCoordinate, _QuantumScaffolding]
568 """Dictionary mapping data ID to a scaffolding object for the Quantum of
569 this task with that data ID.
570 """
572 storage_classes: dict[str, str]
573 """Mapping from dataset type name to storage class declared by this task.
574 """
576 def makeQuantumSet(
577 self,
578 missing: _DatasetDict,
579 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
580 ) -> set[Quantum]:
581 """Create a `set` of `Quantum` from the information in ``self``.
583 Parameters
584 ----------
585 missing : `_DatasetDict`
586 Input datasets that have not been found.
587 datastore_records : `~collections.abc.Mapping`, optional
588 Records from the datastore to export with the quanta.
590 Returns
591 -------
592 nodes : `set` of `Quantum`
593 The `Quantum` elements corresponding to this task.
594 """
595 outputs = set()
596 for q in self.quanta.values():
597 try:
598 tmpQuanta = q.makeQuantum(datastore_records)
599 outputs.add(tmpQuanta)
600 except (NoWorkFound, FileNotFoundError) as exc:
601 if not missing.isdisjoint(q.inputs):
602 # This is a node that is known to be pruned later and
603 # should be left in even though some follow up queries
604 # fail. This allows the pruning to start from this quantum
605 # with known issues, and prune other nodes it touches.
606 inputs = q.inputs.unpackMultiRefs(self.storage_classes)
607 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes))
608 tmpQuantum = Quantum(
609 taskName=q.task.taskDef.taskName,
610 taskClass=q.task.taskDef.taskClass,
611 dataId=q.dataId,
612 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes),
613 inputs=inputs,
614 outputs=q.outputs.unpackMultiRefs(self.storage_classes),
615 )
616 outputs.add(tmpQuantum)
617 else:
618 raise exc
619 return outputs
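# Illustrative sketch (not part of this module): collecting the quanta for
# every task in a pipeline scaffolding, as makeQuantumGraph does further
# below.  ``scaffolding`` is assumed to have had its dataset refs resolved.
def _example_collect_quanta(scaffolding: _PipelineScaffolding) -> dict[TaskDef, set[Quantum]]:
    return {
        task.taskDef: task.makeQuantumSet(missing=scaffolding.missing)
        for task in scaffolding.tasks
    }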
622class _DatasetIdMaker:
623 """Helper class which generates random dataset UUIDs for unresolved
624 datasets.
625 """
627 def __init__(self, run: str):
628 self.run = run
629 # Cache of dataset refs generated so far.
630 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {}
632 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef:
633 # For components we need their parent dataset ID.
634 if dataset_type.isComponent():
635 parent_type = dataset_type.makeCompositeDatasetType()
636 # Parent should be resolved if this is an existing input, or it
637 # should be in the cache already if it is an intermediate.
638 key = parent_type, data_id
639 if key not in self.resolved:
640 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}")
641 parent_ref = self.resolved[key]
642 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False)
644 key = dataset_type, data_id
645 if (resolved := self.resolved.get(key)) is None:
646 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False)
647 self.resolved[key] = resolved
648 return resolved
650 def resolveDict(
651 self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder], is_output: bool
652 ) -> None:
653 """Resolve all unresolved references in the provided dictionary."""
654 for data_id, holder in refs.items():
655 if holder.ref is None or (is_output and holder.ref.run != self.run):
656 holder.ref = self.resolveRef(holder.dataset_type, data_id)
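# Illustrative sketch (not part of this module): _DatasetIdMaker hands back
# the same resolved ref for repeated (dataset type, data ID) lookups, so an
# intermediate produced by one task and consumed by another gets exactly one
# dataset ID.  The arguments are assumptions supplied by a caller.
def _example_id_maker_caching(
    run: str, dataset_type: DatasetType, data_id: DataCoordinate
) -> bool:
    maker = _DatasetIdMaker(run)
    first = maker.resolveRef(dataset_type, data_id)
    second = maker.resolveRef(dataset_type, data_id)
    return first is second  # True: the second lookup hits the cache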
659@dataclass
660class _PipelineScaffolding:
661 """A helper data structure that organizes the information involved in
662 constructing a `QuantumGraph` for a `Pipeline`.
664 Parameters
665 ----------
666 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
667 Sequence of tasks from which a graph is to be constructed. Must
668 have nested task classes already imported.
669 universe : `~lsst.daf.butler.DimensionUniverse`
670 Universe of all possible dimensions.
672 Notes
673 -----
674 The scaffolding data structure contains nested data structures for both
675 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
676 data structures are shared between the pipeline-level structure (which
677 aggregates all datasets and categorizes them from the perspective of the
678 complete pipeline) and the individual tasks that use them as inputs and
679 outputs.
681 `QuantumGraph` construction proceeds in four steps, with each corresponding
682 to a different `_PipelineScaffolding` method:
684 1. When `_PipelineScaffolding` is constructed, we extract and categorize
685 the DatasetTypes used by the pipeline (delegating to
686 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
687 nested `_TaskScaffolding` and `_DatasetDict` objects.
689 2. In `connectDataIds`, we construct and run the "Big Join Query", which
690 returns related tuples of all dimensions used to identify any regular
691 input, output, and intermediate datasets (not prerequisites). We then
692 iterate over these tuples of related dimensions, identifying the subsets
693 that correspond to distinct data IDs for each task and dataset type,
694 and then create `_QuantumScaffolding` objects.
696 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
697 dataset data IDs previously identified, transforming unresolved
698 DatasetRefs into resolved DatasetRefs where appropriate. We then look
699 up prerequisite datasets for all quanta.
701 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
702 per-task `_QuantumScaffolding` objects.
703 """
705 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry):
706 _LOG.debug("Initializing data structures for QuantumGraph generation.")
707 self.tasks = []
708 # Aggregate and categorize the DatasetTypes in the Pipeline.
709 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
710 # Construct dictionaries that map those DatasetTypes to structures
711 # that will (later) hold additional information about them.
712 for attr in (
713 "initInputs",
714 "initIntermediates",
715 "initOutputs",
716 "inputs",
717 "intermediates",
718 "outputs",
719 "prerequisites",
720 ):
721 setattr(
722 self,
723 attr,
724 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
725 )
726 self.missing = _DatasetDict(universe=registry.dimensions)
727 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
728 # Aggregate all dimensions for all non-init, non-prerequisite
729 # DatasetTypes. These are the ones we'll include in the big join
730 # query.
731 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
732 # Construct scaffolding nodes for each Task, and add backreferences
733 # to the Task from each DatasetScaffolding node.
734 # Note that there's only one scaffolding node for each DatasetType,
735 # shared by _PipelineScaffolding and all _TaskScaffoldings that
736 # reference it.
737 if isinstance(pipeline, Pipeline):
738 pipeline = pipeline.toExpandedPipeline()
739 self.tasks = [
740 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
741 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
742 ]
744 def __repr__(self) -> str:
745 # Default dataclass-injected __repr__ gets caught in an infinite loop
746 # because of back-references.
747 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
749 tasks: list[_TaskScaffolding]
750 """Scaffolding data structures for each task in the pipeline
751 (`list` of `_TaskScaffolding`).
752 """
754 initInputs: _DatasetDict
755 """Datasets consumed but not produced when constructing the tasks in this
756 pipeline (`_DatasetDict`).
757 """
759 initIntermediates: _DatasetDict
760 """Datasets that are both consumed and produced when constructing the tasks
761 in this pipeline (`_DatasetDict`).
762 """
764 initOutputs: _DatasetDict
765 """Datasets produced but not consumed when constructing the tasks in this
766 pipeline (`_DatasetDict`).
767 """
769 inputs: _DatasetDict
770 """Datasets that are consumed but not produced when running this pipeline
771 (`_DatasetDict`).
772 """
774 intermediates: _DatasetDict
775 """Datasets that are both produced and consumed when running this pipeline
776 (`_DatasetDict`).
777 """
779 outputs: _DatasetDict
780 """Datasets produced but not consumed when when running this pipeline
781 (`_DatasetDict`).
782 """
784 prerequisites: _DatasetDict
785 """Datasets that are consumed when running this pipeline and looked up
786 per-Quantum when generating the graph (`_DatasetDict`).
787 """
789 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
790 """Datasets that should be used as constraints in the initial query,
791 according to tasks (`~lsst.daf.butler.NamedValueSet`).
792 """
794 dimensions: DimensionGraph
795 """All dimensions used by any regular input, intermediate, or output
796 (not prerequisite) dataset; the set of dimensions used in the "Big Join
797 Query" (`~lsst.daf.butler.DimensionGraph`).
799 This is required to be a superset of all task quantum dimensions.
800 """
802 missing: _DatasetDict
803 """Datasets whose existence was originally predicted but were not
804 actually found.
806 Quanta that require these datasets as inputs will be pruned (recursively)
807 when actually constructing a `QuantumGraph` object.
809 These are currently populated only when the "initial dataset query
810 constraint" does not include all overall-input dataset types, and hence the
811 initial data ID query can include data IDs that it should not.
812 """
814 globalInitOutputs: _DatasetDict | None = None
815 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
816 """
818 @contextmanager
819 def connectDataIds(
820 self,
821 registry: Registry,
822 collections: Any,
823 userQuery: str | None,
824 externalDataId: DataCoordinate,
825 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
826 bind: Mapping[str, Any] | None = None,
827 ) -> Iterator[DataCoordinateQueryResults]:
828 """Query for the data IDs that connect nodes in the `QuantumGraph`.
830 This method populates `_TaskScaffolding.dataIds` and
831 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
833 Parameters
834 ----------
835 registry : `lsst.daf.butler.Registry`
836 Registry for the data repository; used for all data ID queries.
837 collections
838 Expressions representing the collections to search for input
839 datasets. See :ref:`daf_butler_ordered_collection_searches`.
840 userQuery : `str` or `None`
841 User-provided expression to limit the data IDs processed.
842 externalDataId : `~lsst.daf.butler.DataCoordinate`
843 Externally-provided data ID that should be used to restrict the
844 results, just as if these constraints had been included via ``AND``
845 in ``userQuery``. This includes (at least) any instrument named
846 in the pipeline definition.
847 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
848 The query constraint variant that should be used to constrain the
849 query based on dataset existence; defaults to
850 `DatasetQueryConstraintVariant.ALL`.
851 bind : `~collections.abc.Mapping`, optional
852 Mapping containing literal values that should be injected into the
853 ``userQuery`` expression, keyed by the identifiers they replace.
855 Returns
856 -------
857 commonDataIds : \
858 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
859 An interface to a database temporary table containing all data IDs
860 that will appear in this `QuantumGraph`. Returned inside a
861 context manager, which will drop the temporary table at the end of
862 the `with` block in which this method is called.
863 """
864 _LOG.debug("Building query for data IDs.")
865 # Initialization datasets always have empty data IDs.
866 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
867 for datasetType, refs in itertools.chain(
868 self.initInputs.items(),
869 self.initIntermediates.items(),
870 self.initOutputs.items(),
871 ):
872 refs[emptyDataId] = _RefHolder(datasetType)
873 # Run one big query for the data IDs for task dimensions and regular
874 # inputs and outputs. We limit the query to only dimensions that are
875 # associated with the input dataset types, but don't (yet) try to
876 # obtain the dataset_ids for those inputs.
877 _LOG.debug(
878 "Submitting data ID query over dimensions %s and materializing results.",
879 list(self.dimensions.names),
880 )
881 queryArgs: dict[str, Any] = {
882 "dimensions": self.dimensions,
883 "where": userQuery,
884 "dataId": externalDataId,
885 "bind": bind,
886 }
887 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
888 _LOG.debug(
889 "Constraining graph query using default of %s.",
890 list(self.defaultDatasetQueryConstraints.names),
891 )
892 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
893 queryArgs["collections"] = collections
894 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
895 _LOG.debug("Not using dataset existence to constrain query.")
896 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
897 constraint = set(datasetQueryConstraint)
898 inputs = {k.name: k for k in self.inputs.keys()}
899 if remainder := constraint.difference(inputs.keys()):
900 raise ValueError(
901 f"{remainder} dataset type(s) specified as a graph constraint, but"
902 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
903 )
904 _LOG.debug("Constraining graph query using %s", constraint)
905 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
906 queryArgs["collections"] = collections
907 else:
908 raise ValueError(
909 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
910 )
912 if "datasets" in queryArgs:
913 for i, dataset_type in enumerate(queryArgs["datasets"]):
914 if dataset_type.isComponent():
915 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
917 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
918 _LOG.debug("Expanding data IDs.")
919 commonDataIds = commonDataIds.expanded()
920 _LOG.debug("Iterating over query results to associate quanta with datasets.")
921 # Iterate over query results, populating data IDs for datasets and
922 # quanta and then connecting them to each other.
923 n = -1
924 for n, commonDataId in enumerate(commonDataIds):
925 # Create DatasetRefs for all DatasetTypes from this result row,
926 # noting that we might have created some already.
927 # We remember both those that already existed and those that we
928 # create now.
929 refsForRow = {}
930 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {}
931 for datasetType, refs in itertools.chain(
932 self.inputs.items(),
933 self.intermediates.items(),
934 self.outputs.items(),
935 ):
936 datasetDataId: DataCoordinate | None
937 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
938 datasetDataId = commonDataId.subset(datasetType.dimensions)
939 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
940 ref_holder = refs.get(datasetDataId)
941 if ref_holder is None:
942 ref_holder = _RefHolder(datasetType)
943 refs[datasetDataId] = ref_holder
944 refsForRow[datasetType.name] = ref_holder
945 # Create _QuantumScaffolding objects for all tasks from this
946 # result row, noting that we might have created some already.
947 for task in self.tasks:
948 quantumDataId = commonDataId.subset(task.dimensions)
949 quantum = task.quanta.get(quantumDataId)
950 if quantum is None:
951 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
952 task.quanta[quantumDataId] = quantum
953 # Whether this is a new quantum or an existing one, we can
954 # now associate the DatasetRefs for this row with it. The
955 # fact that a Quantum data ID and a dataset data ID both
956 # came from the same result row is what tells us they
957 # should be associated.
958 # Many of these associations will be duplicates (because
959 # another query row that differed from this one only in
960 # irrelevant dimensions already added them); the dictionary
961 # assignments simply overwrite the earlier ones.
962 for datasetType in task.inputs:
963 dataId = dataIdCacheForRow[datasetType.dimensions]
964 ref_holder = refsForRow[datasetType.name]
965 quantum.inputs[datasetType.name][dataId] = ref_holder
966 for datasetType in task.outputs:
967 dataId = dataIdCacheForRow[datasetType.dimensions]
968 ref_holder = refsForRow[datasetType.name]
969 quantum.outputs[datasetType.name][dataId] = ref_holder
970 if n < 0:
971 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
972 emptiness_explained = False
973 for message in commonDataIds.explain_no_results():
974 _LOG.critical(message)
975 emptiness_explained = True
976 if not emptiness_explained:
977 _LOG.critical(
978 "To reproduce this query for debugging purposes, run "
979 "Registry.queryDataIds with these arguments:"
980 )
981 # We could just repr() the queryArgs dict to get something
982 # the user could make sense of, but it's friendlier to
983 # put these args in an easier-to-construct equivalent form
984 # so they can read it more easily and copy and paste into
985 # a Python terminal.
986 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
987 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
988 if queryArgs["where"]:
989 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
990 if "datasets" in queryArgs:
991 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
992 if "collections" in queryArgs:
993 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
994 _LOG.debug("Finished processing %d rows from data ID query.", n)
995 yield commonDataIds
997 def resolveDatasetRefs(
998 self,
999 registry: Registry,
1000 collections: Any,
1001 run: str,
1002 commonDataIds: DataCoordinateQueryResults,
1003 *,
1004 skipExistingIn: Any = None,
1005 clobberOutputs: bool = True,
1006 constrainedByAllDatasets: bool = True,
1007 ) -> None:
1008 """Perform follow up queries for each dataset data ID produced in
1009 `fillDataIds`.
1011 This method populates `_DatasetScaffolding.refs` (except for those in
1012 `prerequisites`).
1014 Parameters
1015 ----------
1016 registry : `lsst.daf.butler.Registry`
1017 Registry for the data repository; used for all data ID queries.
1018 collections
1019 Expressions representing the collections to search for input
1020 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1021 run : `str`
1022 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1023 output datasets, if it already exists.
1024 commonDataIds : \
1025 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
1026 Result of a previous call to `connectDataIds`.
1027 skipExistingIn
1028 Expressions representing the collections to search for existing
1029 output datasets that should be skipped. See
1030 :ref:`daf_butler_ordered_collection_searches` for allowed types.
1031 `None` or empty string/sequence disables skipping.
1032 clobberOutputs : `bool`, optional
1033 If `True` (default), allow quanta to be created even if outputs exist;
1034 this requires the same behavior to be enabled when
1035 executing. If ``skipExistingIn`` is not `None`, completed quanta
1036 (those with metadata, or all outputs if there is no metadata
1037 dataset configured) will be skipped rather than clobbered.
1038 constrainedByAllDatasets : `bool`, optional
1039 Whether ``commonDataIds`` was generated with a constraint on
1040 all dataset types.
1042 Raises
1043 ------
1044 OutputExistsError
1045 Raised if an output dataset already exists in the output run
1046 and ``skipExistingIn`` does not include output run, or if only
1047 some outputs are present and ``clobberOutputs`` is `False`.
1048 """
1049 # Run may be provided but it does not have to exist; in that case we
1050 # use it for resolving references but don't check it for existing refs.
1051 run_exists = False
1052 if run:
1053 try:
1054 run_exists = bool(registry.queryCollections(run))
1055 except MissingCollectionError:
1056 # Undocumented exception is raised if it does not exist.
1057 pass
1059 skip_collections_wildcard: CollectionWildcard | None = None
1060 skipExistingInRun = False
1061 if skipExistingIn:
1062 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
1063 if run_exists:
1064 # As an optimization, check the explicit list of names first.
1065 skipExistingInRun = run in skip_collections_wildcard.strings
1066 if not skipExistingInRun:
1067 # need to flatten it and check again
1068 skipExistingInRun = run in registry.queryCollections(
1069 skipExistingIn,
1070 collectionTypes=CollectionType.RUN,
1071 )
1073 idMaker = _DatasetIdMaker(run)
1075 resolvedRefQueryResults: Iterable[DatasetRef]
1077 # Updating constrainedByAllDatasets here is not ideal, but we have a
1078 # few different code paths that each transfer different pieces of
1079 # information about what dataset query constraints were applied here,
1080 # and none of them has the complete picture until we get here. We're
1081 # long overdue for a QG generation rewrite that will make this go away
1082 # entirely anyway.
1083 constrainedByAllDatasets = (
1084 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
1085 )
1087 # Look up [init] intermediate and output datasets in the output
1088 # collection, if there is an output collection.
1089 if run_exists or skip_collections_wildcard is not None:
1090 for datasetType, refs in itertools.chain(
1091 self.initIntermediates.items(),
1092 self.initOutputs.items(),
1093 self.intermediates.items(),
1094 self.outputs.items(),
1095 ):
1096 _LOG.debug(
1097 "Resolving %d datasets for intermediate and/or output dataset %s.",
1098 len(refs),
1099 datasetType.name,
1100 )
1101 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
1102 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
1103 # TODO: this assert incorrectly bans component inputs;
1104 # investigate on DM-33027.
1105 # assert not datasetType.isComponent(), \
1106 # "Output datasets cannot be components."
1107 #
1108 # Instead we have to handle them manually to avoid a
1109 # deprecation warning, but it is at least confusing and
1110 # possibly a bug for components to appear here at all.
1111 if datasetType.isComponent():
1112 parent_dataset_type = datasetType.makeCompositeDatasetType()
1113 component = datasetType.component()
1114 else:
1115 parent_dataset_type = datasetType
1116 component = None
1118 # look at RUN collection first
1119 if run_exists:
1120 try:
1121 resolvedRefQueryResults = subset.findDatasets(
1122 parent_dataset_type, collections=run, findFirst=True
1123 )
1124 except MissingDatasetTypeError:
1125 resolvedRefQueryResults = []
1126 for resolvedRef in resolvedRefQueryResults:
1127 # TODO: we could easily support per-DatasetType
1128 # skipExisting and I could imagine that being useful -
1129 # it's probably required in order to support writing
1130 # initOutputs before QuantumGraph generation.
1131 assert resolvedRef.dataId in refs
1132 if not (skipExistingInRun or isInit or clobberOutputs):
1133 raise OutputExistsError(
1134 f"Output dataset {datasetType.name} already exists in "
1135 f"output RUN collection '{run}' with data ID"
1136 f" {resolvedRef.dataId}."
1137 )
1138 # To resolve all outputs we have to remember existing
1139 # ones to avoid generating new dataset IDs for them.
1140 refs[resolvedRef.dataId].ref = (
1141 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1142 )
1144 # Also check skipExistingIn; the case where the RUN collection
1145 # is part of it is handled above.
1146 if skip_collections_wildcard is not None:
1147 try:
1148 resolvedRefQueryResults = subset.findDatasets(
1149 parent_dataset_type,
1150 collections=skip_collections_wildcard,
1151 findFirst=True,
1152 )
1153 except MissingDatasetTypeError:
1154 resolvedRefQueryResults = []
1155 for resolvedRef in resolvedRefQueryResults:
1156 if resolvedRef.dataId not in refs:
1157 continue
1158 refs[resolvedRef.dataId].ref = (
1159 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1160 )
1162 # Look up input and initInput datasets in the input collection(s). We
1163 # accumulate datasets in self.missing, if the common data IDs were not
1164 # constrained on dataset type existence.
1165 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1166 _LOG.debug(
1167 "Resolving %d datasets for input dataset %s.",
1168 len(refs),
1169 datasetType.name,
1170 )
1171 if datasetType.isComponent():
1172 parent_dataset_type = datasetType.makeCompositeDatasetType()
1173 component = datasetType.component()
1174 else:
1175 parent_dataset_type = datasetType
1176 component = None
1177 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {}
1178 try:
1179 resolvedRefQueryResults = commonDataIds.subset(
1180 datasetType.dimensions, unique=True
1181 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1182 except MissingDatasetTypeError:
1183 resolvedRefQueryResults = []
1184 dataIdsNotFoundYet = set(refs.keys())
1185 for resolvedRef in resolvedRefQueryResults:
1186 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1187 if resolvedRef.dataId not in refs:
1188 continue
1189 refs[resolvedRef.dataId].ref = (
1190 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1191 )
1192 if dataIdsNotFoundYet:
1193 if constrainedByAllDatasets:
1194 raise RuntimeError(
1195 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1196 f"'{datasetType.name}' was/were present in a previous "
1197 "query, but could not be found now. "
1198 "This is either a logic bug in QuantumGraph generation "
1199 "or the input collections have been modified since "
1200 "QuantumGraph generation began."
1201 )
1202 elif not datasetType.dimensions:
1203 raise RuntimeError(
1204 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1205 f"collections {collections}."
1206 )
1207 else:
1208 # If the common dataIds were not constrained using all the
1209 # input dataset types, it is possible that some data ids
1210 # found don't correspond to existing datasets. Mark these
1211 # for later pruning from the quantum graph.
1212 for k in dataIdsNotFoundYet:
1213 missing_for_dataset_type[k] = refs[k]
1214 if missing_for_dataset_type:
1215 self.missing[datasetType] = missing_for_dataset_type
1217 # Resolve the missing refs, just so they look like all of the others;
1218 # in the end other code will make sure they never appear in the QG.
1219 for dataset_type, refDict in self.missing.items():
1220 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1222 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1223 # replacing the unresolved refs there, and then look up prerequisites.
1224 for task in self.tasks:
1225 _LOG.debug(
1226 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1227 len(task.quanta),
1228 task.taskDef.label,
1229 )
1230 # The way iterConnections is designed makes it impossible to
1231 # annotate precisely enough to satisfy MyPy here.
1232 lookupFunctions = {
1233 c.name: c.lookupFunction # type: ignore
1234 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1235 if c.lookupFunction is not None # type: ignore
1236 }
1237 dataIdsFailed = []
1238 dataIdsSucceeded = []
1239 for quantum in task.quanta.values():
1240 # Process output datasets only if skipExistingIn is not None
1241 # or there is a run to look for outputs in and clobberOutputs
1242 # is True. Note that if skipExistingIn is None, any output
1243 # datasets that already exist would have already caused an
1244 # exception to be raised.
1245 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1246 resolvedRefs = []
1247 unresolvedDataIds = []
1248 haveMetadata = False
1249 for datasetType, originalRefs in quantum.outputs.items():
1250 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()):
1251 if ref is not None:
1252 resolvedRefs.append(ref)
1253 originalRefs[dataId].ref = ref
1254 if datasetType.name == task.taskDef.metadataDatasetName:
1255 haveMetadata = True
1256 else:
1257 unresolvedDataIds.append((datasetType, dataId))
1258 if resolvedRefs:
1259 if haveMetadata or not unresolvedDataIds:
1260 dataIdsSucceeded.append(quantum.dataId)
1261 if skip_collections_wildcard is not None:
1262 continue
1263 else:
1264 dataIdsFailed.append(quantum.dataId)
1265 if not clobberOutputs and run_exists:
1266 raise OutputExistsError(
1267 f"Quantum {quantum.dataId} of task with label "
1268 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1269 f"({resolvedRefs}) "
1270 f"and others that don't ({unresolvedDataIds}), with no metadata output, "
1271 "and clobbering outputs was not enabled."
1272 )
1273 # Update the input DatasetRefs to the resolved ones we already
1274 # searched for.
1275 for datasetType, input_refs in quantum.inputs.items():
1276 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()):
1277 input_refs[data_id].ref = ref
1278 # Look up prerequisite datasets in the input collection(s).
1279 # These may have dimensions that extend beyond those we queried
1280 # for originally, because we want to permit those data ID
1281 # values to differ across quanta and dataset types.
1282 for datasetType in task.prerequisites:
1283 if datasetType.isComponent():
1284 parent_dataset_type = datasetType.makeCompositeDatasetType()
1285 component = datasetType.component()
1286 else:
1287 parent_dataset_type = datasetType
1288 component = None
1289 lookupFunction = lookupFunctions.get(datasetType.name)
1290 if lookupFunction is not None:
1291 # PipelineTask has provided its own function to do the
1292 # lookup. This always takes precedence.
1293 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1294 elif (
1295 datasetType.isCalibration()
1296 and datasetType.dimensions <= quantum.dataId.graph
1297 and quantum.dataId.graph.temporal
1298 ):
1299 # This is a master calibration lookup, which we have to
1300 # handle specially because the query system can't do a
1301 # temporal join on a non-dimension-based timespan yet.
1302 timespan = quantum.dataId.timespan
1303 try:
1304 prereq_ref = registry.findDataset(
1305 parent_dataset_type,
1306 quantum.dataId,
1307 collections=collections,
1308 timespan=timespan,
1309 )
1310 if prereq_ref is not None:
1311 if component is not None:
1312 prereq_ref = prereq_ref.makeComponentRef(component)
1313 prereq_refs = [prereq_ref]
1314 else:
1315 prereq_refs = []
1316 except (KeyError, MissingDatasetTypeError):
1317 # This dataset type is not present in the registry,
1318 # which just means there are no datasets here.
1319 prereq_refs = []
1320 else:
1321 # Most general case.
1322 prereq_refs = [
1323 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1324 for prereq_ref in registry.queryDatasets(
1325 parent_dataset_type,
1326 collections=collections,
1327 dataId=quantum.dataId,
1328 findFirst=True,
1329 ).expanded()
1330 ]
1332 for ref in prereq_refs:
1333 if ref is not None:
1334 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1335 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1337 # Resolve all quantum inputs and outputs.
1338 for dataset_type, refDict in quantum.inputs.items():
1339 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1340 for dataset_type, refDict in quantum.outputs.items():
1341 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1343 # Resolve task initInputs and initOutputs.
1344 for dataset_type, refDict in task.initInputs.items():
1345 idMaker.resolveDict(dataset_type, refDict, is_output=False)
1346 for dataset_type, refDict in task.initOutputs.items():
1347 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1349 # Actually remove any quanta that we decided to skip above.
1350 if dataIdsSucceeded:
1351 if skip_collections_wildcard is not None:
1352 _LOG.debug(
1353 "Pruning successful %d quanta for task with label '%s' because all of their "
1354 "outputs exist or metadata was written successfully.",
1355 len(dataIdsSucceeded),
1356 task.taskDef.label,
1357 )
1358 for dataId in dataIdsSucceeded:
1359 del task.quanta[dataId]
1360 elif clobberOutputs and run_exists:
1361 _LOG.info(
1362 "Found %d successful quanta for task with label '%s' "
1363 "that will need to be clobbered during execution.",
1364 len(dataIdsSucceeded),
1365 task.taskDef.label,
1366 )
1367 if dataIdsFailed:
1368 if clobberOutputs and run_exists:
1369 _LOG.info(
1370 "Found %d failed/incomplete quanta for task with label '%s' "
1371 "that will need to be clobbered during execution.",
1372 len(dataIdsFailed),
1373 task.taskDef.label,
1374 )
1376 # Collect initOutputs that do not belong to any task.
1377 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1378 for task in self.tasks:
1379 global_dataset_types -= set(task.initOutputs)
1380 if global_dataset_types:
1381 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1382 for dataset_type, refDict in self.globalInitOutputs.items():
1383 idMaker.resolveDict(dataset_type, refDict, is_output=True)
1385 def makeQuantumGraph(
1386 self,
1387 registry: Registry,
1388 metadata: Mapping[str, Any] | None = None,
1389 datastore: Datastore | None = None,
1390 ) -> QuantumGraph:
1391 """Create a `QuantumGraph` from the quanta already present in
1392 the scaffolding data structure.
1394 Parameters
1395 ----------
1396 registry : `lsst.daf.butler.Registry`
1397 Registry for the data repository; used for all data ID queries.
1398 metadata : `~collections.abc.Mapping` of `str` to primitives, optional
1399 This is an optional parameter of extra data to carry with the
1400 graph. Entries in this mapping should be able to be serialized in
1401 JSON.
1402 datastore : `~lsst.daf.butler.Datastore`, optional
1403 If not `None` then fill datastore records in each generated
1404 Quantum.
1406 Returns
1407 -------
1408 graph : `QuantumGraph`
1409 The full `QuantumGraph`.
1410 """
1412 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1413 """Extract all DatasetRefs from the dictionaries"""
1414 for ref_dict in dataset_dict.values():
1415 for holder in ref_dict.values():
1416 yield holder.resolved_ref
1418 datastore_records: Mapping[str, DatastoreRecordData] | None = None
1419 if datastore is not None:
1420 datastore_records = datastore.export_records(
1421 itertools.chain(
1422 _make_refs(self.inputs),
1423 _make_refs(self.initInputs),
1424 _make_refs(self.prerequisites),
1425 )
1426 )
1428 graphInput: dict[TaskDef, set[Quantum]] = {}
1429 for task in self.tasks:
1430 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records)
1431 graphInput[task.taskDef] = qset
1433 taskInitInputs = {
1434 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values()
1435 for task in self.tasks
1436 }
1437 taskInitOutputs = {
1438 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values()
1439 for task in self.tasks
1440 }
1442 globalInitOutputs: list[DatasetRef] = []
1443 if self.globalInitOutputs is not None:
1444 for refs_dict in self.globalInitOutputs.values():
1445 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values())
1447 graph = QuantumGraph(
1448 graphInput,
1449 metadata=metadata,
1450 pruneRefs=list(self.missing.iter_resolved_refs()),
1451 universe=self.dimensions.universe,
1452 initInputs=taskInitInputs,
1453 initOutputs=taskInitOutputs,
1454 globalInitOutputs=globalInitOutputs,
1455 registryDatasetTypes=self._get_registry_dataset_types(registry),
1456 )
1457 return graph
1459 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1460 """Make a list of all dataset types used by a graph as defined in
1461 registry.
1462 """
1463 chain = [
1464 self.initInputs,
1465 self.initIntermediates,
1466 self.initOutputs,
1467 self.inputs,
1468 self.intermediates,
1469 self.outputs,
1470 self.prerequisites,
1471 ]
1472 if self.globalInitOutputs is not None:
1473 chain.append(self.globalInitOutputs)
1475 # Collect names of all dataset types.
1476 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1477 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1479 # Check for types that do not exist in registry yet:
1480 # - inputs must exist
1481 # - intermediates and outputs may not exist, but there must not be
1482 # more than one definition (e.g. differing in storage class)
1483 # - prerequisites may not exist; treat them the same as outputs here
1484 for dstype in itertools.chain(self.initInputs, self.inputs):
1485 if dstype.name not in dataset_types:
1486 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1488 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1489 chain = [
1490 self.initIntermediates,
1491 self.initOutputs,
1492 self.intermediates,
1493 self.outputs,
1494 self.prerequisites,
1495 ]
1496 if self.globalInitOutputs is not None:
1497 chain.append(self.globalInitOutputs)
1498 for dstype in itertools.chain(*chain):
1499 if dstype.name not in dataset_types:
1500 new_outputs[dstype.name].add(dstype)
1501 for name, dstypes in new_outputs.items():
1502 if len(dstypes) > 1:
1503 raise ValueError(
1504 "Pipeline contains multiple definitions for a dataset type "
1505 f"which is not defined in registry yet: {dstypes}"
1506 )
1507 elif len(dstypes) == 1:
1508 dataset_types[name] = dstypes.pop()
1510 return dataset_types.values()
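# Illustrative sketch (not part of this module): the four-step flow described
# in the _PipelineScaffolding docstring, driven directly rather than through
# GraphBuilder.makeGraph.  ``registry``, ``collections``, ``run``, and
# ``userQuery`` are assumptions supplied by a caller.
def _example_scaffolding_flow(
    pipeline: Pipeline | Iterable[TaskDef],
    registry: Registry,
    collections: Any,
    run: str,
    userQuery: str | None,
) -> QuantumGraph:
    # Step 1: categorize dataset types and build the scaffolding.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
    # Step 2: run the "Big Join Query" to connect data IDs.
    with scaffolding.connectDataIds(
        registry, collections, userQuery, empty_data_id
    ) as commonDataIds:
        # Step 3: resolve dataset refs and look up prerequisites.
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        # Step 4: assemble the QuantumGraph while the temporary table exists.
        return scaffolding.makeQuantumGraph(registry=registry)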
1513# ------------------------
1514# Exported definitions --
1515# ------------------------
1518class GraphBuilderError(Exception):
1519 """Base class for exceptions generated by graph builder."""
1521 pass
1524class OutputExistsError(GraphBuilderError):
1525 """Exception generated when output datasets already exist."""
1527 pass
1530class PrerequisiteMissingError(GraphBuilderError):
1531 """Exception generated when a prerequisite dataset does not exist."""
1533 pass
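# Illustrative sketch (not part of this module): how a caller might separate
# the specific failure modes defined above from other errors.  ``builder``
# and the remaining arguments are assumptions supplied by a caller.
def _example_handle_builder_errors(
    builder: GraphBuilder, pipeline: Pipeline, collections: Any, run: str
) -> QuantumGraph | None:
    try:
        return builder.makeGraph(pipeline, collections, run, userQuery=None)
    except OutputExistsError as err:
        _LOG.error("Outputs already exist; enable clobbering or skipping: %s", err)
    except GraphBuilderError as err:
        _LOG.error("Quantum graph generation failed: %s", err)
    return None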
1536class GraphBuilder:
1537 """GraphBuilder class is responsible for building task execution graph from
1538 a Pipeline.
1540 Parameters
1541 ----------
1542 registry : `~lsst.daf.butler.Registry`
1543 Registry for the data repository.
1544 skipExistingIn
1545 Expressions representing the collections to search for existing
1546 output datasets that should be skipped. See
1547 :ref:`daf_butler_ordered_collection_searches`.
1548 clobberOutputs : `bool`, optional
1549 If `True` (default), allow quanta to be created even if partial outputs
1550 exist; this requires the same behavior to be enabled when
1551 executing.
1552 datastore : `~lsst.daf.butler.Datastore`, optional
1553 If not `None` then fill datastore records in each generated Quantum.
1554 """
1556 def __init__(
1557 self,
1558 registry: Registry,
1559 skipExistingIn: Any = None,
1560 clobberOutputs: bool = True,
1561 datastore: Datastore | None = None,
1562 ):
1563 self.registry = registry
1564 self.dimensions = registry.dimensions
1565 self.skipExistingIn = skipExistingIn
1566 self.clobberOutputs = clobberOutputs
1567 self.datastore = datastore
1569 def makeGraph(
1570 self,
1571 pipeline: Pipeline | Iterable[TaskDef],
1572 collections: Any,
1573 run: str,
1574 userQuery: str | None,
1575 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1576 metadata: Mapping[str, Any] | None = None,
1577 bind: Mapping[str, Any] | None = None,
1578 dataId: DataCoordinate | None = None,
1579 ) -> QuantumGraph:
1580 """Create execution graph for a pipeline.
1582 Parameters
1583 ----------
1584 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ]
1585 Pipeline definition, task names/classes and their configs.
1586 collections
1587 Expressions representing the collections to search for input
1588 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1589 run : `str`
1590 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1591 output datasets. The collection does not have to exist; it will be
1592 created when the graph is executed.
1593 userQuery : `str`
1594 String that defines a user-provided selection for the registry; should
1595 be empty or `None` if there are no restrictions on data selection.
1596 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1597 The query constraint variant that should be used to constrain the
1598 query based on dataset existence; defaults to
1599 `DatasetQueryConstraintVariant.ALL`.
1600 metadata : `~collections.abc.Mapping` of `str` to primitives, optional
1601 This is an optional parameter of extra data to carry with the
1602 graph. Entries in this mapping should be able to be serialized in
1603 JSON.
1604 bind : `~collections.abc.Mapping`, optional
1605 Mapping containing literal values that should be injected into the
1606 ``userQuery`` expression, keyed by the identifiers they replace.
1607 dataId : `lsst.daf.butler.DataCoordinate`, optional
1608 Data ID that should also be included in the query constraint.
1610 Returns
1611 -------
1612 graph : `QuantumGraph`
1614 Raises
1615 ------
1616 UserExpressionError
1617 Raised when user expression cannot be parsed.
1618 OutputExistsError
1619 Raised when output datasets already exist.
1620 Exception
1621 Other exception types may be raised by underlying registry
1622 classes.
1623 """
1624 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1625 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1626 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1627 if dataId is None:
1628 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1629 if isinstance(pipeline, Pipeline):
1630 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId)
1631 with scaffolding.connectDataIds(
1632 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1633 ) as commonDataIds:
1634 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1635 scaffolding.resolveDatasetRefs(
1636 self.registry,
1637 collections,
1638 run,
1639 commonDataIds,
1640 skipExistingIn=self.skipExistingIn,
1641 clobberOutputs=self.clobberOutputs,
1642 constrainedByAllDatasets=condition,
1643 )
1644 return scaffolding.makeQuantumGraph(
1645 registry=self.registry, metadata=metadata, datastore=self.datastore
1646 )