Coverage for python/lsst/pipe/base/graphBuilder.py: 15%
544 statements
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap, defaultdict
34from collections.abc import Collection, Iterable, Iterator, Mapping
35from contextlib import contextmanager
36from dataclasses import dataclass
37from typing import Any, Optional
39from lsst.daf.butler import (
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
56from lsst.utils import doImportType
58from ._datasetQueryConstraints import DatasetQueryConstraintVariant
59from ._status import NoWorkFound
61# -----------------------------
62# Imports for other modules --
63# -----------------------------
64from .connections import AdjustQuantumHelper, iterConnections
65from .graph import QuantumGraph
66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
68# ----------------------------------
69# Local non-exported definitions --
70# ----------------------------------
72_LOG = logging.getLogger(__name__)
75@dataclass
76class _RefHolder:
77 """Placeholder for `DatasetRef` representing a future resolved reference.
79 Since unresolved DatasetRefs have been eliminated, `None` is now used to
80 represent a reference that has yet to be resolved. Information about the
81 corresponding dataset type and data ID is stored in the `_DatasetDict` mapping.
82 """
84 dataset_type: DatasetType
85 """Dataset type of the dataset to be created later. I need to store it here
86 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared
87 between different compatible dataset types."""
89 ref: DatasetRef | None = None
90 """Dataset reference, initially `None`, created when all datasets are
91 resolved.
92 """
94 @property
95 def resolved_ref(self) -> DatasetRef:
96 """Access resolved reference, should only be called after the
97 reference is set (`DatasetRef`)."""
98 assert self.ref is not None, "Dataset reference is not set."
99 return self.ref
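# A minimal sketch of the two-phase lifecycle this holder supports: it is
# created while the dataset is still unresolved, and ``ref`` is filled in later
# once the output run is known (much as `_DatasetIdMaker.resolveRef` does
# further down). The ``dataset_type``, ``data_id``, and ``run`` arguments are
# hypothetical placeholders, not names defined in this module.
def _example_ref_holder_lifecycle(
    dataset_type: DatasetType, data_id: DataCoordinate, run: str
) -> DatasetRef:
    holder = _RefHolder(dataset_type)  # holder.ref is None at this point.
    # Later, once the output run is known, resolve it in place.
    holder.ref = DatasetRef(dataset_type, data_id, run=run, conform=False)
    return holder.resolved_ref  # Safe only after the assignment above.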
102class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]):
103 """A custom dictionary that maps `DatasetType` to a nested dictionary of
104 the known `DatasetRef` instances of that type.
106 Parameters
107 ----------
108 args
109 Positional arguments are forwarded to the `dict` constructor.
110 universe : `DimensionUniverse`
111 Universe of all possible dimensions.
112 """
114 def __init__(self, *args: Any, universe: DimensionUniverse):
115 super().__init__(*args)
116 self.universe = universe
118 @classmethod
119 def fromDatasetTypes(
120 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
121 ) -> _DatasetDict:
122 """Construct a dictionary from a flat iterable of `DatasetType` keys.
124 Parameters
125 ----------
126 datasetTypes : `iterable` of `DatasetType`
127 DatasetTypes to use as keys for the dict. Values will be empty
128 dictionaries.
129 universe : `DimensionUniverse`
130 Universe of all possible dimensions.
132 Returns
133 -------
134 dictionary : `_DatasetDict`
135 A new `_DatasetDict` instance.
136 """
137 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
139 @classmethod
140 def fromSubset(
141 cls,
142 datasetTypes: Collection[DatasetType],
143 first: _DatasetDict,
144 *rest: _DatasetDict,
145 ) -> _DatasetDict:
146 """Return a new dictionary by extracting items corresponding to the
147 given keys from one or more existing dictionaries.
149 Parameters
150 ----------
151 datasetTypes : `iterable` of `DatasetType`
152 DatasetTypes to use as keys for the dict. Values will be obtained
153 by lookups against ``first`` and ``rest``.
154 first : `_DatasetDict`
155 Another dictionary from which to extract values.
156 rest
157 Additional dictionaries from which to extract values.
159 Returns
160 -------
161 dictionary : `_DatasetDict`
162 A new dictionary instance.
163 """
164 combined = ChainMap(first, *rest)
166 # Dataset types known to match immediately can be processed
167 # without checks.
168 matches = combined.keys() & set(datasetTypes)
169 _dict = {k: combined[k] for k in matches}
171 if len(_dict) < len(datasetTypes):
172 # Work out which ones are missing.
173 missing_datasetTypes = set(datasetTypes) - _dict.keys()
175 # Get the known names for comparison.
176 combined_by_name = {k.name: k for k in combined}
178 missing = set()
179 incompatible = {}
180 for datasetType in missing_datasetTypes:
181 # The dataset type is not found. It may not be listed
182 # or it may be that it is there with the same name
183 # but different definition.
184 if datasetType.name in combined_by_name:
185 # This implies some inconsistency in definitions
186 # for connections. If there is support for storage
187 # class conversion we can let it slide.
188 # At this point we do not know
189 # where the inconsistency is but trust that down
190 # stream code will be more explicit about input
191 # vs output incompatibilities.
192 existing = combined_by_name[datasetType.name]
193 convertible_to_existing = existing.is_compatible_with(datasetType)
194 convertible_from_existing = datasetType.is_compatible_with(existing)
195 if convertible_to_existing and convertible_from_existing:
196 _LOG.debug(
197 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
198 datasetType.name,
199 datasetType.storageClass_name,
200 existing.storageClass_name,
201 )
202 _dict[datasetType] = combined[existing]
203 elif convertible_to_existing or convertible_from_existing:
204 # Recognizing whether this is an error would require a
205 # fair amount of refactoring, so that is deferred until
206 # it is needed for other reasons (which should not be
207 # too long from now).
208 _LOG.info(
209 "Dataset type %s is present with multiple only partially-compatible storage "
210 "classes %s and %s.",
211 datasetType.name,
212 datasetType.storageClass_name,
213 existing.storageClass_name,
214 )
215 _dict[datasetType] = combined[existing]
216 else:
217 incompatible[datasetType] = existing
218 else:
219 missing.add(datasetType)
221 if missing or incompatible:
222 reasons = []
223 if missing:
224 reasons.append(
225 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known "
226 f"types: [{', '.join(d.name for d in combined)}]."
227 )
228 if incompatible:
229 for x, y in incompatible.items():
230 reasons.append(f"{x} incompatible with {y}")
231 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
233 return cls(_dict, universe=first.universe)
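    # A hedged illustration of the compatibility rule applied above, assuming
    # two hypothetical dataset types ``a`` and ``b`` that share a name but
    # differ only in storage class:
    #
    #     forward = a.is_compatible_with(b)
    #     backward = b.is_compatible_with(a)
    #     # fromSubset accepts the match if conversion works in at least one
    #     # direction; same-name types with no possible conversion are
    #     # reported as incompatible and raise KeyError.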
235 @property
236 def dimensions(self) -> DimensionGraph:
237 """The union of all dimensions used by all dataset types in this
238 dictionary, including implied dependencies (`DimensionGraph`).
239 """
240 base = self.universe.empty
241 if len(self) == 0:
242 return base
243 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
245 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
246 """Unpack nested single-element `DatasetRef` dicts into a new
247 mapping with `DatasetType` keys and `DatasetRef` values.
249 This method assumes that each nest contains exactly one item, as is the
250 case for all "init" datasets.
252 Returns
253 -------
254 dictionary : `NamedKeyDict`
255 Dictionary mapping `DatasetType` to `DatasetRef`, with both
256 `DatasetType` instances and string names usable as keys.
257 """
259 def getOne(refs: dict[DataCoordinate, _RefHolder]) -> DatasetRef:
260 (holder,) = refs.values()
261 return holder.resolved_ref
263 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
265 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
266 """Unpack nested multi-element `DatasetRef` dicts into a new
267 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
269 Returns
270 -------
271 dictionary : `NamedKeyDict`
272 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
273 both `DatasetType` instances and string names usable as keys.
274 """
275 return NamedKeyDict(
276 {
277 datasetType: list(holder.resolved_ref for holder in refs.values())
278 for datasetType, refs in self.items()
279 }
280 )
282 def extract(
283 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
284 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]:
285 """Iterate over the contained `DatasetRef` instances that match the
286 given `DatasetType` and data IDs.
288 Parameters
289 ----------
290 datasetType : `DatasetType`
291 Dataset type to match.
292 dataIds : `Iterable` [ `DataCoordinate` ]
293 Data IDs to match.
295 Returns
296 -------
297 refs : `Iterator` [ `tuple` [ `DataCoordinate`, `DatasetRef` or `None` ] ]
298 Tuples of data ID and the `DatasetRef` registered for it (or `None`
299 if not yet resolved), for each data ID in ``dataIds``.
300 """
301 refs = self[datasetType]
302 return ((dataId, refs[dataId].ref) for dataId in dataIds)
304 def isdisjoint(self, other: _DatasetDict) -> bool:
305 """Test whether ``self`` and ``other`` have any datasets in common.
307 Datasets are considered in common if they have the same *parent*
308 dataset type name and data ID; storage classes and components are not
309 considered.
310 """
311 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()}
312 for k, v in other.items():
313 parent_name, _ = k.nameAndComponent()
314 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()):
315 return False
316 return True
318 def iter_resolved_refs(self) -> Iterator[DatasetRef]:
319 """Iterate over all DatasetRef instances held by this data structure,
320 assuming that each `_RefHolder` already carries a resolved ref.
321 """
322 for holders_by_data_id in self.values():
323 for holder in holders_by_data_id.values():
324 yield holder.resolved_ref
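# A minimal sketch of how _DatasetDict is used by the scaffolding classes
# below: keys are created up front from the known dataset types, holders are
# added per data ID as query rows are processed, and the unpack methods are
# used once everything is resolved. ``dataset_types``, ``data_id`` (assumed to
# already match each type's dimensions), and ``universe`` are supplied by the
# caller.
def _example_dataset_dict(
    dataset_types: Iterable[DatasetType],
    data_id: DataCoordinate,
    universe: DimensionUniverse,
) -> None:
    refs = _DatasetDict.fromDatasetTypes(dataset_types, universe=universe)
    for dataset_type, holders in refs.items():
        # Same pattern as connectDataIds: one placeholder per data ID.
        holders[data_id] = _RefHolder(dataset_type)
    # After resolution (see _DatasetIdMaker), refs can be unpacked:
    #     single = refs.unpackSingleRefs()   # DatasetType -> DatasetRef
    #     multi = refs.unpackMultiRefs()     # DatasetType -> list[DatasetRef]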
327class _QuantumScaffolding:
328 """Helper class aggregating information about a `Quantum`, used when
329 constructing a `QuantumGraph`.
331 See `_PipelineScaffolding` for a top-down description of the full
332 scaffolding data structure.
334 Parameters
335 ----------
336 task : _TaskScaffolding
337 Back-reference to the helper object for the `PipelineTask` this quantum
338 represents an execution of.
339 dataId : `DataCoordinate`
340 Data ID for this quantum.
341 """
343 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
344 self.task = task
345 self.dataId = dataId
346 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
347 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
348 self.prerequisites = _DatasetDict.fromDatasetTypes(
349 task.prerequisites.keys(), universe=dataId.universe
350 )
352 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
354 def __repr__(self) -> str:
355 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
357 task: _TaskScaffolding
358 """Back-reference to the helper object for the `PipelineTask` this quantum
359 represents an execution of.
360 """
362 dataId: DataCoordinate
363 """Data ID for this quantum.
364 """
366 inputs: _DatasetDict
367 """Nested dictionary containing `DatasetRef` inputs to this quantum.
369 This is initialized to map each `DatasetType` to an empty dictionary at
370 construction. Those nested dictionaries are populated (with data IDs as
371 keys) with `_RefHolder` placeholders in
372 `_PipelineScaffolding.connectDataIds`.
373 """
375 outputs: _DatasetDict
376 """Nested dictionary containing `DatasetRef` outputs this quantum.
377 """
379 prerequisites: _DatasetDict
380 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
381 quantum.
382 """
384 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
385 """Transform the scaffolding object into a true `Quantum` instance.
387 Parameters
388 ----------
389 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
390 If not `None` then fill datastore records in each generated Quantum
391 using the records from this structure.
393 Returns
394 -------
395 quantum : `Quantum`
396 An actual `Quantum` instance.
397 """
398 allInputs = self.inputs.unpackMultiRefs()
399 allInputs.update(self.prerequisites.unpackMultiRefs())
400 # Give the task's Connections class an opportunity to remove some
401 # inputs, or complain if they are unacceptable.
402 # This will raise if one of the check conditions is not met, which is
403 # the intended behavior.
404 # If it raises NoWorkFound, there is a bug in the QG algorithm
405 # or the adjustQuantum is incorrectly trying to make a prerequisite
406 # input behave like a regular input; adjustQuantum should only raise
407 # NoWorkFound if a regular input is missing, and it shouldn't be
408 # possible for us to have generated ``self`` if that's true.
409 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
410 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
411 initInputs = self.task.initInputs.unpackSingleRefs()
412 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
413 if datastore_records is not None:
414 quantum_records = {}
415 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
416 input_refs += list(initInputs.values())
417 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
418 for datastore_name, records in datastore_records.items():
419 matching_records = records.subset(input_ids)
420 if matching_records is not None:
421 quantum_records[datastore_name] = matching_records
422 return Quantum(
423 taskName=self.task.taskDef.taskName,
424 taskClass=self.task.taskDef.taskClass,
425 dataId=self.dataId,
426 initInputs=initInputs,
427 inputs=helper.inputs,
428 outputs=helper.outputs,
429 datastore_records=quantum_records,
430 )
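# A hedged sketch of driving makeQuantum with datastore records, mirroring how
# _PipelineScaffolding.makeQuantumGraph (below) exports records before building
# quanta. ``scaffolding`` is assumed to be fully resolved already; the real
# code also exports records for init-inputs and prerequisites.
def _example_make_quantum(scaffolding: _QuantumScaffolding, datastore: Datastore) -> Quantum:
    # Export records only for the refs this quantum actually consumes; the
    # method then subsets them further by dataset ID.
    records = datastore.export_records(scaffolding.inputs.iter_resolved_refs())
    return scaffolding.makeQuantum(datastore_records=records)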
433@dataclass
434class _TaskScaffolding:
435 """Helper class aggregating information about a `PipelineTask`, used when
436 constructing a `QuantumGraph`.
438 See `_PipelineScaffolding` for a top-down description of the full
439 scaffolding data structure.
441 Parameters
442 ----------
443 taskDef : `TaskDef`
444 Data structure that identifies the task class and its config.
445 parent : `_PipelineScaffolding`
446 The parent data structure that will hold the instance being
447 constructed.
448 datasetTypes : `TaskDatasetTypes`
449 Data structure that categorizes the dataset types used by this task.
450 """
452 def __init__(
453 self,
454 taskDef: TaskDef,
455 parent: _PipelineScaffolding,
456 datasetTypes: TaskDatasetTypes,
457 ):
458 universe = parent.dimensions.universe
459 self.taskDef = taskDef
460 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
461 assert self.dimensions.issubset(parent.dimensions)
462 # Initialize _DatasetDicts as subsets of the one or two
463 # corresponding dicts in the parent _PipelineScaffolding.
464 self.initInputs = _DatasetDict.fromSubset(
465 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
466 )
467 self.initOutputs = _DatasetDict.fromSubset(
468 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
469 )
470 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
471 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
472 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
473 self.dataIds: set[DataCoordinate] = set()
474 self.quanta = {}
476 def __repr__(self) -> str:
477 # Default dataclass-injected __repr__ gets caught in an infinite loop
478 # because of back-references.
479 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
481 taskDef: TaskDef
482 """Data structure that identifies the task class and its config
483 (`TaskDef`).
484 """
486 dimensions: DimensionGraph
487 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
488 """
490 initInputs: _DatasetDict
491 """Dictionary containing information about datasets used to construct this
492 task (`_DatasetDict`).
493 """
495 initOutputs: _DatasetDict
496 """Dictionary containing information about datasets produced as a
497 side-effect of constructing this task (`_DatasetDict`).
498 """
500 inputs: _DatasetDict
501 """Dictionary containing information about datasets used as regular,
502 graph-constraining inputs to this task (`_DatasetDict`).
503 """
505 outputs: _DatasetDict
506 """Dictionary containing information about datasets produced by this task
507 (`_DatasetDict`).
508 """
510 prerequisites: _DatasetDict
511 """Dictionary containing information about input datasets that must be
512 present in the repository before any Pipeline containing this task is run
513 (`_DatasetDict`).
514 """
516 quanta: dict[DataCoordinate, _QuantumScaffolding]
517 """Dictionary mapping data ID to a scaffolding object for the Quantum of
518 this task with that data ID.
519 """
521 def makeQuantumSet(
522 self,
523 missing: _DatasetDict,
524 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
525 ) -> set[Quantum]:
526 """Create a `set` of `Quantum` from the information in ``self``.
528 Parameters
529 ----------
530 missing : `_DatasetDict`
531 Input datasets that have not been found.
532 datastore_records : `dict`, optional
533 Records from the datastore to export with quanta.
535 Returns
536 -------
537 nodes : `set` of `Quantum`
538 The `Quantum` elements corresponding to this task.
539 """
540 outputs = set()
541 for q in self.quanta.values():
542 try:
543 tmpQuantum = q.makeQuantum(datastore_records)
544 outputs.add(tmpQuantum)
545 except (NoWorkFound, FileNotFoundError) as exc:
546 if not missing.isdisjoint(q.inputs):
547 # This is a node that is known to be pruned later and
548 # should be left in even though some follow up queries
549 # fail. This allows the pruning to start from this quantum
550 # with known issues, and prune other nodes it touches.
551 inputs = q.inputs.unpackMultiRefs()
552 inputs.update(q.prerequisites.unpackMultiRefs())
553 tmpQuantum = Quantum(
554 taskName=q.task.taskDef.taskName,
555 taskClass=q.task.taskDef.taskClass,
556 dataId=q.dataId,
557 initInputs=q.task.initInputs.unpackSingleRefs(),
558 inputs=inputs,
559 outputs=q.outputs.unpackMultiRefs(),
560 )
561 outputs.add(tmpQuantum)
562 else:
563 raise exc
564 return outputs
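# A hedged sketch of how makeQuantumSet is called from makeQuantumGraph below:
# the pipeline-level ``missing`` dictionary lets quanta with known-missing
# inputs survive here so that graph-level pruning can start from them.
def _example_make_quantum_sets(
    tasks: list[_TaskScaffolding], missing: _DatasetDict
) -> dict[TaskDef, set[Quantum]]:
    # Same shape as the ``graphInput`` mapping built in makeQuantumGraph.
    return {task.taskDef: task.makeQuantumSet(missing=missing) for task in tasks}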
567class _DatasetIdMaker:
568 """Helper class which generates random dataset UUIDs for unresolved
569 datasets.
570 """
572 def __init__(self, run: str):
573 self.run = run
574 # Cache of dataset refs generated so far.
575 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {}
577 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef:
578 # For components we need their parent dataset ID.
579 if dataset_type.isComponent():
580 parent_type = dataset_type.makeCompositeDatasetType()
581 # Parent should be resolved if this is an existing input, or it
582 # should be in the cache already if it is an intermediate.
583 key = parent_type, data_id
584 if key not in self.resolved:
585 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}")
586 parent_ref = self.resolved[key]
587 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved"
588 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False)
590 key = dataset_type, data_id
591 if (resolved := self.resolved.get(key)) is None:
592 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False)
593 self.resolved[key] = resolved
594 return resolved
596 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None:
597 """Resolve all unresolved references in the provided dictionary."""
598 for data_id, holder in refs.items():
599 if holder.ref is None:
600 holder.ref = self.resolveRef(holder.dataset_type, data_id)
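# A minimal sketch of _DatasetIdMaker usage as it appears in resolveDatasetRefs
# below: a single maker per output run is reused across all dictionaries so
# that the same (dataset type, data ID) pair always resolves to the same ref.
def _example_resolve_outputs(output_run: str, outputs: _DatasetDict) -> None:
    id_maker = _DatasetIdMaker(output_run)
    for dataset_type, holders_by_data_id in outputs.items():
        id_maker.resolveDict(dataset_type, holders_by_data_id)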
603@dataclass
604class _PipelineScaffolding:
605 """A helper data structure that organizes the information involved in
606 constructing a `QuantumGraph` for a `Pipeline`.
608 Parameters
609 ----------
610 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
611 Sequence of tasks from which a graph is to be constructed. Must
612 have nested task classes already imported.
613 universe : `DimensionUniverse`
614 Universe of all possible dimensions.
616 Notes
617 -----
618 The scaffolding data structure contains nested data structures for both
619 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
620 data structures are shared between the pipeline-level structure (which
621 aggregates all datasets and categorizes them from the perspective of the
622 complete pipeline) and the individual tasks that use them as inputs and
623 outputs.
625 `QuantumGraph` construction proceeds in four steps, with each corresponding
626 to a different `_PipelineScaffolding` method:
628 1. When `_PipelineScaffolding` is constructed, we extract and categorize
629 the DatasetTypes used by the pipeline (delegating to
630 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
631 nested `_TaskScaffolding` and `_DatasetDict` objects.
633 2. In `connectDataIds`, we construct and run the "Big Join Query", which
634 returns related tuples of all dimensions used to identify any regular
635 input, output, and intermediate datasets (not prerequisites). We then
636 iterate over these tuples of related dimensions, identifying the subsets
637 that correspond to distinct data IDs for each task and dataset type,
638 and then create `_QuantumScaffolding` objects.
640 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
641 dataset data IDs previously identified, transforming unresolved
642 DatasetRefs into resolved DatasetRefs where appropriate. We then look
643 up prerequisite datasets for all quanta.
645 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
646 per-task `_QuantumScaffolding` objects.
647 """
649 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry):
650 _LOG.debug("Initializing data structures for QuantumGraph generation.")
651 self.tasks = []
652 # Aggregate and categorize the DatasetTypes in the Pipeline.
653 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
654 # Construct dictionaries that map those DatasetTypes to structures
655 # that will (later) hold additional information about them.
656 for attr in (
657 "initInputs",
658 "initIntermediates",
659 "initOutputs",
660 "inputs",
661 "intermediates",
662 "outputs",
663 "prerequisites",
664 ):
665 setattr(
666 self,
667 attr,
668 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
669 )
670 self.missing = _DatasetDict(universe=registry.dimensions)
671 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
672 # Aggregate all dimensions for all non-init, non-prerequisite
673 # DatasetTypes. These are the ones we'll include in the big join
674 # query.
675 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
676 # Construct scaffolding nodes for each Task, and add backreferences
677 # to the Task from each DatasetScaffolding node.
678 # Note that there's only one scaffolding node for each DatasetType,
679 # shared by _PipelineScaffolding and all _TaskScaffoldings that
680 # reference it.
681 if isinstance(pipeline, Pipeline):
682 pipeline = pipeline.toExpandedPipeline()
683 self.tasks = [
684 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
685 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
686 ]
688 def __repr__(self) -> str:
689 # Default dataclass-injected __repr__ gets caught in an infinite loop
690 # because of back-references.
691 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
693 tasks: list[_TaskScaffolding]
694 """Scaffolding data structures for each task in the pipeline
695 (`list` of `_TaskScaffolding`).
696 """
698 initInputs: _DatasetDict
699 """Datasets consumed but not produced when constructing the tasks in this
700 pipeline (`_DatasetDict`).
701 """
703 initIntermediates: _DatasetDict
704 """Datasets that are both consumed and produced when constructing the tasks
705 in this pipeline (`_DatasetDict`).
706 """
708 initOutputs: _DatasetDict
709 """Datasets produced but not consumed when constructing the tasks in this
710 pipeline (`_DatasetDict`).
711 """
713 inputs: _DatasetDict
714 """Datasets that are consumed but not produced when running this pipeline
715 (`_DatasetDict`).
716 """
718 intermediates: _DatasetDict
719 """Datasets that are both produced and consumed when running this pipeline
720 (`_DatasetDict`).
721 """
723 outputs: _DatasetDict
724 """Datasets produced but not consumed when when running this pipeline
725 (`_DatasetDict`).
726 """
728 prerequisites: _DatasetDict
729 """Datasets that are consumed when running this pipeline and looked up
730 per-Quantum when generating the graph (`_DatasetDict`).
731 """
733 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
734 """Datasets that should be used as constraints in the initial query,
735 according to tasks (`NamedValueSet`).
736 """
738 dimensions: DimensionGraph
739 """All dimensions used by any regular input, intermediate, or output
740 (not prerequisite) dataset; the set of dimensions used in the "Big Join
741 Query" (`DimensionGraph`).
743 This is required to be a superset of all task quantum dimensions.
744 """
746 missing: _DatasetDict
747 """Datasets whose existence was originally predicted but were not
748 actually found.
750 Quanta that require these datasets as inputs will be pruned (recursively)
751 when actually constructing a `QuantumGraph` object.
753 These are currently populated only when the "initial dataset query
754 constraint" does not include all overall-input dataset types, and hence the
755 initial data ID query can include data IDs that it should not.
756 """
758 globalInitOutputs: _DatasetDict | None = None
759 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
760 """
762 @contextmanager
763 def connectDataIds(
764 self,
765 registry: Registry,
766 collections: Any,
767 userQuery: Optional[str],
768 externalDataId: DataCoordinate,
769 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
770 bind: Optional[Mapping[str, Any]] = None,
771 ) -> Iterator[DataCoordinateQueryResults]:
772 """Query for the data IDs that connect nodes in the `QuantumGraph`.
774 This method populates `_TaskScaffolding.quanta` and the data ID keys of
775 each nested `_DatasetDict` (except for those in `prerequisites`).
777 Parameters
778 ----------
779 registry : `lsst.daf.butler.Registry`
780 Registry for the data repository; used for all data ID queries.
781 collections
782 Expressions representing the collections to search for input
783 datasets. See :ref:`daf_butler_ordered_collection_searches`.
784 userQuery : `str` or `None`
785 User-provided expression to limit the data IDs processed.
786 externalDataId : `DataCoordinate`
787 Externally-provided data ID that should be used to restrict the
788 results, just as if these constraints had been included via ``AND``
789 in ``userQuery``. This includes (at least) any instrument named
790 in the pipeline definition.
791 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
792 The query constraint variant that should be used to constrain the
793 query based on dataset existence, defaults to
794 `DatasetQueryConstraintVariant.ALL`.
795 bind : `Mapping`, optional
796 Mapping containing literal values that should be injected into the
797 ``userQuery`` expression, keyed by the identifiers they replace.
799 Returns
800 -------
801 commonDataIds : \
802 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
803 An interface to a database temporary table containing all data IDs
804 that will appear in this `QuantumGraph`. Returned inside a
805 context manager, which will drop the temporary table at the end of
806 the `with` block in which this method is called.
807 """
808 _LOG.debug("Building query for data IDs.")
809 # Initialization datasets always have empty data IDs.
810 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
811 for datasetType, refs in itertools.chain(
812 self.initInputs.items(),
813 self.initIntermediates.items(),
814 self.initOutputs.items(),
815 ):
816 refs[emptyDataId] = _RefHolder(datasetType)
817 # Run one big query for the data IDs for task dimensions and regular
818 # inputs and outputs. We limit the query to only dimensions that are
819 # associated with the input dataset types, but don't (yet) try to
820 # obtain the dataset_ids for those inputs.
821 _LOG.debug(
822 "Submitting data ID query over dimensions %s and materializing results.",
823 list(self.dimensions.names),
824 )
825 queryArgs: dict[str, Any] = {
826 "dimensions": self.dimensions,
827 "where": userQuery,
828 "dataId": externalDataId,
829 "bind": bind,
830 }
831 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
832 _LOG.debug(
833 "Constraining graph query using default of %s.",
834 list(self.defaultDatasetQueryConstraints.names),
835 )
836 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
837 queryArgs["collections"] = collections
838 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
839 _LOG.debug("Not using dataset existence to constrain query.")
840 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
841 constraint = set(datasetQueryConstraint)
842 inputs = {k.name: k for k in self.inputs.keys()}
843 if remainder := constraint.difference(inputs.keys()):
844 raise ValueError(
845 f"{remainder} dataset type(s) specified as a graph constraint, but"
846 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
847 )
848 _LOG.debug(f"Constraining graph query using {constraint}")
849 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
850 queryArgs["collections"] = collections
851 else:
852 raise ValueError(
853 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
854 )
856 if "datasets" in queryArgs:
857 for i, dataset_type in enumerate(queryArgs["datasets"]):
858 if dataset_type.isComponent():
859 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
861 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
862 _LOG.debug("Expanding data IDs.")
863 commonDataIds = commonDataIds.expanded()
864 _LOG.debug("Iterating over query results to associate quanta with datasets.")
865 # Iterate over query results, populating data IDs for datasets and
866 # quanta and then connecting them to each other.
867 n = -1
868 for n, commonDataId in enumerate(commonDataIds):
869 # Create DatasetRefs for all DatasetTypes from this result row,
870 # noting that we might have created some already.
871 # We remember both those that already existed and those that we
872 # create now.
873 refsForRow = {}
874 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {}
875 for datasetType, refs in itertools.chain(
876 self.inputs.items(),
877 self.intermediates.items(),
878 self.outputs.items(),
879 ):
880 datasetDataId: Optional[DataCoordinate]
881 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
882 datasetDataId = commonDataId.subset(datasetType.dimensions)
883 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
884 ref_holder = refs.get(datasetDataId)
885 if ref_holder is None:
886 ref_holder = _RefHolder(datasetType)
887 refs[datasetDataId] = ref_holder
888 refsForRow[datasetType.name] = ref_holder
889 # Create _QuantumScaffolding objects for all tasks from this
890 # result row, noting that we might have created some already.
891 for task in self.tasks:
892 quantumDataId = commonDataId.subset(task.dimensions)
893 quantum = task.quanta.get(quantumDataId)
894 if quantum is None:
895 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
896 task.quanta[quantumDataId] = quantum
897 # Whether this is a new quantum or an existing one, we can
898 # now associate the DatasetRefs for this row with it. The
899 # fact that a Quantum data ID and a dataset data ID both
900 # came from the same result row is what tells us they
901 # should be associated.
902 # Many of these associations will be duplicates (because
903 # another query row that differed from this one only in
904 # irrelevant dimensions already added them), and we use
905 # sets to skip.
906 for datasetType in task.inputs:
907 dataId = dataIdCacheForRow[datasetType.dimensions]
908 ref_holder = refsForRow[datasetType.name]
909 quantum.inputs[datasetType.name][dataId] = ref_holder
910 for datasetType in task.outputs:
911 dataId = dataIdCacheForRow[datasetType.dimensions]
912 ref_holder = refsForRow[datasetType.name]
913 quantum.outputs[datasetType.name][dataId] = ref_holder
914 if n < 0:
915 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
916 emptiness_explained = False
917 for message in commonDataIds.explain_no_results():
918 _LOG.critical(message)
919 emptiness_explained = True
920 if not emptiness_explained:
921 _LOG.critical(
922 "To reproduce this query for debugging purposes, run "
923 "Registry.queryDataIds with these arguments:"
924 )
925 # We could just repr() the queryArgs dict to get something
926 # the user could make sense of, but it's friendlier to
927 # put these args in an easier-to-construct equivalent form
928 # so they can read it more easily and copy and paste into
929 # a Python terminal.
930 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
931 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
932 if queryArgs["where"]:
933 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
934 if "datasets" in queryArgs:
935 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
936 if "collections" in queryArgs:
937 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
938 _LOG.debug("Finished processing %d rows from data ID query.", n)
939 yield commonDataIds
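    # For reference, the query submitted above is equivalent to a direct call
    # of roughly this shape (a hedged sketch; the ``datasets`` and
    # ``collections`` arguments are only present for the ALL and LIST
    # constraint variants):
    #
    #     with registry.queryDataIds(
    #         dimensions=self.dimensions,
    #         where=userQuery,
    #         dataId=externalDataId,
    #         bind=bind,
    #         datasets=list(self.defaultDatasetQueryConstraints),
    #         collections=collections,
    #     ).materialize() as commonDataIds:
    #         commonDataIds = commonDataIds.expanded()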
941 def resolveDatasetRefs(
942 self,
943 registry: Registry,
944 collections: Any,
945 run: str,
946 commonDataIds: DataCoordinateQueryResults,
947 *,
948 skipExistingIn: Any = None,
949 clobberOutputs: bool = True,
950 constrainedByAllDatasets: bool = True,
951 ) -> None:
952 """Perform follow up queries for each dataset data ID produced in
953 `fillDataIds`.
955 This method populates the `_RefHolder` references in each `_DatasetDict`
956 (except for those in `prerequisites`).
958 Parameters
959 ----------
960 registry : `lsst.daf.butler.Registry`
961 Registry for the data repository; used for all data ID queries.
962 collections
963 Expressions representing the collections to search for input
964 datasets. See :ref:`daf_butler_ordered_collection_searches`.
965 run : `str`
966 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
967 output datasets, if it already exists.
968 commonDataIds : \
969 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
970 Result of a previous call to `connectDataIds`.
971 skipExistingIn
972 Expressions representing the collections to search for existing
973 output datasets that should be skipped. See
974 :ref:`daf_butler_ordered_collection_searches` for allowed types.
975 `None` or empty string/sequence disables skipping.
976 clobberOutputs : `bool`, optional
977 If `True` (default), allow quanta to be created even if outputs exist;
978 this requires the same behavior to be enabled when
979 executing. If ``skipExistingIn`` is not `None`, completed quanta
980 (those with metadata, or all outputs if there is no metadata
981 dataset configured) will be skipped rather than clobbered.
982 constrainedByAllDatasets : `bool`, optional
983 Indicates if the commonDataIds were generated with a constraint on
984 all dataset types.
986 Raises
987 ------
988 OutputExistsError
989 Raised if an output dataset already exists in the output run
990 and ``skipExistingIn`` does not include output run, or if only
991 some outputs are present and ``clobberOutputs`` is `False`.
992 """
993 # Run may be provided but it does not have to exist; in that case we
994 # use it for resolving references but don't check it for existing refs.
995 run_exists = False
996 if run:
997 try:
998 run_exists = bool(registry.queryCollections(run))
999 except MissingCollectionError:
1000 # An undocumented exception is raised if it does not exist.
1001 pass
1003 skip_collections_wildcard: CollectionWildcard | None = None
1004 skipExistingInRun = False
1005 if skipExistingIn:
1006 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
1007 if run_exists:
1008 # As an optimization, check the explicit list of names first.
1009 skipExistingInRun = run in skip_collections_wildcard.strings
1010 if not skipExistingInRun:
1011 # need to flatten it and check again
1012 skipExistingInRun = run in registry.queryCollections(
1013 skipExistingIn,
1014 collectionTypes=CollectionType.RUN,
1015 )
1017 idMaker = _DatasetIdMaker(run)
1019 resolvedRefQueryResults: Iterable[DatasetRef]
1021 # Updating constrainedByAllDatasets here is not ideal, but we have a
1022 # few different code paths that each transfer different pieces of
1023 # information about what dataset query constraints were applied here,
1024 # and none of them has the complete picture until we get here. We're
1025 # long overdue for a QG generation rewrite that will make this go away
1026 # entirely anyway.
1027 constrainedByAllDatasets = (
1028 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
1029 )
1031 # Look up [init] intermediate and output datasets in the output
1032 # collection, if there is an output collection.
1033 if run_exists or skip_collections_wildcard is not None:
1034 for datasetType, refs in itertools.chain(
1035 self.initIntermediates.items(),
1036 self.initOutputs.items(),
1037 self.intermediates.items(),
1038 self.outputs.items(),
1039 ):
1040 _LOG.debug(
1041 "Resolving %d datasets for intermediate and/or output dataset %s.",
1042 len(refs),
1043 datasetType.name,
1044 )
1045 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
1046 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
1047 # TODO: this assert incorrectly bans component inputs;
1048 # investigate on DM-33027.
1049 # assert not datasetType.isComponent(), \
1050 # "Output datasets cannot be components."
1051 #
1052 # Instead we have to handle them manually to avoid a
1053 # deprecation warning, but it is at least confusing and
1054 # possibly a bug for components to appear here at all.
1055 if datasetType.isComponent():
1056 parent_dataset_type = datasetType.makeCompositeDatasetType()
1057 component = datasetType.component()
1058 else:
1059 parent_dataset_type = datasetType
1060 component = None
1062 # look at RUN collection first
1063 if run_exists:
1064 try:
1065 resolvedRefQueryResults = subset.findDatasets(
1066 parent_dataset_type, collections=run, findFirst=True
1067 )
1068 except MissingDatasetTypeError:
1069 resolvedRefQueryResults = []
1070 for resolvedRef in resolvedRefQueryResults:
1071 # TODO: we could easily support per-DatasetType
1072 # skipExisting and I could imagine that being useful -
1073 # it's probably required in order to support writing
1074 # initOutputs before QuantumGraph generation.
1075 assert resolvedRef.dataId in refs
1076 if not (skipExistingInRun or isInit or clobberOutputs):
1077 raise OutputExistsError(
1078 f"Output dataset {datasetType.name} already exists in "
1079 f"output RUN collection '{run}' with data ID"
1080 f" {resolvedRef.dataId}."
1081 )
1082 # To resolve all outputs we have to remember existing
1083 # ones to avoid generating new dataset IDs for them.
1084 refs[resolvedRef.dataId].ref = (
1085 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1086 )
1088 # Also check skipExistingIn; the case where the RUN collection
1089 # is part of it is handled above.
1090 if skip_collections_wildcard is not None:
1091 try:
1092 resolvedRefQueryResults = subset.findDatasets(
1093 parent_dataset_type,
1094 collections=skip_collections_wildcard,
1095 findFirst=True,
1096 )
1097 except MissingDatasetTypeError:
1098 resolvedRefQueryResults = []
1099 for resolvedRef in resolvedRefQueryResults:
1100 if resolvedRef.dataId not in refs:
1101 continue
1102 refs[resolvedRef.dataId].ref = (
1103 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1104 )
1106 # Look up input and initInput datasets in the input collection(s). We
1107 # accumulate datasets in self.missing, if the common data IDs were not
1108 # constrained on dataset type existence.
1109 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1110 _LOG.debug(
1111 "Resolving %d datasets for input dataset %s.",
1112 len(refs),
1113 datasetType.name,
1114 )
1115 if datasetType.isComponent():
1116 parent_dataset_type = datasetType.makeCompositeDatasetType()
1117 component = datasetType.component()
1118 else:
1119 parent_dataset_type = datasetType
1120 component = None
1121 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {}
1122 try:
1123 resolvedRefQueryResults = commonDataIds.subset(
1124 datasetType.dimensions, unique=True
1125 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1126 except MissingDatasetTypeError:
1127 resolvedRefQueryResults = []
1128 dataIdsNotFoundYet = set(refs.keys())
1129 for resolvedRef in resolvedRefQueryResults:
1130 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1131 if resolvedRef.dataId not in refs:
1132 continue
1133 refs[resolvedRef.dataId].ref = (
1134 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1135 )
1136 if dataIdsNotFoundYet:
1137 if constrainedByAllDatasets:
1138 raise RuntimeError(
1139 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1140 f"'{datasetType.name}' was/were present in a previous "
1141 "query, but could not be found now. "
1142 "This is either a logic bug in QuantumGraph generation "
1143 "or the input collections have been modified since "
1144 "QuantumGraph generation began."
1145 )
1146 elif not datasetType.dimensions:
1147 raise RuntimeError(
1148 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1149 f"collections {collections}."
1150 )
1151 else:
1152 # If the common dataIds were not constrained using all the
1153 # input dataset types, it is possible that some data ids
1154 # found don't correspond to existing datasets. Mark these
1155 # for later pruning from the quantum graph.
1156 for k in dataIdsNotFoundYet:
1157 missing_for_dataset_type[k] = refs[k]
1158 if missing_for_dataset_type:
1159 self.missing[datasetType] = missing_for_dataset_type
1161 # Resolve the missing refs, just so they look like all of the others;
1162 # in the end other code will make sure they never appear in the QG.
1163 for dataset_type, refDict in self.missing.items():
1164 idMaker.resolveDict(dataset_type, refDict)
1166 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1167 # replacing the unresolved refs there, and then look up prerequisites.
1168 for task in self.tasks:
1169 _LOG.debug(
1170 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1171 len(task.quanta),
1172 task.taskDef.label,
1173 )
1174 # The way iterConnections is designed makes it impossible to
1175 # annotate precisely enough to satisfy MyPy here.
1176 lookupFunctions = {
1177 c.name: c.lookupFunction # type: ignore
1178 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1179 if c.lookupFunction is not None # type: ignore
1180 }
1181 dataIdsFailed = []
1182 dataIdsSucceeded = []
1183 for quantum in task.quanta.values():
1184 # Process output datasets only if skipExistingIn is not None
1185 # or there is a run to look for outputs in and clobberOutputs
1186 # is True. Note that if skipExistingIn is None, any output
1187 # datasets that already exist would have already caused an
1188 # exception to be raised.
1189 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1190 resolvedRefs = []
1191 unresolvedDataIds = []
1192 haveMetadata = False
1193 for datasetType, originalRefs in quantum.outputs.items():
1194 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()):
1195 if ref is not None:
1196 resolvedRefs.append(ref)
1197 originalRefs[dataId].ref = ref
1198 if datasetType.name == task.taskDef.metadataDatasetName:
1199 haveMetadata = True
1200 else:
1201 unresolvedDataIds.append((datasetType, dataId))
1202 if resolvedRefs:
1203 if haveMetadata or not unresolvedDataIds:
1204 dataIdsSucceeded.append(quantum.dataId)
1205 if skip_collections_wildcard is not None:
1206 continue
1207 else:
1208 dataIdsFailed.append(quantum.dataId)
1209 if not clobberOutputs:
1210 raise OutputExistsError(
1211 f"Quantum {quantum.dataId} of task with label "
1212 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1213 f"({resolvedRefs}) "
1214 f"and others that don't ({unresolvedDataIds}), with no metadata output, "
1215 "and clobbering outputs was not enabled."
1216 )
1217 # Update the input DatasetRefs to the resolved ones we already
1218 # searched for.
1219 for datasetType, input_refs in quantum.inputs.items():
1220 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()):
1221 input_refs[data_id].ref = ref
1222 # Look up prerequisite datasets in the input collection(s).
1223 # These may have dimensions that extend beyond those we queried
1224 # for originally, because we want to permit those data ID
1225 # values to differ across quanta and dataset types.
1226 for datasetType in task.prerequisites:
1227 if datasetType.isComponent():
1228 parent_dataset_type = datasetType.makeCompositeDatasetType()
1229 component = datasetType.component()
1230 else:
1231 parent_dataset_type = datasetType
1232 component = None
1233 lookupFunction = lookupFunctions.get(datasetType.name)
1234 if lookupFunction is not None:
1235 # PipelineTask has provided its own function to do the
1236 # lookup. This always takes precedence.
1237 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1238 elif (
1239 datasetType.isCalibration()
1240 and datasetType.dimensions <= quantum.dataId.graph
1241 and quantum.dataId.graph.temporal
1242 ):
1243 # This is a master calibration lookup, which we have to
1244 # handle specially because the query system can't do a
1245 # temporal join on a non-dimension-based timespan yet.
1246 timespan = quantum.dataId.timespan
1247 try:
1248 prereq_ref = registry.findDataset(
1249 parent_dataset_type,
1250 quantum.dataId,
1251 collections=collections,
1252 timespan=timespan,
1253 )
1254 if prereq_ref is not None:
1255 if component is not None:
1256 prereq_ref = prereq_ref.makeComponentRef(component)
1257 prereq_refs = [prereq_ref]
1258 else:
1259 prereq_refs = []
1260 except (KeyError, MissingDatasetTypeError):
1261 # This dataset type is not present in the registry,
1262 # which just means there are no datasets here.
1263 prereq_refs = []
1264 else:
1265 # Most general case.
1266 prereq_refs = [
1267 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1268 for prereq_ref in registry.queryDatasets(
1269 parent_dataset_type,
1270 collections=collections,
1271 dataId=quantum.dataId,
1272 findFirst=True,
1273 ).expanded()
1274 ]
1276 for ref in prereq_refs:
1277 if ref is not None:
1278 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1279 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref)
1281 # Resolve all quantum inputs and outputs.
1282 for datasetDict in (quantum.inputs, quantum.outputs):
1283 for dataset_type, refDict in datasetDict.items():
1284 idMaker.resolveDict(dataset_type, refDict)
1286 # Resolve task initInputs and initOutputs.
1287 for datasetDict in (task.initInputs, task.initOutputs):
1288 for dataset_type, refDict in datasetDict.items():
1289 idMaker.resolveDict(dataset_type, refDict)
1291 # Actually remove any quanta that we decided to skip above.
1292 if dataIdsSucceeded:
1293 if skip_collections_wildcard is not None:
1294 _LOG.debug(
1295 "Pruning successful %d quanta for task with label '%s' because all of their "
1296 "outputs exist or metadata was written successfully.",
1297 len(dataIdsSucceeded),
1298 task.taskDef.label,
1299 )
1300 for dataId in dataIdsSucceeded:
1301 del task.quanta[dataId]
1302 elif clobberOutputs:
1303 _LOG.info(
1304 "Found %d successful quanta for task with label '%s' "
1305 "that will need to be clobbered during execution.",
1306 len(dataIdsSucceeded),
1307 task.taskDef.label,
1308 )
1309 else:
1310 raise AssertionError("OutputExistsError should have already been raised.")
1311 if dataIdsFailed:
1312 if clobberOutputs:
1313 _LOG.info(
1314 "Found %d failed/incomplete quanta for task with label '%s' "
1315 "that will need to be clobbered during execution.",
1316 len(dataIdsFailed),
1317 task.taskDef.label,
1318 )
1319 else:
1320 raise AssertionError("OutputExistsError should have already been raised.")
1322 # Collect initOutputs that do not belong to any task.
1323 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1324 for task in self.tasks:
1325 global_dataset_types -= set(task.initOutputs)
1326 if global_dataset_types:
1327 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1328 for dataset_type, refDict in self.globalInitOutputs.items():
1329 idMaker.resolveDict(dataset_type, refDict)
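    # A hedged sketch of the special-case calibration lookup performed above
    # for prerequisite inputs, assuming a temporal ``quantum.dataId``:
    #
    #     ref = registry.findDataset(
    #         parent_dataset_type,
    #         quantum.dataId,
    #         collections=collections,
    #         timespan=quantum.dataId.timespan,
    #     )
    #     prereq_refs = [ref] if ref is not None else []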
1331 def makeQuantumGraph(
1332 self,
1333 registry: Registry,
1334 metadata: Optional[Mapping[str, Any]] = None,
1335 datastore: Optional[Datastore] = None,
1336 ) -> QuantumGraph:
1337 """Create a `QuantumGraph` from the quanta already present in
1338 the scaffolding data structure.
1340 Parameters
1341 ----------
1342 registry : `lsst.daf.butler.Registry`
1343 Registry for the data repository; used for all data ID queries.
1344 metadata : Optional Mapping of `str` to primitives
1345 Optional extra data to carry with the graph. Entries in this
1346 mapping must be serializable as JSON.
1348 datastore : `Datastore`, optional
1349 If not `None` then fill datastore records in each generated
1350 Quantum.
1352 Returns
1353 -------
1354 graph : `QuantumGraph`
1355 The full `QuantumGraph`.
1356 """
1358 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1359 """Extract all DatasetRefs from the dictionaries"""
1360 for ref_dict in dataset_dict.values():
1361 for holder in ref_dict.values():
1362 yield holder.resolved_ref
1364 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1365 if datastore is not None:
1366 datastore_records = datastore.export_records(
1367 itertools.chain(
1368 _make_refs(self.inputs),
1369 _make_refs(self.initInputs),
1370 _make_refs(self.prerequisites),
1371 )
1372 )
1374 graphInput: dict[TaskDef, set[Quantum]] = {}
1375 for task in self.tasks:
1376 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records)
1377 graphInput[task.taskDef] = qset
1379 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1380 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1382 globalInitOutputs: list[DatasetRef] = []
1383 if self.globalInitOutputs is not None:
1384 for refs_dict in self.globalInitOutputs.values():
1385 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values())
1387 graph = QuantumGraph(
1388 graphInput,
1389 metadata=metadata,
1390 pruneRefs=list(self.missing.iter_resolved_refs()),
1391 universe=self.dimensions.universe,
1392 initInputs=taskInitInputs,
1393 initOutputs=taskInitOutputs,
1394 globalInitOutputs=globalInitOutputs,
1395 registryDatasetTypes=self._get_registry_dataset_types(registry),
1396 )
1397 return graph
1399 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1400 """Make a list of all dataset types used by a graph as defined in
1401 registry.
1402 """
1403 chain = [
1404 self.initInputs,
1405 self.initIntermediates,
1406 self.initOutputs,
1407 self.inputs,
1408 self.intermediates,
1409 self.outputs,
1410 self.prerequisites,
1411 ]
1412 if self.globalInitOutputs is not None:
1413 chain.append(self.globalInitOutputs)
1415 # Collect names of all dataset types.
1416 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1417 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1419 # Check for types that do not exist in registry yet:
1420 # - inputs must exist
1421 # - intermediates and outputs may not exist, but there must not be
1422 # more than one definition (e.g. differing in storage class)
1423 # - prerequisites may not exist, treat it the same as outputs here
1424 for dstype in itertools.chain(self.initInputs, self.inputs):
1425 if dstype.name not in dataset_types:
1426 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1428 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1429 chain = [
1430 self.initIntermediates,
1431 self.initOutputs,
1432 self.intermediates,
1433 self.outputs,
1434 self.prerequisites,
1435 ]
1436 if self.globalInitOutputs is not None:
1437 chain.append(self.globalInitOutputs)
1438 for dstype in itertools.chain(*chain):
1439 if dstype.name not in dataset_types:
1440 new_outputs[dstype.name].add(dstype)
1441 for name, dstypes in new_outputs.items():
1442 if len(dstypes) > 1:
1443 raise ValueError(
1444 "Pipeline contains multiple definitions for a dataset type "
1445 f"which is not defined in registry yet: {dstypes}"
1446 )
1447 elif len(dstypes) == 1:
1448 dataset_types[name] = dstypes.pop()
1450 return dataset_types.values()
1453# ------------------------
1454# Exported definitions --
1455# ------------------------
1458class GraphBuilderError(Exception):
1459 """Base class for exceptions generated by graph builder."""
1461 pass
1464class OutputExistsError(GraphBuilderError):
1465 """Exception generated when output datasets already exist."""
1467 pass
1470class PrerequisiteMissingError(GraphBuilderError):
1471 """Exception generated when a prerequisite dataset does not exist."""
1473 pass
1476class GraphBuilder:
1477 """GraphBuilder class is responsible for building task execution graph from
1478 a Pipeline.
1480 Parameters
1481 ----------
1482 registry : `~lsst.daf.butler.Registry`
1483 Data butler instance.
1484 skipExistingIn
1485 Expressions representing the collections to search for existing
1486 output datasets that should be skipped. See
1487 :ref:`daf_butler_ordered_collection_searches`.
1488 clobberOutputs : `bool`, optional
1489 If `True` (default), allow quanta to be created even if partial outputs
1490 exist; this requires the same behavior to be enabled when
1491 executing.
1492 datastore : `Datastore`, optional
1493 If not `None` then fill datastore records in each generated Quantum.
1494 """
1496 def __init__(
1497 self,
1498 registry: Registry,
1499 skipExistingIn: Any = None,
1500 clobberOutputs: bool = True,
1501 datastore: Optional[Datastore] = None,
1502 ):
1503 self.registry = registry
1504 self.dimensions = registry.dimensions
1505 self.skipExistingIn = skipExistingIn
1506 self.clobberOutputs = clobberOutputs
1507 self.datastore = datastore
1509 def makeGraph(
1510 self,
1511 pipeline: Pipeline | Iterable[TaskDef],
1512 collections: Any,
1513 run: str,
1514 userQuery: Optional[str],
1515 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1516 metadata: Optional[Mapping[str, Any]] = None,
1517 bind: Optional[Mapping[str, Any]] = None,
1518 ) -> QuantumGraph:
1519 """Create execution graph for a pipeline.
1521 Parameters
1522 ----------
1523 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1524 Pipeline definition, task names/classes and their configs.
1525 collections
1526 Expressions representing the collections to search for input
1527 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1528 run : `str`
1529 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1530 output datasets. The collection does not have to exist; it will be
1531 created when the graph is executed.
1532 userQuery : `str`
1533 String which defines a user-provided selection for the registry; should be
1534 empty or `None` if there are no restrictions on data selection.
1535 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1536 The query constraint variant that should be used to constrain the
1537 query based on dataset existence, defaults to
1538 `DatasetQueryConstraintVariant.ALL`.
1539 metadata : Optional Mapping of `str` to primitives
1540 Optional extra data to carry with the graph. Entries in this
1541 mapping must be serializable as JSON.
1543 bind : `Mapping`, optional
1544 Mapping containing literal values that should be injected into the
1545 ``userQuery`` expression, keyed by the identifiers they replace.
1547 Returns
1548 -------
1549 graph : `QuantumGraph`
1551 Raises
1552 ------
1553 UserExpressionError
1554 Raised when user expression cannot be parsed.
1555 OutputExistsError
1556 Raised when output datasets already exist.
1557 Exception
1558 Other exceptions types may be raised by underlying registry
1559 classes.
1560 """
1561 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1562 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1563 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1564 instrument_class: Optional[Any] = None
1565 if isinstance(pipeline, Pipeline):
1566 instrument_class_name = pipeline.getInstrument()
1567 if instrument_class_name is not None:
1568 instrument_class = doImportType(instrument_class_name)
1569 pipeline = list(pipeline.toExpandedPipeline())
1570 if instrument_class is not None:
1571 dataId = DataCoordinate.standardize(
1572 instrument=instrument_class.getName(), universe=self.registry.dimensions
1573 )
1574 else:
1575 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1576 with scaffolding.connectDataIds(
1577 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1578 ) as commonDataIds:
1579 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1580 scaffolding.resolveDatasetRefs(
1581 self.registry,
1582 collections,
1583 run,
1584 commonDataIds,
1585 skipExistingIn=self.skipExistingIn,
1586 clobberOutputs=self.clobberOutputs,
1587 constrainedByAllDatasets=condition,
1588 )
1589 return scaffolding.makeQuantumGraph(
1590 registry=self.registry, metadata=metadata, datastore=self.datastore
1591 )
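# A minimal usage sketch for GraphBuilder, assuming the caller already has a
# `Registry`, an optional `Datastore`, a `Pipeline`, and the usual
# collection/run/query arguments; everything here simply forwards to makeGraph.
def _example_build_graph(
    registry: Registry,
    pipeline: Pipeline,
    collections: Any,
    run: str,
    user_query: str | None,
    datastore: Datastore | None = None,
) -> QuantumGraph:
    builder = GraphBuilder(registry, datastore=datastore)
    return builder.makeGraph(pipeline, collections, run, user_query)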