Coverage for python/lsst/pipe/base/graphBuilder.py: 14%
521 statements
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap, defaultdict
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
38from lsst.daf.butler import (
39 CollectionType,
40 DataCoordinate,
41 DatasetIdGenEnum,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
55from lsst.daf.butler.registry.wildcards import CollectionWildcard
56from lsst.utils import doImportType
58from ._datasetQueryConstraints import DatasetQueryConstraintVariant
59from ._status import NoWorkFound
61# -----------------------------
62# Imports for other modules --
63# -----------------------------
64from .connections import AdjustQuantumHelper, iterConnections
65from .graph import QuantumGraph
66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
68# ----------------------------------
69# Local non-exported definitions --
70# ----------------------------------
72_LOG = logging.getLogger(__name__)
75class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
76 """A custom dictionary that maps `DatasetType` to a nested dictionary of
77 the known `DatasetRef` instances of that type.
79 Parameters
80 ----------
81 args
82 Positional arguments are forwarded to the `dict` constructor.
83 universe : `DimensionUniverse`
84 Universe of all possible dimensions.
85 """
87 def __init__(self, *args: Any, universe: DimensionUniverse):
88 super().__init__(*args)
89 self.universe = universe
91 @classmethod
92 def fromDatasetTypes(
93 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
94 ) -> _DatasetDict:
95 """Construct a dictionary from a flat iterable of `DatasetType` keys.
97 Parameters
98 ----------
99 datasetTypes : `iterable` of `DatasetType`
100 DatasetTypes to use as keys for the dict. Values will be empty
101 dictionaries.
102 universe : `DimensionUniverse`
103 Universe of all possible dimensions.
105 Returns
106 -------
107 dictionary : `_DatasetDict`
108 A new `_DatasetDict` instance.
109 """
110 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
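# Illustrative sketch (not part of the original module) of how
# `fromDatasetTypes` is typically used; the dataset type and universe below
# are hypothetical placeholders supplied by a caller:
#
#     raw = DatasetType(
#         "raw", dimensions={"instrument", "exposure", "detector"},
#         storageClass="Exposure", universe=universe,
#     )
#     holder = _DatasetDict.fromDatasetTypes([raw], universe=universe)
#     assert holder[raw] == {}  # each key starts with an empty data-ID dict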
112 @classmethod
113 def fromSubset(
114 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
115 ) -> _DatasetDict:
116 """Return a new dictionary by extracting items corresponding to the
117 given keys from one or more existing dictionaries.
119 Parameters
120 ----------
121 datasetTypes : `iterable` of `DatasetType`
122 DatasetTypes to use as keys for the dict. Values will be obtained
123 by lookups against ``first`` and ``rest``.
124 first : `_DatasetDict`
125 Another dictionary from which to extract values.
126 rest
127 Additional dictionaries from which to extract values.
129 Returns
130 -------
131 dictionary : `_DatasetDict`
132 A new dictionary instance.
133 """
134 combined = ChainMap(first, *rest)
136 # Dataset types known to match immediately can be processed
137 # without checks.
138 matches = combined.keys() & set(datasetTypes)
139 _dict = {k: combined[k] for k in matches}
141 if len(_dict) < len(datasetTypes):
142 # Work out which ones are missing.
143 missing_datasetTypes = set(datasetTypes) - _dict.keys()
145 # Get the known names for comparison.
146 combined_by_name = {k.name: k for k in combined}
148 missing = set()
149 incompatible = {}
150 for datasetType in missing_datasetTypes:
151 # The dataset type is not found. It may not be listed,
152 # or it may be present with the same name but a
153 # different definition.
154 if datasetType.name in combined_by_name:
155 # This implies some inconsistency in definitions
156 # for connections. If there is support for storage
157 # class conversion we can let it slide.
158 # At this point we do not know where the
159 # inconsistency is, but trust that downstream code
160 # will be more explicit about input vs. output
161 # incompatibilities.
162 existing = combined_by_name[datasetType.name]
163 convertible_to_existing = existing.is_compatible_with(datasetType)
164 convertible_from_existing = datasetType.is_compatible_with(existing)
165 if convertible_to_existing and convertible_from_existing:
166 _LOG.debug(
167 "Dataset type %s has multiple fully-compatible storage classes %s and %s",
168 datasetType.name,
169 datasetType.storageClass_name,
170 existing.storageClass_name,
171 )
172 _dict[datasetType] = combined[existing]
173 elif convertible_to_existing or convertible_from_existing:
174 # We'd need to refactor a fair amount to recognize
175 # whether this is an error or not, so I'm not going to
176 # bother until we need to do that for other reasons
177 # (it won't be too long).
178 _LOG.info(
179 "Dataset type %s is present with multiple only partially-compatible storage "
180 "classes %s and %s.",
181 datasetType.name,
182 datasetType.storageClass_name,
183 existing.storageClass_name,
184 )
185 _dict[datasetType] = combined[existing]
186 else:
187 incompatible[datasetType] = existing
188 else:
189 missing.add(datasetType)
191 if missing or incompatible:
192 reasons = []
193 if missing:
194 reasons.append(
195 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
196 + ", ".join(d.name for d in combined)
197 )
198 if incompatible:
199 for x, y in incompatible.items():
200 reasons.append(f"{x} incompatible with {y}")
201 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
203 return cls(_dict, universe=first.universe)
205 @property
206 def dimensions(self) -> DimensionGraph:
207 """The union of all dimensions used by all dataset types in this
208 dictionary, including implied dependencies (`DimensionGraph`).
209 """
210 base = self.universe.empty
211 if len(self) == 0:
212 return base
213 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
215 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
216 """Unpack nested single-element `DatasetRef` dicts into a new
217 mapping with `DatasetType` keys and `DatasetRef` values.
219 This method assumes that each nest contains exactly one item, as is the
220 case for all "init" datasets.
222 Returns
223 -------
224 dictionary : `NamedKeyDict`
225 Dictionary mapping `DatasetType` to `DatasetRef`, with both
226 `DatasetType` instances and string names usable as keys.
227 """
229 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
230 (ref,) = refs.values()
231 return ref
233 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
235 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
236 """Unpack nested multi-element `DatasetRef` dicts into a new
237 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
239 Returns
240 -------
241 dictionary : `NamedKeyDict`
242 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
243 both `DatasetType` instances and string names usable as keys.
244 """
245 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
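# Hedged sketch contrasting the two unpackers; ``init_dict`` and
# ``input_dict`` are hypothetical, already-populated `_DatasetDict` instances:
#
#     single = init_dict.unpackSingleRefs()   # NamedKeyDict[DatasetType, DatasetRef]
#     multi = input_dict.unpackMultiRefs()    # NamedKeyDict[DatasetType, list[DatasetRef]]
#
# `unpackSingleRefs` raises `ValueError` if any nested dict holds more (or
# fewer) than one entry, which is why it is only used for "init" datasets.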
247 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
248 """Iterate over the contained `DatasetRef` instances that match the
249 given `DatasetType` and data IDs.
251 Parameters
252 ----------
253 datasetType : `DatasetType`
254 Dataset type to match.
255 dataIds : `Iterable` [ `DataCoordinate` ]
256 Data IDs to match.
258 Returns
259 -------
260 refs : `Iterator` [ `DatasetRef` ]
261 DatasetRef instances for which ``ref.datasetType == datasetType``
262 and ``ref.dataId`` is in ``dataIds``.
263 """
264 refs = self[datasetType]
265 return (refs[dataId] for dataId in dataIds)
268class _QuantumScaffolding:
269 """Helper class aggregating information about a `Quantum`, used when
270 constructing a `QuantumGraph`.
272 See `_PipelineScaffolding` for a top-down description of the full
273 scaffolding data structure.
275 Parameters
276 ----------
277 task : _TaskScaffolding
278 Back-reference to the helper object for the `PipelineTask` this quantum
279 represents an execution of.
280 dataId : `DataCoordinate`
281 Data ID for this quantum.
282 """
284 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
285 self.task = task
286 self.dataId = dataId
287 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
288 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
289 self.prerequisites = _DatasetDict.fromDatasetTypes(
290 task.prerequisites.keys(), universe=dataId.universe
291 )
293 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
295 def __repr__(self) -> str:
296 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
298 task: _TaskScaffolding
299 """Back-reference to the helper object for the `PipelineTask` this quantum
300 represents an execution of.
301 """
303 dataId: DataCoordinate
304 """Data ID for this quantum.
305 """
307 inputs: _DatasetDict
308 """Nested dictionary containing `DatasetRef` inputs to this quantum.
310 This is initialized to map each `DatasetType` to an empty dictionary at
311 construction. Those nested dictionaries are populated (with data IDs as
312 keys) with unresolved `DatasetRef` instances in
313 `_PipelineScaffolding.connectDataIds`.
314 """
316 outputs: _DatasetDict
317 """Nested dictionary containing `DatasetRef` outputs this quantum.
318 """
320 prerequisites: _DatasetDict
321 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
322 quantum.
323 """
325 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
326 """Transform the scaffolding object into a true `Quantum` instance.
328 Parameters
329 ----------
330 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
331 If not `None` then fill datastore records in each generated Quantum
332 using the records from this structure.
334 Returns
335 -------
336 quantum : `Quantum`
337 An actual `Quantum` instance.
338 """
339 allInputs = self.inputs.unpackMultiRefs()
340 allInputs.update(self.prerequisites.unpackMultiRefs())
341 # Give the task's Connections class an opportunity to remove some
342 # inputs, or complain if they are unacceptable.
343 # This will raise if one of the check conditions is not met, which is
344 # the intended behavior.
345 # If it raises NoWorkFound, there is a bug in the QG algorithm
346 # or the adjustQuantum is incorrectly trying to make a prerequisite
347 # input behave like a regular input; adjustQuantum should only raise
348 # NoWorkFound if a regular input is missing, and it shouldn't be
349 # possible for us to have generated ``self`` if that's true.
350 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
351 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
352 initInputs = self.task.initInputs.unpackSingleRefs()
353 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
354 if datastore_records is not None:
355 quantum_records = {}
356 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
357 input_refs += list(initInputs.values())
358 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
359 for datastore_name, records in datastore_records.items():
360 matching_records = records.subset(input_ids)
361 if matching_records is not None:
362 quantum_records[datastore_name] = matching_records
363 return Quantum(
364 taskName=self.task.taskDef.taskName,
365 taskClass=self.task.taskDef.taskClass,
366 dataId=self.dataId,
367 initInputs=initInputs,
368 inputs=helper.inputs,
369 outputs=helper.outputs,
370 datastore_records=quantum_records,
371 )
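# Hedged sketch of the caller side of ``datastore_records``, mirroring what
# `_PipelineScaffolding.makeQuantumGraph` does further below; ``datastore``
# and ``all_input_refs`` are assumed to be supplied by the caller:
#
#     records = datastore.export_records(all_input_refs)
#     quantum = quantum_scaffolding.makeQuantum(datastore_records=records)
#
# The generated `Quantum` then carries only the datastore records whose
# dataset IDs match its own (init-)input refs.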
374@dataclass
375class _TaskScaffolding:
376 """Helper class aggregating information about a `PipelineTask`, used when
377 constructing a `QuantumGraph`.
379 See `_PipelineScaffolding` for a top-down description of the full
380 scaffolding data structure.
382 Parameters
383 ----------
384 taskDef : `TaskDef`
385 Data structure that identifies the task class and its config.
386 parent : `_PipelineScaffolding`
387 The parent data structure that will hold the instance being
388 constructed.
389 datasetTypes : `TaskDatasetTypes`
390 Data structure that categorizes the dataset types used by this task.
391 """
393 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
394 universe = parent.dimensions.universe
395 self.taskDef = taskDef
396 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
397 assert self.dimensions.issubset(parent.dimensions)
398 # Initialize _DatasetDicts as subsets of the one or two
399 # corresponding dicts in the parent _PipelineScaffolding.
400 self.initInputs = _DatasetDict.fromSubset(
401 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
402 )
403 self.initOutputs = _DatasetDict.fromSubset(
404 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
405 )
406 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
407 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
408 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
409 self.dataIds: Set[DataCoordinate] = set()
410 self.quanta = {}
412 def __repr__(self) -> str:
413 # Default dataclass-injected __repr__ gets caught in an infinite loop
414 # because of back-references.
415 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
417 taskDef: TaskDef
418 """Data structure that identifies the task class and its config
419 (`TaskDef`).
420 """
422 dimensions: DimensionGraph
423 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
424 """
426 initInputs: _DatasetDict
427 """Dictionary containing information about datasets used to construct this
428 task (`_DatasetDict`).
429 """
431 initOutputs: _DatasetDict
432 """Dictionary containing information about datasets produced as a
433 side-effect of constructing this task (`_DatasetDict`).
434 """
436 inputs: _DatasetDict
437 """Dictionary containing information about datasets used as regular,
438 graph-constraining inputs to this task (`_DatasetDict`).
439 """
441 outputs: _DatasetDict
442 """Dictionary containing information about datasets produced by this task
443 (`_DatasetDict`).
444 """
446 prerequisites: _DatasetDict
447 """Dictionary containing information about input datasets that must be
448 present in the repository before any Pipeline containing this task is run
449 (`_DatasetDict`).
450 """
452 quanta: Dict[DataCoordinate, _QuantumScaffolding]
453 """Dictionary mapping data ID to a scaffolding object for the Quantum of
454 this task with that data ID.
455 """
457 def makeQuantumSet(
458 self,
459 unresolvedRefs: Optional[Set[DatasetRef]] = None,
460 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
461 ) -> Set[Quantum]:
462 """Create a `set` of `Quantum` from the information in ``self``.
464 Parameters
465 ----------
466 unresolvedRefs : `set` [ `DatasetRef` ], optional
467 Input dataset refs that have not been found.
468 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
469 If not `None`, datastore records to attach to each generated Quantum.
471 Returns
472 -------
473 nodes : `set` of `Quantum`
474 The `Quantum` elements corresponding to this task.
475 """
476 if unresolvedRefs is None:
477 unresolvedRefs = set()
478 outputs = set()
479 for q in self.quanta.values():
480 try:
481 tmpQuantum = q.makeQuantum(datastore_records)
482 outputs.add(tmpQuantum)
483 except (NoWorkFound, FileNotFoundError) as exc:
484 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
485 if unresolvedRefs.intersection(refs):
486 # This means it is a node that is known to be pruned
487 # later and should be left in even though some follow-up
488 # queries fail. This allows the pruning to start from this
489 # quantum with known issues, and prune other nodes it
490 # touches.
491 inputs = q.inputs.unpackMultiRefs()
492 inputs.update(q.prerequisites.unpackMultiRefs())
493 tmpQuantum = Quantum(
494 taskName=q.task.taskDef.taskName,
495 taskClass=q.task.taskDef.taskClass,
496 dataId=q.dataId,
497 initInputs=q.task.initInputs.unpackSingleRefs(),
498 inputs=inputs,
499 outputs=q.outputs.unpackMultiRefs(),
500 )
501 outputs.add(tmpQuantum)
502 else:
503 raise exc
504 return outputs
507class _DatasetIdMaker:
508 """Helper class which generates random dataset UUIDs for unresolved
509 datasets.
510 """
512 def __init__(self, registry: Registry, run: str):
513 self.datasetIdFactory = registry.datasetIdFactory
514 self.run = run
515 # Dataset IDs generated so far
516 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
518 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
519 if ref.id is not None:
520 return ref
522 # For components we need their parent dataset ID.
523 if ref.isComponent():
524 parent_ref = ref.makeCompositeRef()
525 # Basic check: the parent should be resolved if this is an
526 # existing input, or it should be in the cache already if it is
527 # an intermediate.
528 if parent_ref.id is None:
529 key = parent_ref.datasetType, parent_ref.dataId
530 if key not in self.resolved:
531 raise ValueError(f"Composite dataset is missing from cache: {parent_ref}")
532 parent_ref = self.resolved[key]
533 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved"
534 return ref.resolved(parent_ref.id, parent_ref.run)
536 key = ref.datasetType, ref.dataId
537 if (resolved := self.resolved.get(key)) is None:
538 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE)
539 self.resolved[key] = resolved
540 return resolved
542 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
543 """Resolve all unresolved references in the provided dictionary."""
544 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
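# Hedged usage sketch for `_DatasetIdMaker`; ``registry``, the run name, and
# the unresolved ``ref`` below are hypothetical caller-provided values:
#
#     id_maker = _DatasetIdMaker(registry, run="u/someone/run")
#     resolved = id_maker.resolveRef(ref)  # assigns a new UUID and caches it
#     assert id_maker.resolveRef(ref) is resolved  # cached by (type, data ID)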
547@dataclass
548class _PipelineScaffolding:
549 """A helper data structure that organizes the information involved in
550 constructing a `QuantumGraph` for a `Pipeline`.
552 Parameters
553 ----------
554 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
555 Sequence of tasks from which a graph is to be constructed. Must
556 have nested task classes already imported.
557 registry : `lsst.daf.butler.Registry`
558 Registry for the data repository, used to look up dataset types.
560 Notes
561 -----
562 The scaffolding data structure contains nested data structures for both
563 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
564 data structures are shared between the pipeline-level structure (which
565 aggregates all datasets and categorizes them from the perspective of the
566 complete pipeline) and the individual tasks that use them as inputs and
567 outputs.
569 `QuantumGraph` construction proceeds in four steps, with each corresponding
570 to a different `_PipelineScaffolding` method:
572 1. When `_PipelineScaffolding` is constructed, we extract and categorize
573 the DatasetTypes used by the pipeline (delegating to
574 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
575 nested `_TaskScaffolding` and `_DatasetDict` objects.
577 2. In `connectDataIds`, we construct and run the "Big Join Query", which
578 returns related tuples of all dimensions used to identify any regular
579 input, output, and intermediate datasets (not prerequisites). We then
580 iterate over these tuples of related dimensions, identifying the subsets
581 that correspond to distinct data IDs for each task and dataset type,
582 and then create `_QuantumScaffolding` objects.
584 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
585 dataset data IDs previously identified, transforming unresolved
586 DatasetRefs into resolved DatasetRefs where appropriate. We then look
587 up prerequisite datasets for all quanta.
589 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
590 per-task `_QuantumScaffolding` objects.
591 """
593 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
594 _LOG.debug("Initializing data structures for QuantumGraph generation.")
595 self.tasks = []
596 # Aggregate and categorize the DatasetTypes in the Pipeline.
597 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
598 # Construct dictionaries that map those DatasetTypes to structures
599 # that will (later) hold additional information about them.
600 for attr in (
601 "initInputs",
602 "initIntermediates",
603 "initOutputs",
604 "inputs",
605 "intermediates",
606 "outputs",
607 "prerequisites",
608 ):
609 setattr(
610 self,
611 attr,
612 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
613 )
614 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
615 # Aggregate all dimensions for all non-init, non-prerequisite
616 # DatasetTypes. These are the ones we'll include in the big join
617 # query.
618 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
619 # Construct scaffolding nodes for each Task, and add backreferences
620 # to the Task from each DatasetScaffolding node.
621 # Note that there's only one scaffolding node for each DatasetType,
622 # shared by _PipelineScaffolding and all _TaskScaffoldings that
623 # reference it.
624 if isinstance(pipeline, Pipeline):
625 pipeline = pipeline.toExpandedPipeline()
626 self.tasks = [
627 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
628 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
629 ]
631 def __repr__(self) -> str:
632 # Default dataclass-injected __repr__ gets caught in an infinite loop
633 # because of back-references.
634 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
636 tasks: List[_TaskScaffolding]
637 """Scaffolding data structures for each task in the pipeline
638 (`list` of `_TaskScaffolding`).
639 """
641 initInputs: _DatasetDict
642 """Datasets consumed but not produced when constructing the tasks in this
643 pipeline (`_DatasetDict`).
644 """
646 initIntermediates: _DatasetDict
647 """Datasets that are both consumed and produced when constructing the tasks
648 in this pipeline (`_DatasetDict`).
649 """
651 initOutputs: _DatasetDict
652 """Datasets produced but not consumed when constructing the tasks in this
653 pipeline (`_DatasetDict`).
654 """
656 inputs: _DatasetDict
657 """Datasets that are consumed but not produced when running this pipeline
658 (`_DatasetDict`).
659 """
661 intermediates: _DatasetDict
662 """Datasets that are both produced and consumed when running this pipeline
663 (`_DatasetDict`).
664 """
666 outputs: _DatasetDict
667 """Datasets produced but not consumed when when running this pipeline
668 (`_DatasetDict`).
669 """
671 prerequisites: _DatasetDict
672 """Datasets that are consumed when running this pipeline and looked up
673 per-Quantum when generating the graph (`_DatasetDict`).
674 """
676 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
677 """Datasets that should be used as constraints in the initial query,
678 according to tasks (`NamedValueSet`).
679 """
681 dimensions: DimensionGraph
682 """All dimensions used by any regular input, intermediate, or output
683 (not prerequisite) dataset; the set of dimensions used in the "Big Join
684 Query" (`DimensionGraph`).
686 This is required to be a superset of all task quantum dimensions.
687 """
689 globalInitOutputs: _DatasetDict | None = None
690 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`)
691 """
693 @contextmanager
694 def connectDataIds(
695 self,
696 registry: Registry,
697 collections: Any,
698 userQuery: Optional[str],
699 externalDataId: DataCoordinate,
700 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
701 bind: Optional[Mapping[str, Any]] = None,
702 ) -> Iterator[DataCoordinateQueryResults]:
703 """Query for the data IDs that connect nodes in the `QuantumGraph`.
705 This method populates `_TaskScaffolding.dataIds` and
706 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
708 Parameters
709 ----------
710 registry : `lsst.daf.butler.Registry`
711 Registry for the data repository; used for all data ID queries.
712 collections
713 Expressions representing the collections to search for input
714 datasets. See :ref:`daf_butler_ordered_collection_searches`.
715 userQuery : `str` or `None`
716 User-provided expression to limit the data IDs processed.
717 externalDataId : `DataCoordinate`
718 Externally-provided data ID that should be used to restrict the
719 results, just as if these constraints had been included via ``AND``
720 in ``userQuery``. This includes (at least) any instrument named
721 in the pipeline definition.
722 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
723 The query constraint variant that should be used to constrain the
724 query based on dataset existence; defaults to
725 `DatasetQueryConstraintVariant.ALL`.
726 bind : `Mapping`, optional
727 Mapping containing literal values that should be injected into the
728 ``userQuery`` expression, keyed by the identifiers they replace.
730 Returns
731 -------
732 commonDataIds : \
733 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
734 An interface to a database temporary table containing all data IDs
735 that will appear in this `QuantumGraph`. Returned inside a
736 context manager, which will drop the temporary table at the end of
737 the `with` block in which this method is called.
738 """
739 _LOG.debug("Building query for data IDs.")
740 # Initialization datasets always have empty data IDs.
741 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
742 for datasetType, refs in itertools.chain(
743 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
744 ):
745 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
746 # Run one big query for the data IDs for task dimensions and regular
747 # inputs and outputs. We limit the query to only dimensions that are
748 # associated with the input dataset types, but don't (yet) try to
749 # obtain the dataset_ids for those inputs.
750 _LOG.debug(
751 "Submitting data ID query over dimensions %s and materializing results.",
752 list(self.dimensions.names),
753 )
754 queryArgs: Dict[str, Any] = {
755 "dimensions": self.dimensions,
756 "where": userQuery,
757 "dataId": externalDataId,
758 "bind": bind,
759 }
760 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
761 _LOG.debug(
762 "Constraining graph query using default of %s.",
763 list(self.defaultDatasetQueryConstraints.names),
764 )
765 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
766 queryArgs["collections"] = collections
767 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
768 _LOG.debug("Not using dataset existence to constrain query.")
769 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
770 constraint = set(datasetQueryConstraint)
771 inputs = {k.name: k for k in self.inputs.keys()}
772 if remainder := constraint.difference(inputs.keys()):
773 raise ValueError(
774 f"{remainder} dataset type(s) specified as a graph constraint, but"
775 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
776 )
777 _LOG.debug(f"Constraining graph query using {constraint}")
778 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
779 queryArgs["collections"] = collections
780 else:
781 raise ValueError(
782 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
783 )
785 if "datasets" in queryArgs:
786 for i, dataset_type in enumerate(queryArgs["datasets"]):
787 if dataset_type.isComponent():
788 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
790 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
791 _LOG.debug("Expanding data IDs.")
792 commonDataIds = commonDataIds.expanded()
793 _LOG.debug("Iterating over query results to associate quanta with datasets.")
794 # Iterate over query results, populating data IDs for datasets and
795 # quanta and then connecting them to each other.
796 n = -1
797 for n, commonDataId in enumerate(commonDataIds):
798 # Create DatasetRefs for all DatasetTypes from this result row,
799 # noting that we might have created some already.
800 # We remember both those that already existed and those that we
801 # create now.
802 refsForRow = {}
803 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
804 for datasetType, refs in itertools.chain(
805 self.inputs.items(), self.intermediates.items(), self.outputs.items()
806 ):
807 datasetDataId: Optional[DataCoordinate]
808 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
809 datasetDataId = commonDataId.subset(datasetType.dimensions)
810 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
811 ref = refs.get(datasetDataId)
812 if ref is None:
813 ref = DatasetRef(datasetType, datasetDataId)
814 refs[datasetDataId] = ref
815 refsForRow[datasetType.name] = ref
816 # Create _QuantumScaffolding objects for all tasks from this
817 # result row, noting that we might have created some already.
818 for task in self.tasks:
819 quantumDataId = commonDataId.subset(task.dimensions)
820 quantum = task.quanta.get(quantumDataId)
821 if quantum is None:
822 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
823 task.quanta[quantumDataId] = quantum
824 # Whether this is a new quantum or an existing one, we can
825 # now associate the DatasetRefs for this row with it. The
826 # fact that a Quantum data ID and a dataset data ID both
827 # came from the same result row is what tells us they
828 # should be associated.
829 # Many of these associations will be duplicates (because
830 # another query row that differed from this one only in
831 # irrelevant dimensions already added them); assigning
832 # into the nested dicts makes the repeats harmless.
833 for datasetType in task.inputs:
834 ref = refsForRow[datasetType.name]
835 quantum.inputs[datasetType.name][ref.dataId] = ref
836 for datasetType in task.outputs:
837 ref = refsForRow[datasetType.name]
838 quantum.outputs[datasetType.name][ref.dataId] = ref
839 if n < 0:
840 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
841 emptiness_explained = False
842 for message in commonDataIds.explain_no_results():
843 _LOG.critical(message)
844 emptiness_explained = True
845 if not emptiness_explained:
846 _LOG.critical(
847 "To reproduce this query for debugging purposes, run "
848 "Registry.queryDataIds with these arguments:"
849 )
850 # We could just repr() the queryArgs dict to get something
851 # the user could make sense of, but it's friendlier to
852 # put these args in an easier-to-construct equivalent form
853 # so they can read it more easily and copy and paste into
854 # a Python terminal.
855 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
856 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
857 if queryArgs["where"]:
858 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
859 if "datasets" in queryArgs:
860 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
861 if "collections" in queryArgs:
862 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
863 _LOG.debug("Finished processing %d rows from data ID query.", n)
864 yield commonDataIds
866 def resolveDatasetRefs(
867 self,
868 registry: Registry,
869 collections: Any,
870 run: Optional[str],
871 commonDataIds: DataCoordinateQueryResults,
872 *,
873 skipExistingIn: Any = None,
874 clobberOutputs: bool = True,
875 constrainedByAllDatasets: bool = True,
876 resolveRefs: bool = False,
877 ) -> None:
878 """Perform follow up queries for each dataset data ID produced in
879 `fillDataIds`.
881 This method populates `_DatasetScaffolding.refs` (except for those in
882 `prerequisites`).
884 Parameters
885 ----------
886 registry : `lsst.daf.butler.Registry`
887 Registry for the data repository; used for all data ID queries.
888 collections
889 Expressions representing the collections to search for input
890 datasets. See :ref:`daf_butler_ordered_collection_searches`.
891 run : `str`, optional
892 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
893 output datasets, if it already exists.
894 commonDataIds : \
895 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
896 Result of a previous call to `connectDataIds`.
897 skipExistingIn
898 Expressions representing the collections to search for existing
899 output datasets that should be skipped. See
900 :ref:`daf_butler_ordered_collection_searches` for allowed types.
901 `None` or empty string/sequence disables skipping.
902 clobberOutputs : `bool`, optional
903 If `True` (default), allow quanta to be created even if outputs
904 exist; this requires the same behavior to be enabled when
905 executing. If ``skipExistingIn`` is not `None`, completed quanta
906 (those with metadata, or all outputs if there is no metadata
907 dataset configured) will be skipped rather than clobbered.
908 constrainedByAllDatasets : `bool`, optional
909 Indicates if the commonDataIds were generated with a constraint on
910 all dataset types.
911 resolveRefs : `bool`, optional
912 If `True` then resolve all input references and generate random
913 dataset IDs for all output and intermediate datasets. A true value
914 requires the ``run`` collection to be specified.
916 Raises
917 ------
918 OutputExistsError
919 Raised if an output dataset already exists in the output run
920 and ``skipExistingIn`` does not include output run, or if only
921 some outputs are present and ``clobberOutputs`` is `False`.
922 """
923 # Run may be provided but it does not have to exist; in that case we
924 # use it for resolving references but don't check it for existing refs.
925 run_exists = False
926 if run:
927 try:
928 run_exists = bool(registry.queryCollections(run))
929 except MissingCollectionError:
930 # An undocumented exception is raised if it does not exist.
931 pass
933 skip_collections_wildcard: CollectionWildcard | None = None
934 skipExistingInRun = False
935 if skipExistingIn:
936 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
937 if run_exists:
938 # As an optimization, check the explicit list of names first.
939 skipExistingInRun = run in skip_collections_wildcard.strings
940 if not skipExistingInRun:
941 # need to flatten it and check again
942 skipExistingInRun = run in registry.queryCollections(
943 skipExistingIn,
944 collectionTypes=CollectionType.RUN,
945 )
947 idMaker: Optional[_DatasetIdMaker] = None
948 if resolveRefs:
949 assert run is not None, "run cannot be None when resolveRefs is True"
950 idMaker = _DatasetIdMaker(registry, run)
952 resolvedRefQueryResults: Iterable[DatasetRef]
954 # Updating constrainedByAllDatasets here is not ideal, but we have a
955 # few different code paths that each transfer different pieces of
956 # information about what dataset query constraints were applied here,
957 # and none of them has the complete picture until we get here. We're
958 # long overdue for a QG generation rewrite that will make this go away
959 # entirely anyway.
960 constrainedByAllDatasets = (
961 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
962 )
964 # Look up [init] intermediate and output datasets in the output
965 # collection, if there is an output collection.
966 if run_exists or skip_collections_wildcard is not None:
967 for datasetType, refs in itertools.chain(
968 self.initIntermediates.items(),
969 self.initOutputs.items(),
970 self.intermediates.items(),
971 self.outputs.items(),
972 ):
973 _LOG.debug(
974 "Resolving %d datasets for intermediate and/or output dataset %s.",
975 len(refs),
976 datasetType.name,
977 )
978 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
979 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
980 # TODO: this assert incorrectly bans component inputs;
981 # investigate on DM-33027.
982 # assert not datasetType.isComponent(), \
983 # "Output datasets cannot be components."
984 #
985 # Instead we have to handle them manually to avoid a
986 # deprecation warning, but it is at least confusing and
987 # possibly a bug for components to appear here at all.
988 if datasetType.isComponent():
989 parent_dataset_type = datasetType.makeCompositeDatasetType()
990 component = datasetType.component()
991 else:
992 parent_dataset_type = datasetType
993 component = None
995 # look at RUN collection first
996 if run_exists:
997 try:
998 resolvedRefQueryResults = subset.findDatasets(
999 parent_dataset_type, collections=run, findFirst=True
1000 )
1001 except MissingDatasetTypeError:
1002 resolvedRefQueryResults = []
1003 for resolvedRef in resolvedRefQueryResults:
1004 # TODO: we could easily support per-DatasetType
1005 # skipExisting and I could imagine that being useful -
1006 # it's probably required in order to support writing
1007 # initOutputs before QuantumGraph generation.
1008 assert resolvedRef.dataId in refs
1009 if not (skipExistingInRun or isInit or clobberOutputs):
1010 raise OutputExistsError(
1011 f"Output dataset {datasetType.name} already exists in "
1012 f"output RUN collection '{run}' with data ID"
1013 f" {resolvedRef.dataId}."
1014 )
1015 # If we are going to resolve all outputs then we have
1016 # to remember existing ones to avoid generating new
1017 # dataset IDs for them.
1018 if resolveRefs:
1019 refs[resolvedRef.dataId] = (
1020 resolvedRef.makeComponentRef(component)
1021 if component is not None
1022 else resolvedRef
1023 )
1025 # Also check skipExistingIn; if the RUN collection is in
1026 # it, that case is handled above.
1027 if skip_collections_wildcard is not None:
1028 try:
1029 resolvedRefQueryResults = subset.findDatasets(
1030 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True
1031 )
1032 except MissingDatasetTypeError:
1033 resolvedRefQueryResults = []
1034 for resolvedRef in resolvedRefQueryResults:
1035 assert resolvedRef.dataId in refs
1036 refs[resolvedRef.dataId] = (
1037 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
1038 )
1040 # Look up input and initInput datasets in the input collection(s).
1041 # Container to accumulate unfound refs, if the common data IDs were not
1042 # constrained on dataset type existence.
1043 self.unfoundRefs = set()
1044 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1045 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
1046 if datasetType.isComponent():
1047 parent_dataset_type = datasetType.makeCompositeDatasetType()
1048 component = datasetType.component()
1049 else:
1050 parent_dataset_type = datasetType
1051 component = None
1052 try:
1053 resolvedRefQueryResults = commonDataIds.subset(
1054 datasetType.dimensions, unique=True
1055 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1056 except MissingDatasetTypeError:
1057 resolvedRefQueryResults = []
1058 dataIdsNotFoundYet = set(refs.keys())
1059 for resolvedRef in resolvedRefQueryResults:
1060 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1061 refs[resolvedRef.dataId] = (
1062 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1063 )
1064 if dataIdsNotFoundYet:
1065 if constrainedByAllDatasets:
1066 raise RuntimeError(
1067 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1068 f"'{datasetType.name}' was/were present in a previous "
1069 "query, but could not be found now. "
1070 "This is either a logic bug in QuantumGraph generation "
1071 "or the input collections have been modified since "
1072 "QuantumGraph generation began."
1073 )
1074 elif not datasetType.dimensions:
1075 raise RuntimeError(
1076 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
1077 f"collections {collections}."
1078 )
1079 else:
1080 # If the common data IDs were not constrained using all the
1081 # input dataset types, it is possible that some data IDs
1082 # found don't correspond to existing datasets and they
1083 # will be left unresolved. Mark these for later pruning
1084 # from the quantum graph.
1085 for k in dataIdsNotFoundYet:
1086 self.unfoundRefs.add(refs[k])
1088 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1089 # replacing the unresolved refs there, and then look up prerequisites.
1090 for task in self.tasks:
1091 _LOG.debug(
1092 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1093 len(task.quanta),
1094 task.taskDef.label,
1095 )
1096 # The way iterConnections is designed makes it impossible to
1097 # annotate precisely enough to satisfy MyPy here.
1098 lookupFunctions = {
1099 c.name: c.lookupFunction # type: ignore
1100 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1101 if c.lookupFunction is not None # type: ignore
1102 }
1103 dataIdsFailed = []
1104 dataIdsSucceeded = []
1105 for quantum in task.quanta.values():
1106 # Process output datasets only if skipExistingIn is not None
1107 # or there is a run to look for outputs in and clobberOutputs
1108 # is True. Note that if skipExistingIn is None, any output
1109 # datasets that already exist would have already caused an
1110 # exception to be raised. We never update the DatasetRefs in
1111 # the quantum because those should never be resolved.
1112 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1113 resolvedRefs = []
1114 unresolvedRefs = []
1115 haveMetadata = False
1116 for datasetType, originalRefs in quantum.outputs.items():
1117 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1118 if ref.id is not None:
1119 resolvedRefs.append(ref)
1120 if datasetType.name == task.taskDef.metadataDatasetName:
1121 haveMetadata = True
1122 else:
1123 unresolvedRefs.append(ref)
1124 if resolvedRefs:
1125 if haveMetadata or not unresolvedRefs:
1126 dataIdsSucceeded.append(quantum.dataId)
1127 if skip_collections_wildcard is not None:
1128 continue
1129 else:
1130 dataIdsFailed.append(quantum.dataId)
1131 if not clobberOutputs:
1132 raise OutputExistsError(
1133 f"Quantum {quantum.dataId} of task with label "
1134 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1135 f"({resolvedRefs}) "
1136 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1137 "and clobbering outputs was not enabled."
1138 )
1139 # Update the input DatasetRefs to the resolved ones we already
1140 # searched for.
1141 for datasetType, input_refs in quantum.inputs.items():
1142 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1143 input_refs[ref.dataId] = ref
1144 # Look up prerequisite datasets in the input collection(s).
1145 # These may have dimensions that extend beyond those we queried
1146 # for originally, because we want to permit those data ID
1147 # values to differ across quanta and dataset types.
1148 for datasetType in task.prerequisites:
1149 if datasetType.isComponent():
1150 parent_dataset_type = datasetType.makeCompositeDatasetType()
1151 component = datasetType.component()
1152 else:
1153 parent_dataset_type = datasetType
1154 component = None
1155 lookupFunction = lookupFunctions.get(datasetType.name)
1156 if lookupFunction is not None:
1157 # PipelineTask has provided its own function to do the
1158 # lookup. This always takes precedence.
1159 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1160 elif (
1161 datasetType.isCalibration()
1162 and datasetType.dimensions <= quantum.dataId.graph
1163 and quantum.dataId.graph.temporal
1164 ):
1165 # This is a master calibration lookup, which we have to
1166 # handle specially because the query system can't do a
1167 # temporal join on a non-dimension-based timespan yet.
1168 timespan = quantum.dataId.timespan
1169 try:
1170 prereq_ref = registry.findDataset(
1171 parent_dataset_type,
1172 quantum.dataId,
1173 collections=collections,
1174 timespan=timespan,
1175 )
1176 if prereq_ref is not None:
1177 if component is not None:
1178 prereq_ref = prereq_ref.makeComponentRef(component)
1179 prereq_refs = [prereq_ref]
1180 else:
1181 prereq_refs = []
1182 except (KeyError, MissingDatasetTypeError):
1183 # This dataset type is not present in the registry,
1184 # which just means there are no datasets here.
1185 prereq_refs = []
1186 else:
1187 # Most general case.
1188 prereq_refs = [
1189 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1190 for prereq_ref in registry.queryDatasets(
1191 parent_dataset_type,
1192 collections=collections,
1193 dataId=quantum.dataId,
1194 findFirst=True,
1195 ).expanded()
1196 ]
1197 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None}
1198 quantum.prerequisites[datasetType].update(prereq_refs_map)
1199 task.prerequisites[datasetType].update(prereq_refs_map)
1201 # Resolve all quantum inputs and outputs.
1202 if idMaker:
1203 for datasetDict in (quantum.inputs, quantum.outputs):
1204 for refDict in datasetDict.values():
1205 refDict.update(idMaker.resolveDict(refDict))
1207 # Resolve task initInputs and initOutputs.
1208 if idMaker:
1209 for datasetDict in (task.initInputs, task.initOutputs):
1210 for refDict in datasetDict.values():
1211 refDict.update(idMaker.resolveDict(refDict))
1213 # Actually remove any quanta that we decided to skip above.
1214 if dataIdsSucceeded:
1215 if skip_collections_wildcard is not None:
1216 _LOG.debug(
1217 "Pruning successful %d quanta for task with label '%s' because all of their "
1218 "outputs exist or metadata was written successfully.",
1219 len(dataIdsSucceeded),
1220 task.taskDef.label,
1221 )
1222 for dataId in dataIdsSucceeded:
1223 del task.quanta[dataId]
1224 elif clobberOutputs:
1225 _LOG.info(
1226 "Found %d successful quanta for task with label '%s' "
1227 "that will need to be clobbered during execution.",
1228 len(dataIdsSucceeded),
1229 task.taskDef.label,
1230 )
1231 else:
1232 raise AssertionError("OutputExistsError should have already been raised.")
1233 if dataIdsFailed:
1234 if clobberOutputs:
1235 _LOG.info(
1236 "Found %d failed/incomplete quanta for task with label '%s' "
1237 "that will need to be clobbered during execution.",
1238 len(dataIdsFailed),
1239 task.taskDef.label,
1240 )
1241 else:
1242 raise AssertionError("OutputExistsError should have already been raised.")
1244 # Collect initOutputs that do not belong to any task.
1245 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1246 for task in self.tasks:
1247 global_dataset_types -= set(task.initOutputs)
1248 if global_dataset_types:
1249 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1250 if idMaker is not None:
1251 for refDict in self.globalInitOutputs.values():
1252 refDict.update(idMaker.resolveDict(refDict))
1254 def makeQuantumGraph(
1255 self,
1256 registry: Registry,
1257 metadata: Optional[Mapping[str, Any]] = None,
1258 datastore: Optional[Datastore] = None,
1259 ) -> QuantumGraph:
1260 """Create a `QuantumGraph` from the quanta already present in
1261 the scaffolding data structure.
1263 Parameters
1264 ----------
1265 registry : `lsst.daf.butler.Registry`
1266 Registry for the data repository; used for all data ID queries.
1267 metadata : Optional Mapping of `str` to primitives
1268 This is an optional parameter of extra data to carry with the
1269 graph. Entries in this mapping should be able to be serialized in
1270 JSON.
1271 datastore : `Datastore`, optional
1272 If not `None` then fill datastore records in each generated
1273 Quantum.
1275 Returns
1276 -------
1277 graph : `QuantumGraph`
1278 The full `QuantumGraph`.
1279 """
1281 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1282 """Extract all DatasetRefs from the dictionaries"""
1283 for ref_dict in dataset_dict.values():
1284 yield from ref_dict.values()
1286 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1287 if datastore is not None:
1288 datastore_records = datastore.export_records(
1289 itertools.chain(
1290 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1291 )
1292 )
1294 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1295 for task in self.tasks:
1296 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1297 graphInput[task.taskDef] = qset
1299 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1300 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1302 globalInitOutputs: list[DatasetRef] = []
1303 if self.globalInitOutputs is not None:
1304 for refs_dict in self.globalInitOutputs.values():
1305 globalInitOutputs.extend(refs_dict.values())
1307 graph = QuantumGraph(
1308 graphInput,
1309 metadata=metadata,
1310 pruneRefs=self.unfoundRefs,
1311 universe=self.dimensions.universe,
1312 initInputs=taskInitInputs,
1313 initOutputs=taskInitOutputs,
1314 globalInitOutputs=globalInitOutputs,
1315 registryDatasetTypes=self._get_registry_dataset_types(registry),
1316 )
1317 return graph
1319 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]:
1320 """Make a list of all dataset types used by a graph as defined in
1321 registry.
1322 """
1323 chain = [
1324 self.initInputs,
1325 self.initIntermediates,
1326 self.initOutputs,
1327 self.inputs,
1328 self.intermediates,
1329 self.outputs,
1330 self.prerequisites,
1331 ]
1332 if self.globalInitOutputs is not None:
1333 chain.append(self.globalInitOutputs)
1335 # Collect names of all dataset types.
1336 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain))
1337 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)}
1339 # Check for types that do not exist in registry yet:
1340 # - inputs must exist
1341 # - intermediates and outputs may not exist, but there must not be
1342 # more than one definition (e.g. differing in storage class)
1343 # - prerequisites may not exist; treat them the same as outputs here
1344 for dstype in itertools.chain(self.initInputs, self.inputs):
1345 if dstype.name not in dataset_types:
1346 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}")
1348 new_outputs: dict[str, set[DatasetType]] = defaultdict(set)
1349 chain = [
1350 self.initIntermediates,
1351 self.initOutputs,
1352 self.intermediates,
1353 self.outputs,
1354 self.prerequisites,
1355 ]
1356 if self.globalInitOutputs is not None:
1357 chain.append(self.globalInitOutputs)
1358 for dstype in itertools.chain(*chain):
1359 if dstype.name not in dataset_types:
1360 new_outputs[dstype.name].add(dstype)
1361 for name, dstypes in new_outputs.items():
1362 if len(dstypes) > 1:
1363 raise ValueError(
1364 "Pipeline contains multiple definitions for a dataset type "
1365 f"which is not defined in registry yet: {dstypes}"
1366 )
1367 elif len(dstypes) == 1:
1368 dataset_types[name] = dstypes.pop()
1370 return dataset_types.values()
1373# ------------------------
1374# Exported definitions --
1375# ------------------------
1378class GraphBuilderError(Exception):
1379 """Base class for exceptions generated by graph builder."""
1381 pass
1384class OutputExistsError(GraphBuilderError):
1385 """Exception generated when output datasets already exist."""
1387 pass
1390class PrerequisiteMissingError(GraphBuilderError):
1391 """Exception generated when a prerequisite dataset does not exist."""
1393 pass
1396class GraphBuilder:
1397 """GraphBuilder class is responsible for building task execution graph from
1398 a Pipeline.
1400 Parameters
1401 ----------
1402 registry : `~lsst.daf.butler.Registry`
1403 Registry for the data repository; used for all data ID queries.
1404 skipExistingIn
1405 Expressions representing the collections to search for existing
1406 output datasets that should be skipped. See
1407 :ref:`daf_butler_ordered_collection_searches`.
1408 clobberOutputs : `bool`, optional
1409 If `True` (default), allow quanta to be created even if partial
1410 outputs exist; this requires the same behavior to be enabled when
1411 executing.
1412 datastore : `Datastore`, optional
1413 If not `None` then fill datastore records in each generated Quantum.
1414 """
1416 def __init__(
1417 self,
1418 registry: Registry,
1419 skipExistingIn: Any = None,
1420 clobberOutputs: bool = True,
1421 datastore: Optional[Datastore] = None,
1422 ):
1423 self.registry = registry
1424 self.dimensions = registry.dimensions
1425 self.skipExistingIn = skipExistingIn
1426 self.clobberOutputs = clobberOutputs
1427 self.datastore = datastore
1429 def makeGraph(
1430 self,
1431 pipeline: Union[Pipeline, Iterable[TaskDef]],
1432 collections: Any,
1433 run: Optional[str],
1434 userQuery: Optional[str],
1435 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1436 metadata: Optional[Mapping[str, Any]] = None,
1437 resolveRefs: bool = False,
1438 bind: Optional[Mapping[str, Any]] = None,
1439 ) -> QuantumGraph:
1440 """Create execution graph for a pipeline.
1442 Parameters
1443 ----------
1444 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1445 Pipeline definition, task names/classes and their configs.
1446 collections
1447 Expressions representing the collections to search for input
1448 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1449 run : `str`, optional
1450 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1451 output datasets. The collection does not have to exist; it will be
1452 created when the graph is executed.
1453 userQuery : `str`
1454 String which defines user-defined selection for registry; should be
1455 empty or `None` if there are no restrictions on data selection.
1456 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1457 The query constraint variant that should be used to constrain the
1458 query based on dataset existence; defaults to
1459 `DatasetQueryConstraintVariant.ALL`.
1460 metadata : Optional Mapping of `str` to primitives
1461 This is an optional parameter of extra data to carry with the
1462 graph. Entries in this mapping should be able to be serialized in
1463 JSON.
1464 resolveRefs : `bool`, optional
1465 If `True` then resolve all input references and generate random
1466 dataset IDs for all output and intermediate datasets. A true value
1467 requires the ``run`` collection to be specified.
1468 bind : `Mapping`, optional
1469 Mapping containing literal values that should be injected into the
1470 ``userQuery`` expression, keyed by the identifiers they replace.
1472 Returns
1473 -------
1474 graph : `QuantumGraph`
1475 The generated `QuantumGraph`.
1476 Raises
1477 ------
1478 UserExpressionError
1479 Raised when user expression cannot be parsed.
1480 OutputExistsError
1481 Raised when output datasets already exist.
1482 Exception
1483 Other exceptions types may be raised by underlying registry
1484 classes.
1485 """
1486 if resolveRefs and run is None:
1487 raise ValueError("`resolveRefs` requires `run` parameter.")
1488 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1489 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1490 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1491 instrument_class: Optional[Any] = None
1492 if isinstance(pipeline, Pipeline):
1493 instrument_class_name = pipeline.getInstrument()
1494 if instrument_class_name is not None:
1495 instrument_class = doImportType(instrument_class_name)
1496 pipeline = list(pipeline.toExpandedPipeline())
1497 if instrument_class is not None:
1498 dataId = DataCoordinate.standardize(
1499 instrument=instrument_class.getName(), universe=self.registry.dimensions
1500 )
1501 else:
1502 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1503 with scaffolding.connectDataIds(
1504 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1505 ) as commonDataIds:
1506 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1507 scaffolding.resolveDatasetRefs(
1508 self.registry,
1509 collections,
1510 run,
1511 commonDataIds,
1512 skipExistingIn=self.skipExistingIn,
1513 clobberOutputs=self.clobberOutputs,
1514 constrainedByAllDatasets=condition,
1515 resolveRefs=resolveRefs,
1516 )
1517 return scaffolding.makeQuantumGraph(
1518 registry=self.registry, metadata=metadata, datastore=self.datastore
1519 )
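# Hedged end-to-end usage sketch for `GraphBuilder`; ``butler``, the pipeline
# URI, collection names, and the query string are hypothetical examples, and
# the pipeline is assumed to be loaded from a YAML definition via
# `Pipeline.from_uri`:
#
#     pipeline = Pipeline.from_uri("my_pipeline.yaml")
#     builder = GraphBuilder(butler.registry, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )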