Coverage for python/lsst/pipe/base/graphBuilder.py: 15%
485 statements
coverage.py v6.5.0, created at 2023-01-13 02:51 -0800
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union
38from lsst.daf.butler import (
39 CollectionType,
40 DataCoordinate,
41 DatasetIdGenEnum,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 Quantum,
50 Registry,
51)
52from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
53from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
54from lsst.daf.butler.registry.wildcards import CollectionWildcard
55from lsst.utils import doImportType
57from ._datasetQueryConstraints import DatasetQueryConstraintVariant
58from ._status import NoWorkFound
60# -----------------------------
61# Imports for other modules --
62# -----------------------------
63from .connections import AdjustQuantumHelper, iterConnections
64from .graph import QuantumGraph
65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
67# ----------------------------------
68# Local non-exported definitions --
69# ----------------------------------
71_LOG = logging.getLogger(__name__)
74class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
75 """A custom dictionary that maps `DatasetType` to a nested dictionary of
76 the known `DatasetRef` instances of that type.
78 Parameters
79 ----------
80 args
81 Positional arguments are forwarded to the `dict` constructor.
82 universe : `DimensionUniverse`
83 Universe of all possible dimensions.
84 """
86 def __init__(self, *args: Any, universe: DimensionUniverse):
87 super().__init__(*args)
88 self.universe = universe
90 @classmethod
91 def fromDatasetTypes(
92 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
93 ) -> _DatasetDict:
94 """Construct a dictionary from a flat iterable of `DatasetType` keys.
96 Parameters
97 ----------
98 datasetTypes : `iterable` of `DatasetType`
99 DatasetTypes to use as keys for the dict. Values will be empty
100 dictionaries.
101 universe : `DimensionUniverse`
102 Universe of all possible dimensions.
104 Returns
105 -------
106 dictionary : `_DatasetDict`
107 A new `_DatasetDict` instance.
108 """
109 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
111 @classmethod
112 def fromSubset(
113 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
114 ) -> _DatasetDict:
115 """Return a new dictionary by extracting items corresponding to the
116 given keys from one or more existing dictionaries.
118 Parameters
119 ----------
120 datasetTypes : `iterable` of `DatasetType`
121 DatasetTypes to use as keys for the dict. Values will be obtained
122 by lookups against ``first`` and ``rest``.
123 first : `_DatasetDict`
124 Another dictionary from which to extract values.
125 rest
126 Additional dictionaries from which to extract values.
128 Returns
129 -------
130 dictionary : `_DatasetDict`
131 A new dictionary instance.
132 """
133 combined = ChainMap(first, *rest)
135 # Dataset types known to match immediately can be processed
136 # without checks.
137 matches = combined.keys() & set(datasetTypes)
138 _dict = {k: combined[k] for k in matches}
140 if len(_dict) < len(datasetTypes):
141 # Work out which ones are missing.
142 missing_datasetTypes = set(datasetTypes) - _dict.keys()
144 # Get the known names for comparison.
145 combined_by_name = {k.name: k for k in combined}
147 missing = set()
148 incompatible = {}
149 for datasetType in missing_datasetTypes:
150 # The dataset type is not found. It may not be listed
151 # at all, or it may be present with the same name but a
152 # different definition.
153 if datasetType.name in combined_by_name:
154 # This implies some inconsistency in definitions
155 # for connections. If there is support for storage
156 # class conversion we can let it slide.
157 # At this point we do not know where the
158 # inconsistency is, but trust that downstream code
159 # will be more explicit about input vs. output
160 # incompatibilities.
161 existing = combined_by_name[datasetType.name]
162 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
163 _LOG.warning(
164 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
165 datasetType,
166 existing,
167 )
168 _dict[datasetType] = combined[existing]
169 else:
170 incompatible[datasetType] = existing
171 else:
172 missing.add(datasetType)
174 if missing or incompatible:
175 reasons = []
176 if missing:
177 reasons.append(
178 "DatasetTypes " + ", ".join(d.name for d in missing) + " not present in "
179 "list of known types: " + ", ".join(d.name for d in combined)
180 )
181 if incompatible:
182 for x, y in incompatible.items():
183 reasons.append(f"{x} incompatible with {y}")
184 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
186 return cls(_dict, universe=first.universe)
188 @property
189 def dimensions(self) -> DimensionGraph:
190 """The union of all dimensions used by all dataset types in this
191 dictionary, including implied dependencies (`DimensionGraph`).
192 """
193 base = self.universe.empty
194 if len(self) == 0:
195 return base
196 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
198 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
199 """Unpack nested single-element `DatasetRef` dicts into a new
200 mapping with `DatasetType` keys and `DatasetRef` values.
202 This method assumes that each nested dictionary contains exactly one
203 item, as is the case for all "init" datasets.
205 Returns
206 -------
207 dictionary : `NamedKeyDict`
208 Dictionary mapping `DatasetType` to `DatasetRef`, with both
209 `DatasetType` instances and string names usable as keys.
210 """
212 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
213 (ref,) = refs.values()
214 return ref
216 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
218 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
219 """Unpack nested multi-element `DatasetRef` dicts into a new
220 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
222 Returns
223 -------
224 dictionary : `NamedKeyDict`
225 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
226 both `DatasetType` instances and string names usable as keys.
227 """
228 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
230 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
231 """Iterate over the contained `DatasetRef` instances that match the
232 given `DatasetType` and data IDs.
234 Parameters
235 ----------
236 datasetType : `DatasetType`
237 Dataset type to match.
238 dataIds : `Iterable` [ `DataCoordinate` ]
239 Data IDs to match.
241 Returns
242 -------
243 refs : `Iterator` [ `DatasetRef` ]
244 DatasetRef instances for which ``ref.datasetType == datasetType``
245 and ``ref.dataId`` is in ``dataIds``.
246 """
247 refs = self[datasetType]
248 return (refs[dataId] for dataId in dataIds)
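# Illustrative sketch: how a _DatasetDict is typically built and unpacked
# during graph generation. Assumes an existing `DimensionUniverse` and
# `DatasetType` instances; the helper name below is hypothetical.
def _example_dataset_dict(
    dataset_types: Iterable[DatasetType], universe: DimensionUniverse
) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
    holder = _DatasetDict.fromDatasetTypes(dataset_types, universe=universe)
    # Init-level datasets always use the empty data ID, mirroring
    # _PipelineScaffolding.connectDataIds below.
    empty_data_id = DataCoordinate.makeEmpty(universe)
    for dataset_type, refs in holder.items():
        refs[empty_data_id] = DatasetRef(dataset_type, empty_data_id)
    # The nested {data ID: ref} dicts can then be flattened per dataset type.
    return holder.unpackMultiRefs()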
251class _QuantumScaffolding:
252 """Helper class aggregating information about a `Quantum`, used when
253 constructing a `QuantumGraph`.
255 See `_PipelineScaffolding` for a top-down description of the full
256 scaffolding data structure.
258 Parameters
259 ----------
260 task : _TaskScaffolding
261 Back-reference to the helper object for the `PipelineTask` this quantum
262 represents an execution of.
263 dataId : `DataCoordinate`
264 Data ID for this quantum.
265 """
267 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
268 self.task = task
269 self.dataId = dataId
270 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
271 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
272 self.prerequisites = _DatasetDict.fromDatasetTypes(
273 task.prerequisites.keys(), universe=dataId.universe
274 )
276 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
278 def __repr__(self) -> str:
279 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
281 task: _TaskScaffolding
282 """Back-reference to the helper object for the `PipelineTask` this quantum
283 represents an execution of.
284 """
286 dataId: DataCoordinate
287 """Data ID for this quantum.
288 """
290 inputs: _DatasetDict
291 """Nested dictionary containing `DatasetRef` inputs to this quantum.
293 This is initialized to map each `DatasetType` to an empty dictionary at
294 construction. Those nested dictionaries are populated (with data IDs as
295 keys) with unresolved `DatasetRef` instances in
296 `_PipelineScaffolding.connectDataIds`.
297 """
299 outputs: _DatasetDict
300 """Nested dictionary containing `DatasetRef` outputs of this quantum.
301 """
303 prerequisites: _DatasetDict
304 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
305 quantum.
306 """
308 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
309 """Transform the scaffolding object into a true `Quantum` instance.
311 Parameters
312 ----------
313 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
314 If not `None` then fill datastore records in each generated Quantum
315 using the records from this structure.
317 Returns
318 -------
319 quantum : `Quantum`
320 An actual `Quantum` instance.
321 """
322 allInputs = self.inputs.unpackMultiRefs()
323 allInputs.update(self.prerequisites.unpackMultiRefs())
324 # Give the task's Connections class an opportunity to remove some
325 # inputs, or complain if they are unacceptable.
326 # This will raise if one of the check conditions is not met, which is
327 # the intended behavior.
328 # If it raises NoWorkFound, there is a bug in the QG algorithm
329 # or the adjustQuantum is incorrectly trying to make a prerequisite
330 # input behave like a regular input; adjustQuantum should only raise
331 # NoWorkFound if a regular input is missing, and it shouldn't be
332 # possible for us to have generated ``self`` if that's true.
333 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
334 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
335 initInputs = self.task.initInputs.unpackSingleRefs()
336 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
337 if datastore_records is not None:
338 quantum_records = {}
339 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
340 input_refs += list(initInputs.values())
341 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
342 for datastore_name, records in datastore_records.items():
343 matching_records = records.subset(input_ids)
344 if matching_records is not None:
345 quantum_records[datastore_name] = matching_records
346 return Quantum(
347 taskName=self.task.taskDef.taskName,
348 taskClass=self.task.taskDef.taskClass,
349 dataId=self.dataId,
350 initInputs=initInputs,
351 inputs=helper.inputs,
352 outputs=helper.outputs,
353 datastore_records=quantum_records,
354 )
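# Illustrative sketch: the datastore-record filtering performed inside
# _QuantumScaffolding.makeQuantum, factored out for clarity. It keeps only
# the records whose dataset IDs appear among the quantum's resolved input
# refs. The function name is hypothetical.
def _example_filter_datastore_records(
    datastore_records: Mapping[str, DatastoreRecordData], input_refs: Iterable[DatasetRef]
) -> Dict[str, DatastoreRecordData]:
    input_ids = {ref.id for ref in input_refs if ref.id is not None}
    filtered: Dict[str, DatastoreRecordData] = {}
    for datastore_name, records in datastore_records.items():
        matching_records = records.subset(input_ids)
        if matching_records is not None:
            filtered[datastore_name] = matching_records
    return filtered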
357@dataclass
358class _TaskScaffolding:
359 """Helper class aggregating information about a `PipelineTask`, used when
360 constructing a `QuantumGraph`.
362 See `_PipelineScaffolding` for a top-down description of the full
363 scaffolding data structure.
365 Parameters
366 ----------
367 taskDef : `TaskDef`
368 Data structure that identifies the task class and its config.
369 parent : `_PipelineScaffolding`
370 The parent data structure that will hold the instance being
371 constructed.
372 datasetTypes : `TaskDatasetTypes`
373 Data structure that categorizes the dataset types used by this task.
374 """
376 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
377 universe = parent.dimensions.universe
378 self.taskDef = taskDef
379 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
380 assert self.dimensions.issubset(parent.dimensions)
381 # Initialize _DatasetDicts as subsets of the one or two
382 # corresponding dicts in the parent _PipelineScaffolding.
383 self.initInputs = _DatasetDict.fromSubset(
384 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
385 )
386 self.initOutputs = _DatasetDict.fromSubset(
387 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
388 )
389 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
390 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
391 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
392 self.dataIds: Set[DataCoordinate] = set()
393 self.quanta = {}
395 def __repr__(self) -> str:
396 # Default dataclass-injected __repr__ gets caught in an infinite loop
397 # because of back-references.
398 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
400 taskDef: TaskDef
401 """Data structure that identifies the task class and its config
402 (`TaskDef`).
403 """
405 dimensions: DimensionGraph
406 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
407 """
409 initInputs: _DatasetDict
410 """Dictionary containing information about datasets used to construct this
411 task (`_DatasetDict`).
412 """
414 initOutputs: _DatasetDict
415 """Dictionary containing information about datasets produced as a
416 side-effect of constructing this task (`_DatasetDict`).
417 """
419 inputs: _DatasetDict
420 """Dictionary containing information about datasets used as regular,
421 graph-constraining inputs to this task (`_DatasetDict`).
422 """
424 outputs: _DatasetDict
425 """Dictionary containing information about datasets produced by this task
426 (`_DatasetDict`).
427 """
429 prerequisites: _DatasetDict
430 """Dictionary containing information about input datasets that must be
431 present in the repository before any Pipeline containing this task is run
432 (`_DatasetDict`).
433 """
435 quanta: Dict[DataCoordinate, _QuantumScaffolding]
436 """Dictionary mapping data ID to a scaffolding object for the Quantum of
437 this task with that data ID.
438 """
440 def makeQuantumSet(
441 self,
442 unresolvedRefs: Optional[Set[DatasetRef]] = None,
443 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
444 ) -> Set[Quantum]:
445 """Create a `set` of `Quantum` from the information in ``self``.
447 Parameters
448 ----------
449 unresolvedRefs : `set` [ `DatasetRef` ], optional
450 Input dataset refs that have not been found.
451 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
452 If not `None` then fill datastore records in each generated Quantum.
454 Returns
455 -------
456 nodes : `set` of `Quantum`
457 The `Quantum` elements corresponding to this task.
458 """
459 if unresolvedRefs is None:
460 unresolvedRefs = set()
461 outputs = set()
462 for q in self.quanta.values():
463 try:
464 tmpQuantum = q.makeQuantum(datastore_records)
465 outputs.add(tmpQuantum)
466 except (NoWorkFound, FileNotFoundError) as exc:
467 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
468 if unresolvedRefs.intersection(refs):
469 # This means it is a node that is known to be pruned
470 # later and should be left in even though some follow-up
471 # queries fail. This allows the pruning to start from this
472 # quantum with known issues, and prune other nodes it
473 # touches.
474 inputs = q.inputs.unpackMultiRefs()
475 inputs.update(q.prerequisites.unpackMultiRefs())
476 tmpQuantum = Quantum(
477 taskName=q.task.taskDef.taskName,
478 taskClass=q.task.taskDef.taskClass,
479 dataId=q.dataId,
480 initInputs=q.task.initInputs.unpackSingleRefs(),
481 inputs=inputs,
482 outputs=q.outputs.unpackMultiRefs(),
483 )
484 outputs.add(tmpQuantum)
485 else:
486 raise exc
487 return outputs
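# Illustrative sketch: assembling per-task graph nodes the same way
# _PipelineScaffolding.makeQuantumGraph does further below. Assumes a
# populated scaffolding; the helper name is hypothetical.
def _example_collect_quanta(
    tasks: Iterable[_TaskScaffolding], unfound_refs: Set[DatasetRef]
) -> Dict[TaskDef, Set[Quantum]]:
    graph_input: Dict[TaskDef, Set[Quantum]] = {}
    for task in tasks:
        # Quanta whose regular inputs intersect ``unfound_refs`` are kept so
        # that QuantumGraph pruning can start from them.
        graph_input[task.taskDef] = task.makeQuantumSet(unresolvedRefs=unfound_refs)
    return graph_input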
490class _DatasetIdMaker:
491 """Helper class which generates random dataset UUIDs for unresolved
492 datasets.
493 """
495 def __init__(self, registry: Registry, run: str):
496 self.datasetIdFactory = registry.datasetIdFactory
497 self.run = run
498 # Dataset IDs generated so far
499 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {}
501 def resolveRef(self, ref: DatasetRef) -> DatasetRef:
502 if ref.id is not None:
503 return ref
504 # For components we need their parent dataset type.
505 if ref.isComponent():
506 ref = ref.makeCompositeRef()
507 # Basic consistency check: the parent should be resolved if this
508 # is an existing input, or already in the cache if it is
509 # an intermediate.
510 if ref.id is None and (ref.datasetType, ref.dataId) not in self.resolved:
511 raise ValueError(f"Composite dataset is missing from cache: {ref}")
512 key = ref.datasetType, ref.dataId
513 if (resolved := self.resolved.get(key)) is None:
514 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE)
515 self.resolved[key] = resolved
516 return resolved
518 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]:
519 """Resolve all unresolved references in the provided dictionary."""
520 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()}
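# Illustrative sketch: resolving the refs of a populated _DatasetDict in
# place, as done near the end of _PipelineScaffolding.resolveDatasetRefs.
# The ``run`` collection name is hypothetical.
def _example_resolve_outputs(registry: Registry, outputs: _DatasetDict) -> None:
    id_maker = _DatasetIdMaker(registry, run="u/someone/example_run")
    for ref_dict in outputs.values():
        ref_dict.update(id_maker.resolveDict(ref_dict))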
523@dataclass
524class _PipelineScaffolding:
525 """A helper data structure that organizes the information involved in
526 constructing a `QuantumGraph` for a `Pipeline`.
528 Parameters
529 ----------
530 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
531 Sequence of tasks from which a graph is to be constructed. Must
532 have nested task classes already imported.
533 universe : `DimensionUniverse`
534 Universe of all possible dimensions.
536 Notes
537 -----
538 The scaffolding data structure contains nested data structures for both
539 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
540 data structures are shared between the pipeline-level structure (which
541 aggregates all datasets and categorizes them from the perspective of the
542 complete pipeline) and the individual tasks that use them as inputs and
543 outputs.
545 `QuantumGraph` construction proceeds in four steps, with each corresponding
546 to a different `_PipelineScaffolding` method:
548 1. When `_PipelineScaffolding` is constructed, we extract and categorize
549 the DatasetTypes used by the pipeline (delegating to
550 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
551 nested `_TaskScaffolding` and `_DatasetDict` objects.
553 2. In `connectDataIds`, we construct and run the "Big Join Query", which
554 returns related tuples of all dimensions used to identify any regular
555 input, output, and intermediate datasets (not prerequisites). We then
556 iterate over these tuples of related dimensions, identifying the subsets
557 that correspond to distinct data IDs for each task and dataset type,
558 and then create `_QuantumScaffolding` objects.
560 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
561 dataset data IDs previously identified, transforming unresolved
562 DatasetRefs into resolved DatasetRefs where appropriate. We then look
563 up prerequisite datasets for all quanta.
565 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
566 per-task `_QuantumScaffolding` objects.
567 """
569 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
570 _LOG.debug("Initializing data structures for QuantumGraph generation.")
571 self.tasks = []
572 # Aggregate and categorize the DatasetTypes in the Pipeline.
573 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
574 # Construct dictionaries that map those DatasetTypes to structures
575 # that will (later) hold additional information about them.
576 for attr in (
577 "initInputs",
578 "initIntermediates",
579 "initOutputs",
580 "inputs",
581 "intermediates",
582 "outputs",
583 "prerequisites",
584 ):
585 setattr(
586 self,
587 attr,
588 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
589 )
590 # Aggregate all dimensions for all non-init, non-prerequisite
591 # DatasetTypes. These are the ones we'll include in the big join
592 # query.
593 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
594 # Construct scaffolding nodes for each Task, and add backreferences
595 # to the Task from each DatasetScaffolding node.
596 # Note that there's only one scaffolding node for each DatasetType,
597 # shared by _PipelineScaffolding and all _TaskScaffoldings that
598 # reference it.
599 if isinstance(pipeline, Pipeline):
600 pipeline = pipeline.toExpandedPipeline()
601 self.tasks = [
602 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
603 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
604 ]
606 def __repr__(self) -> str:
607 # Default dataclass-injected __repr__ gets caught in an infinite loop
608 # because of back-references.
609 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
611 tasks: List[_TaskScaffolding]
612 """Scaffolding data structures for each task in the pipeline
613 (`list` of `_TaskScaffolding`).
614 """
616 initInputs: _DatasetDict
617 """Datasets consumed but not produced when constructing the tasks in this
618 pipeline (`_DatasetDict`).
619 """
621 initIntermediates: _DatasetDict
622 """Datasets that are both consumed and produced when constructing the tasks
623 in this pipeline (`_DatasetDict`).
624 """
626 initOutputs: _DatasetDict
627 """Datasets produced but not consumed when constructing the tasks in this
628 pipeline (`_DatasetDict`).
629 """
631 inputs: _DatasetDict
632 """Datasets that are consumed but not produced when running this pipeline
633 (`_DatasetDict`).
634 """
636 intermediates: _DatasetDict
637 """Datasets that are both produced and consumed when running this pipeline
638 (`_DatasetDict`).
639 """
641 outputs: _DatasetDict
642 """Datasets produced but not consumed when running this pipeline
643 (`_DatasetDict`).
644 """
646 prerequisites: _DatasetDict
647 """Datasets that are consumed when running this pipeline and looked up
648 per-Quantum when generating the graph (`_DatasetDict`).
649 """
651 dimensions: DimensionGraph
652 """All dimensions used by any regular input, intermediate, or output
653 (not prerequisite) dataset; the set of dimensions used in the "Big Join
654 Query" (`DimensionGraph`).
656 This is required to be a superset of all task quantum dimensions.
657 """
659 globalInitOutputs: _DatasetDict | None = None
660 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`).
661 """
663 @contextmanager
664 def connectDataIds(
665 self,
666 registry: Registry,
667 collections: Any,
668 userQuery: Optional[str],
669 externalDataId: DataCoordinate,
670 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
671 bind: Optional[Mapping[str, Any]] = None,
672 ) -> Iterator[DataCoordinateQueryResults]:
673 """Query for the data IDs that connect nodes in the `QuantumGraph`.
675 This method populates `_TaskScaffolding.dataIds` and
676 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
678 Parameters
679 ----------
680 registry : `lsst.daf.butler.Registry`
681 Registry for the data repository; used for all data ID queries.
682 collections
683 Expressions representing the collections to search for input
684 datasets. See :ref:`daf_butler_ordered_collection_searches`.
685 userQuery : `str` or `None`
686 User-provided expression to limit the data IDs processed.
687 externalDataId : `DataCoordinate`
688 Externally-provided data ID that should be used to restrict the
689 results, just as if these constraints had been included via ``AND``
690 in ``userQuery``. This includes (at least) any instrument named
691 in the pipeline definition.
692 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
693 The query constraint variant that should be used to constrain the
694 query based on dataset existence, defaults to
695 `DatasetQueryConstraintVariant.ALL`.
696 bind : `Mapping`, optional
697 Mapping containing literal values that should be injected into the
698 ``userQuery`` expression, keyed by the identifiers they replace.
700 Returns
701 -------
702 commonDataIds : \
703 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
704 An interface to a database temporary table containing all data IDs
705 that will appear in this `QuantumGraph`. Returned inside a
706 context manager, which will drop the temporary table at the end of
707 the `with` block in which this method is called.
708 """
709 _LOG.debug("Building query for data IDs.")
710 # Initialization datasets always have empty data IDs.
711 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
712 for datasetType, refs in itertools.chain(
713 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
714 ):
715 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
716 # Run one big query for the data IDs for task dimensions and regular
717 # inputs and outputs. We limit the query to only dimensions that are
718 # associated with the input dataset types, but don't (yet) try to
719 # obtain the dataset_ids for those inputs.
720 _LOG.debug("Submitting data ID query and materializing results.")
721 queryArgs: Dict[str, Any] = {
722 "dimensions": self.dimensions,
723 "where": userQuery,
724 "dataId": externalDataId,
725 "bind": bind,
726 }
727 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
728 _LOG.debug("Constraining graph query using all datasets in pipeline.")
729 queryArgs["datasets"] = list(self.inputs)
730 queryArgs["collections"] = collections
731 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
732 _LOG.debug("Not using dataset existence to constrain query.")
733 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
734 constraint = set(datasetQueryConstraint)
735 inputs = {k.name: k for k in self.inputs.keys()}
736 if remainder := constraint.difference(inputs.keys()):
737 raise ValueError(
738 f"{remainder} dataset type(s) specified as a graph constraint, but"
739 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
740 )
741 _LOG.debug("Constraining graph query using %s", constraint)
742 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
743 queryArgs["collections"] = collections
744 else:
745 raise ValueError(
746 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
747 )
749 if "datasets" in queryArgs:
750 for i, dataset_type in enumerate(queryArgs["datasets"]):
751 if dataset_type.isComponent():
752 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType()
754 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
755 _LOG.debug("Expanding data IDs.")
756 commonDataIds = commonDataIds.expanded()
757 _LOG.debug("Iterating over query results to associate quanta with datasets.")
758 # Iterate over query results, populating data IDs for datasets and
759 # quanta and then connecting them to each other.
760 n = -1
761 for n, commonDataId in enumerate(commonDataIds):
762 _LOG.debug("Next DataID = %s", commonDataId)
763 # Create DatasetRefs for all DatasetTypes from this result row,
764 # noting that we might have created some already.
765 # We remember both those that already existed and those that we
766 # create now.
767 refsForRow = {}
768 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
769 for datasetType, refs in itertools.chain(
770 self.inputs.items(), self.intermediates.items(), self.outputs.items()
771 ):
772 datasetDataId: Optional[DataCoordinate]
773 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
774 datasetDataId = commonDataId.subset(datasetType.dimensions)
775 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
776 ref = refs.get(datasetDataId)
777 if ref is None:
778 ref = DatasetRef(datasetType, datasetDataId)
779 _LOG.debug("Made new ref = %s", ref)
780 refs[datasetDataId] = ref
781 refsForRow[datasetType.name] = ref
782 # Create _QuantumScaffolding objects for all tasks from this
783 # result row, noting that we might have created some already.
784 for task in self.tasks:
785 quantumDataId = commonDataId.subset(task.dimensions)
786 quantum = task.quanta.get(quantumDataId)
787 if quantum is None:
788 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
789 task.quanta[quantumDataId] = quantum
790 # Whether this is a new quantum or an existing one, we can
791 # now associate the DatasetRefs for this row with it. The
792 # fact that a Quantum data ID and a dataset data ID both
793 # came from the same result row is what tells us they
794 # should be associated.
795 # Many of these associations will be duplicates (because
796 # another query row that differed from this one only in
797 # irrelevant dimensions already added them), and we rely on
798 # dict semantics to skip them.
799 for datasetType in task.inputs:
800 ref = refsForRow[datasetType.name]
801 quantum.inputs[datasetType.name][ref.dataId] = ref
802 for datasetType in task.outputs:
803 ref = refsForRow[datasetType.name]
804 quantum.outputs[datasetType.name][ref.dataId] = ref
805 if n < 0:
806 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
807 emptiness_explained = False
808 for message in commonDataIds.explain_no_results():
809 _LOG.critical(message)
810 emptiness_explained = True
811 if not emptiness_explained:
812 _LOG.critical(
813 "To reproduce this query for debugging purposes, run "
814 "Registry.queryDataIds with these arguments:"
815 )
816 # We could just repr() the queryArgs dict to get something
817 # the user could make sense of, but it's friendlier to
818 # put these args in an easier-to-construct equivalent form
819 # so they can read it more easily and copy and paste into
820 # a Python terminal.
821 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names))
822 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName())
823 if queryArgs["where"]:
824 _LOG.critical(" where=%s,", repr(queryArgs["where"]))
825 if "datasets" in queryArgs:
826 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
827 if "collections" in queryArgs:
828 _LOG.critical(" collections=%s,", list(queryArgs["collections"]))
829 _LOG.debug("Finished processing %d rows from data ID query.", n)
830 yield commonDataIds
832 def resolveDatasetRefs(
833 self,
834 registry: Registry,
835 collections: Any,
836 run: Optional[str],
837 commonDataIds: DataCoordinateQueryResults,
838 *,
839 skipExistingIn: Any = None,
840 clobberOutputs: bool = True,
841 constrainedByAllDatasets: bool = True,
842 resolveRefs: bool = False,
843 ) -> None:
844 """Perform follow-up queries for each dataset data ID produced in
845 `connectDataIds`.
847 This method populates `_DatasetScaffolding.refs` (except for those in
848 `prerequisites`).
850 Parameters
851 ----------
852 registry : `lsst.daf.butler.Registry`
853 Registry for the data repository; used for all data ID queries.
854 collections
855 Expressions representing the collections to search for input
856 datasets. See :ref:`daf_butler_ordered_collection_searches`.
857 run : `str`, optional
858 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
859 output datasets, if it already exists.
860 commonDataIds : \
861 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
862 Result of a previous call to `connectDataIds`.
863 skipExistingIn
864 Expressions representing the collections to search for existing
865 output datasets that should be skipped. See
866 :ref:`daf_butler_ordered_collection_searches` for allowed types.
867 `None` or empty string/sequence disables skipping.
868 clobberOutputs : `bool`, optional
869 If `True` (default), allow quanta to be created even if outputs
870 exist; this requires the same behavior to be enabled when
871 executing. If ``skipExistingIn`` is not `None`, completed quanta
872 (those with metadata, or all outputs if there is no metadata
873 dataset configured) will be skipped rather than clobbered.
874 constrainedByAllDatasets : `bool`, optional
875 Indicates if the commonDataIds were generated with a constraint on
876 all dataset types.
877 resolveRefs : `bool`, optional
878 If `True` then resolve all input references and generate random
879 dataset IDs for all output and intermediate datasets. A `True`
880 value requires the ``run`` collection to be specified.
882 Raises
883 ------
884 OutputExistsError
885 Raised if an output dataset already exists in the output run
886 and ``skipExistingIn`` does not include output run, or if only
887 some outputs are present and ``clobberOutputs`` is `False`.
888 """
889 # Run may be provided but it does not have to exist; in that case we
890 # use it for resolving references but don't check it for existing refs.
891 run_exists = False
892 if run:
893 try:
894 run_exists = bool(registry.queryCollections(run))
895 except MissingCollectionError:
896 # Undocumented exception is raised if it does not exist.
897 pass
899 skip_collections_wildcard: CollectionWildcard | None = None
900 skipExistingInRun = False
901 if skipExistingIn:
902 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn)
903 if run_exists:
904 # As an optimization, check the explicit list of names first.
905 skipExistingInRun = run in skip_collections_wildcard.strings
906 if not skipExistingInRun:
907 # need to flatten it and check again
908 skipExistingInRun = run in registry.queryCollections(
909 skipExistingIn,
910 collectionTypes=CollectionType.RUN,
911 )
913 idMaker: Optional[_DatasetIdMaker] = None
914 if resolveRefs:
915 assert run is not None, "run cannot be None when resolveRefs is True"
916 idMaker = _DatasetIdMaker(registry, run)
918 resolvedRefQueryResults: Iterable[DatasetRef]
920 # Look up [init] intermediate and output datasets in the output
921 # collection, if there is an output collection.
922 if run_exists or skip_collections_wildcard is not None:
923 for datasetType, refs in itertools.chain(
924 self.initIntermediates.items(),
925 self.initOutputs.items(),
926 self.intermediates.items(),
927 self.outputs.items(),
928 ):
929 _LOG.debug(
930 "Resolving %d datasets for intermediate and/or output dataset %s.",
931 len(refs),
932 datasetType.name,
933 )
934 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
935 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
936 # TODO: this assert incorrectly bans component inputs;
937 # investigate on DM-33027.
938 # assert not datasetType.isComponent(), \
939 # "Output datasets cannot be components."
940 #
941 # Instead we have to handle them manually to avoid a
942 # deprecation warning, but it is at least confusing and
943 # possibly a bug for components to appear here at all.
944 if datasetType.isComponent():
945 parent_dataset_type = datasetType.makeCompositeDatasetType()
946 component = datasetType.component()
947 else:
948 parent_dataset_type = datasetType
949 component = None
951 # look at RUN collection first
952 if run_exists:
953 try:
954 resolvedRefQueryResults = subset.findDatasets(
955 parent_dataset_type, collections=run, findFirst=True
956 )
957 except MissingDatasetTypeError:
958 resolvedRefQueryResults = []
959 for resolvedRef in resolvedRefQueryResults:
960 # TODO: we could easily support per-DatasetType
961 # skipExisting and I could imagine that being useful -
962 # it's probably required in order to support writing
963 # initOutputs before QuantumGraph generation.
964 assert resolvedRef.dataId in refs
965 if not (skipExistingInRun or isInit or clobberOutputs):
966 raise OutputExistsError(
967 f"Output dataset {datasetType.name} already exists in "
968 f"output RUN collection '{run}' with data ID"
969 f" {resolvedRef.dataId}."
970 )
971 # If we are going to resolve all outputs then we have
972 # to remember existing ones to avoid generating new
973 # dataset IDs for them.
974 if resolveRefs:
975 refs[resolvedRef.dataId] = (
976 resolvedRef.makeComponentRef(component)
977 if component is not None
978 else resolvedRef
979 )
981 # And check skipExistingIn too; if the RUN collection is in it,
982 # that case is already handled above.
983 if skip_collections_wildcard is not None:
984 try:
985 resolvedRefQueryResults = subset.findDatasets(
986 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True
987 )
988 except MissingDatasetTypeError:
989 resolvedRefQueryResults = []
990 for resolvedRef in resolvedRefQueryResults:
991 assert resolvedRef.dataId in refs
992 refs[resolvedRef.dataId] = (
993 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef
994 )
996 # Look up input and initInput datasets in the input collection(s).
997 # Container to accumulate unfound refs if the common data IDs were not
998 # constrained on dataset type existence.
999 self.unfoundRefs = set()
1000 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
1001 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
1002 if datasetType.isComponent():
1003 parent_dataset_type = datasetType.makeCompositeDatasetType()
1004 component = datasetType.component()
1005 else:
1006 parent_dataset_type = datasetType
1007 component = None
1008 try:
1009 resolvedRefQueryResults = commonDataIds.subset(
1010 datasetType.dimensions, unique=True
1011 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True)
1012 except MissingDatasetTypeError:
1013 resolvedRefQueryResults = []
1014 dataIdsNotFoundYet = set(refs.keys())
1015 for resolvedRef in resolvedRefQueryResults:
1016 dataIdsNotFoundYet.discard(resolvedRef.dataId)
1017 refs[resolvedRef.dataId] = (
1018 resolvedRef if component is None else resolvedRef.makeComponentRef(component)
1019 )
1020 if dataIdsNotFoundYet:
1021 if constrainedByAllDatasets:
1022 raise RuntimeError(
1023 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
1024 f"'{datasetType.name}' was/were present in a previous "
1025 f"query, but could not be found now. "
1026 f"This is either a logic bug in QuantumGraph generation "
1027 f"or the input collections have been modified since "
1028 f"QuantumGraph generation began."
1029 )
1030 else:
1031 # If the common data IDs were not constrained using all the
1032 # input dataset types, it is possible that some data IDs
1033 # found don't correspond to existing dataset types and they
1034 # will be unresolved. Mark these for later pruning from
1035 # the quantum graph.
1036 for k in dataIdsNotFoundYet:
1037 self.unfoundRefs.add(refs[k])
1039 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
1040 # replacing the unresolved refs there, and then look up prerequisites.
1041 for task in self.tasks:
1042 _LOG.debug(
1043 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
1044 len(task.quanta),
1045 task.taskDef.label,
1046 )
1047 # The way iterConnections is designed makes it impossible to
1048 # annotate precisely enough to satisfy MyPy here.
1049 lookupFunctions = {
1050 c.name: c.lookupFunction # type: ignore
1051 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
1052 if c.lookupFunction is not None # type: ignore
1053 }
1054 dataIdsFailed = []
1055 dataIdsSucceeded = []
1056 for quantum in task.quanta.values():
1057 # Process output datasets only if skipExistingIn is not None
1058 # or there is a run to look for outputs in and clobberOutputs
1059 # is True. Note that if skipExistingIn is None, any output
1060 # datasets that already exist would have already caused an
1061 # exception to be raised. We never update the DatasetRefs in
1062 # the quantum because those should never be resolved.
1063 if skip_collections_wildcard is not None or (run_exists and clobberOutputs):
1064 resolvedRefs = []
1065 unresolvedRefs = []
1066 haveMetadata = False
1067 for datasetType, originalRefs in quantum.outputs.items():
1068 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
1069 if ref.id is not None:
1070 resolvedRefs.append(ref)
1071 if datasetType.name == task.taskDef.metadataDatasetName:
1072 haveMetadata = True
1073 else:
1074 unresolvedRefs.append(ref)
1075 if resolvedRefs:
1076 if haveMetadata or not unresolvedRefs:
1077 dataIdsSucceeded.append(quantum.dataId)
1078 if skip_collections_wildcard is not None:
1079 continue
1080 else:
1081 dataIdsFailed.append(quantum.dataId)
1082 if not clobberOutputs:
1083 raise OutputExistsError(
1084 f"Quantum {quantum.dataId} of task with label "
1085 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1086 f"({resolvedRefs}) "
1087 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1088 "and clobbering outputs was not enabled."
1089 )
1090 # Update the input DatasetRefs to the resolved ones we already
1091 # searched for.
1092 for datasetType, input_refs in quantum.inputs.items():
1093 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1094 input_refs[ref.dataId] = ref
1095 # Look up prerequisite datasets in the input collection(s).
1096 # These may have dimensions that extend beyond those we queried
1097 # for originally, because we want to permit those data ID
1098 # values to differ across quanta and dataset types.
1099 for datasetType in task.prerequisites:
1100 if datasetType.isComponent():
1101 parent_dataset_type = datasetType.makeCompositeDatasetType()
1102 component = datasetType.component()
1103 else:
1104 parent_dataset_type = datasetType
1105 component = None
1106 lookupFunction = lookupFunctions.get(datasetType.name)
1107 if lookupFunction is not None:
1108 # PipelineTask has provided its own function to do the
1109 # lookup. This always takes precedence.
1110 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1111 elif (
1112 datasetType.isCalibration()
1113 and datasetType.dimensions <= quantum.dataId.graph
1114 and quantum.dataId.graph.temporal
1115 ):
1116 # This is a master calibration lookup, which we have to
1117 # handle specially because the query system can't do a
1118 # temporal join on a non-dimension-based timespan yet.
1119 timespan = quantum.dataId.timespan
1120 try:
1121 prereq_ref = registry.findDataset(
1122 parent_dataset_type,
1123 quantum.dataId,
1124 collections=collections,
1125 timespan=timespan,
1126 )
1127 if prereq_ref is not None:
1128 if component is not None:
1129 prereq_ref = prereq_ref.makeComponentRef(component)
1130 prereq_refs = [prereq_ref]
1131 else:
1132 prereq_refs = []
1133 except (KeyError, MissingDatasetTypeError):
1134 # This dataset type is not present in the registry,
1135 # which just means there are no datasets here.
1136 prereq_refs = []
1137 else:
1138 # Most general case.
1139 prereq_refs = [
1140 prereq_ref if component is None else prereq_ref.makeComponentRef(component)
1141 for prereq_ref in registry.queryDatasets(
1142 parent_dataset_type,
1143 collections=collections,
1144 dataId=quantum.dataId,
1145 findFirst=True,
1146 ).expanded()
1147 ]
1148 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None}
1149 quantum.prerequisites[datasetType].update(prereq_refs_map)
1150 task.prerequisites[datasetType].update(prereq_refs_map)
1152 # Resolve all quantum inputs and outputs.
1153 if idMaker:
1154 for datasetDict in (quantum.inputs, quantum.outputs):
1155 for refDict in datasetDict.values():
1156 refDict.update(idMaker.resolveDict(refDict))
1158 # Resolve task initInputs and initOutputs.
1159 if idMaker:
1160 for datasetDict in (task.initInputs, task.initOutputs):
1161 for refDict in datasetDict.values():
1162 refDict.update(idMaker.resolveDict(refDict))
1164 # Actually remove any quanta that we decided to skip above.
1165 if dataIdsSucceeded:
1166 if skip_collections_wildcard is not None:
1167 _LOG.debug(
1168 "Pruning successful %d quanta for task with label '%s' because all of their "
1169 "outputs exist or metadata was written successfully.",
1170 len(dataIdsSucceeded),
1171 task.taskDef.label,
1172 )
1173 for dataId in dataIdsSucceeded:
1174 del task.quanta[dataId]
1175 elif clobberOutputs:
1176 _LOG.info(
1177 "Found %d successful quanta for task with label '%s' "
1178 "that will need to be clobbered during execution.",
1179 len(dataIdsSucceeded),
1180 task.taskDef.label,
1181 )
1182 else:
1183 raise AssertionError("OutputExistsError should have already been raised.")
1184 if dataIdsFailed:
1185 if clobberOutputs:
1186 _LOG.info(
1187 "Found %d failed/incomplete quanta for task with label '%s' "
1188 "that will need to be clobbered during execution.",
1189 len(dataIdsFailed),
1190 task.taskDef.label,
1191 )
1192 else:
1193 raise AssertionError("OutputExistsError should have already been raised.")
1195 # Collect initOutputs that do not belong to any task.
1196 global_dataset_types: set[DatasetType] = set(self.initOutputs)
1197 for task in self.tasks:
1198 global_dataset_types -= set(task.initOutputs)
1199 if global_dataset_types:
1200 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs)
1201 if idMaker is not None:
1202 for refDict in self.globalInitOutputs.values():
1203 refDict.update(idMaker.resolveDict(refDict))
1205 def makeQuantumGraph(
1206 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None
1207 ) -> QuantumGraph:
1208 """Create a `QuantumGraph` from the quanta already present in
1209 the scaffolding data structure.
1211 Parameters
1212 ----------
1213 metadata : `Mapping` of `str` to primitives, optional
1214 Optional extra data to carry with the graph. Entries in this
1215 mapping should be serializable to JSON.
1217 datastore : `Datastore`, optional
1218 If not `None` then fill datastore records in each generated
1219 Quantum.
1221 Returns
1222 -------
1223 graph : `QuantumGraph`
1224 The full `QuantumGraph`.
1225 """
1227 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1228 """Extract all DatasetRefs from the dictionaries."""
1229 for ref_dict in dataset_dict.values():
1230 yield from ref_dict.values()
1232 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1233 if datastore is not None:
1234 datastore_records = datastore.export_records(
1235 itertools.chain(
1236 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1237 )
1238 )
1240 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1241 for task in self.tasks:
1242 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1243 graphInput[task.taskDef] = qset
1245 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks}
1246 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks}
1248 globalInitOutputs: list[DatasetRef] = []
1249 if self.globalInitOutputs is not None:
1250 for refs_dict in self.globalInitOutputs.values():
1251 globalInitOutputs.extend(refs_dict.values())
1253 graph = QuantumGraph(
1254 graphInput,
1255 metadata=metadata,
1256 pruneRefs=self.unfoundRefs,
1257 universe=self.dimensions.universe,
1258 initInputs=taskInitInputs,
1259 initOutputs=taskInitOutputs,
1260 globalInitOutputs=globalInitOutputs,
1261 )
1262 return graph
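# Illustrative sketch: the four construction steps described in the
# _PipelineScaffolding docstring, shown end to end. ``collections``, ``run``
# and ``user_query`` are assumed to be supplied by the caller; the supported
# public entry point is GraphBuilder.makeGraph below.
def _example_build_graph_by_hand(
    registry: Registry,
    pipeline: Pipeline,
    collections: Any,
    run: str,
    user_query: Optional[str],
) -> QuantumGraph:
    # Step 1: categorize dataset types and build the scaffolding.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
    # Step 2: run the "Big Join Query" for all connecting data IDs.
    with scaffolding.connectDataIds(registry, collections, user_query, empty_data_id) as common_data_ids:
        # Step 3: follow-up queries to resolve input/output DatasetRefs.
        scaffolding.resolveDatasetRefs(registry, collections, run, common_data_ids)
    # Step 4: assemble the QuantumGraph from the per-task quanta.
    return scaffolding.makeQuantumGraph()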
1265# ------------------------
1266# Exported definitions --
1267# ------------------------
1270class GraphBuilderError(Exception):
1271 """Base class for exceptions generated by the graph builder."""
1273 pass
1276class OutputExistsError(GraphBuilderError):
1277 """Exception generated when output datasets already exist."""
1279 pass
1282class PrerequisiteMissingError(GraphBuilderError):
1283 """Exception generated when a prerequisite dataset does not exist."""
1285 pass
1288class GraphBuilder:
1289 """GraphBuilder is responsible for building a task execution graph from
1290 a Pipeline.
1292 Parameters
1293 ----------
1294 registry : `~lsst.daf.butler.Registry`
1295 Registry for the data repository.
1296 skipExistingIn
1297 Expressions representing the collections to search for existing
1298 output datasets that should be skipped. See
1299 :ref:`daf_butler_ordered_collection_searches`.
1300 clobberOutputs : `bool`, optional
1301 If `True` (default), allow quanta to be created even if partial
1302 outputs exist; this requires the same behavior to be enabled when
1303 executing.
1304 datastore : `Datastore`, optional
1305 If not `None` then fill datastore records in each generated Quantum.
1306 """
1308 def __init__(
1309 self,
1310 registry: Registry,
1311 skipExistingIn: Any = None,
1312 clobberOutputs: bool = True,
1313 datastore: Optional[Datastore] = None,
1314 ):
1315 self.registry = registry
1316 self.dimensions = registry.dimensions
1317 self.skipExistingIn = skipExistingIn
1318 self.clobberOutputs = clobberOutputs
1319 self.datastore = datastore
1321 def makeGraph(
1322 self,
1323 pipeline: Union[Pipeline, Iterable[TaskDef]],
1324 collections: Any,
1325 run: Optional[str],
1326 userQuery: Optional[str],
1327 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1328 metadata: Optional[Mapping[str, Any]] = None,
1329 resolveRefs: bool = False,
1330 bind: Optional[Mapping[str, Any]] = None,
1331 ) -> QuantumGraph:
1332 """Create execution graph for a pipeline.
1334 Parameters
1335 ----------
1336 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1337 Pipeline definition, task names/classes and their configs.
1338 collections
1339 Expressions representing the collections to search for input
1340 datasets. See :ref:`daf_butler_ordered_collection_searches`.
1341 run : `str`, optional
1342 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1343 output datasets. The collection does not have to exist; it will be
1344 created when the graph is executed.
1345 userQuery : `str` or `None`
1346 String defining the user's data selection for the registry; should
1347 be empty or `None` if there are no restrictions on data selection.
1348 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1349 The query constraint variant that should be used to constrain the
1350 query based on dataset existence, defaults to
1351 `DatasetQueryConstraintVariant.ALL`.
1352 metadata : `Mapping` of `str` to primitives, optional
1353 Optional extra data to carry with the graph. Entries in this
1354 mapping should be serializable to JSON.
1356 resolveRefs : `bool`, optional
1357 If `True` then resolve all input references and generate random
1358 dataset IDs for all output and intermediate datasets. A `True`
1359 value requires the ``run`` collection to be specified.
1360 bind : `Mapping`, optional
1361 Mapping containing literal values that should be injected into the
1362 ``userQuery`` expression, keyed by the identifiers they replace.
1364 Returns
1365 -------
1366 graph : `QuantumGraph`
1368 Raises
1369 ------
1370 UserExpressionError
1371 Raised when user expression cannot be parsed.
1372 OutputExistsError
1373 Raised when output datasets already exist.
1374 Exception
1375 Other exceptions types may be raised by underlying registry
1376 classes.
1377 """
1378 if resolveRefs and run is None:
1379 raise ValueError("`resolveRefs` requires `run` parameter.")
1380 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1381 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1382 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1383 instrument_class: Optional[Any] = None
1384 if isinstance(pipeline, Pipeline):
1385 instrument_class_name = pipeline.getInstrument()
1386 if instrument_class_name is not None:
1387 instrument_class = doImportType(instrument_class_name)
1388 pipeline = list(pipeline.toExpandedPipeline())
1389 if instrument_class is not None:
1390 dataId = DataCoordinate.standardize(
1391 instrument=instrument_class.getName(), universe=self.registry.dimensions
1392 )
1393 else:
1394 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1395 with scaffolding.connectDataIds(
1396 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind
1397 ) as commonDataIds:
1398 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1399 scaffolding.resolveDatasetRefs(
1400 self.registry,
1401 collections,
1402 run,
1403 commonDataIds,
1404 skipExistingIn=self.skipExistingIn,
1405 clobberOutputs=self.clobberOutputs,
1406 constrainedByAllDatasets=condition,
1407 resolveRefs=resolveRefs,
1408 )
1409 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)
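# Illustrative usage sketch for the public API. The registry, pipeline, and
# collection/run names and the user query here are assumptions supplied by
# the caller; the helper name is hypothetical.
def _example_make_graph(registry: Registry, pipeline: Pipeline) -> QuantumGraph:
    builder = GraphBuilder(registry, skipExistingIn=None, clobberOutputs=True)
    return builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],  # hypothetical input collections
        run="u/someone/example_run",  # hypothetical output RUN collection
        userQuery="instrument = 'HSC' AND visit = 12345",  # hypothetical selection
        datasetQueryConstraint=DatasetQueryConstraintVariant.ALL,
    )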