Coverage for python/lsst/pipe/base/graphBuilder.py: 18%
394 statements
coverage.py v6.5.0, created at 2023-10-26 15:47 +0000
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Union
38from lsst.daf.butler import (
39 CollectionSearch,
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 NamedValueSet,
50 Quantum,
51 Registry,
52)
53from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
54from lsst.utils import doImportType
56from ._datasetQueryConstraints import DatasetQueryConstraintVariant
57from ._status import NoWorkFound
59# -----------------------------
60# Imports for other modules --
61# -----------------------------
62from .connections import AdjustQuantumHelper, iterConnections
63from .graph import QuantumGraph
64from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
66# ----------------------------------
67# Local non-exported definitions --
68# ----------------------------------
70_LOG = logging.getLogger(__name__)
73class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
74 """A custom dictionary that maps `DatasetType` to a nested dictionary of
75 the known `DatasetRef` instances of that type.
77 Parameters
78 ----------
79 args
80 Positional arguments are forwarded to the `dict` constructor.
81 universe : `DimensionUniverse`
82 Universe of all possible dimensions.
83 """
85 def __init__(self, *args: Any, universe: DimensionUniverse):
86 super().__init__(*args)
87 self.universe = universe
89 @classmethod
90 def fromDatasetTypes(
91 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
92 ) -> _DatasetDict:
93 """Construct a dictionary from a flat iterable of `DatasetType` keys.
95 Parameters
96 ----------
97 datasetTypes : `iterable` of `DatasetType`
98 DatasetTypes to use as keys for the dict. Values will be empty
99 dictionaries.
100 universe : `DimensionUniverse`
101 Universe of all possible dimensions.
103 Returns
104 -------
105 dictionary : `_DatasetDict`
106 A new `_DatasetDict` instance.
107 """
108 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
110 @classmethod
111 def fromSubset(
112 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
113 ) -> _DatasetDict:
114 """Return a new dictionary by extracting items corresponding to the
115 given keys from one or more existing dictionaries.
117 Parameters
118 ----------
119 datasetTypes : `iterable` of `DatasetType`
120 DatasetTypes to use as keys for the dict. Values will be obtained
121 by lookups against ``first`` and ``rest``.
122 first : `_DatasetDict`
123 Another dictionary from which to extract values.
124 rest
125 Additional dictionaries from which to extract values.
127 Returns
128 -------
129 dictionary : `_DatasetDict`
130 A new dictionary instance.
131 """
132 combined = ChainMap(first, *rest)
134 # Dataset types known to match immediately can be processed
135 # without checks.
136 matches = combined.keys() & set(datasetTypes)
137 _dict = {k: combined[k] for k in matches}
139 if len(_dict) < len(datasetTypes):
140 # Work out which ones are missing.
141 missing_datasetTypes = set(datasetTypes) - _dict.keys()
143 # Get the known names for comparison.
144 combined_by_name = {k.name: k for k in combined}
146 missing = set()
147 incompatible = {}
148 for datasetType in missing_datasetTypes:
149 # The dataset type is not found. It may not be listed
150 # or it may be that it is there with the same name
151 # but different definition.
152 if datasetType.name in combined_by_name:
153 # This implies some inconsistency in definitions
154 # for connections. If there is support for storage
155 # class conversion we can let it slide.
156 # At this point we do not know
157 # where the inconsistency is, but trust that downstream
158 # code will be more explicit about input vs output
159 # incompatibilities.
160 existing = combined_by_name[datasetType.name]
161 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
162 _LOG.warning(
163 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
164 datasetType,
165 existing,
166 )
167 _dict[datasetType] = combined[existing]
168 else:
169 incompatible[datasetType] = existing
170 else:
171 missing.add(datasetType)
173 if missing or incompatible:
174 reasons = []
175 if missing:
176 reasons.append(
177 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
178 + ", ".join(d.name for d in combined)
179 )
180 if incompatible:
181 for x, y in incompatible.items():
182 reasons.append(f"{x} incompatible with {y}")
183 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
185 return cls(_dict, universe=first.universe)
187 @property
188 def dimensions(self) -> DimensionGraph:
189 """The union of all dimensions used by all dataset types in this
190 dictionary, including implied dependencies (`DimensionGraph`).
191 """
192 base = self.universe.empty
193 if len(self) == 0:
194 return base
195 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
197 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
198 """Unpack nested single-element `DatasetRef` dicts into a new
199 mapping with `DatasetType` keys and `DatasetRef` values.
201 This method assumes that each nested dictionary contains exactly one
202 item, as is the case for all "init" datasets.
204 Returns
205 -------
206 dictionary : `NamedKeyDict`
207 Dictionary mapping `DatasetType` to `DatasetRef`, with both
208 `DatasetType` instances and string names usable as keys.
209 """
211 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
212 (ref,) = refs.values()
213 return ref
215 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
217 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
218 """Unpack nested multi-element `DatasetRef` dicts into a new
219 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
221 Returns
222 -------
223 dictionary : `NamedKeyDict`
224 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
225 both `DatasetType` instances and string names usable as keys.
226 """
227 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
229 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
230 """Iterate over the contained `DatasetRef` instances that match the
231 given `DatasetType` and data IDs.
233 Parameters
234 ----------
235 datasetType : `DatasetType`
236 Dataset type to match.
237 dataIds : `Iterable` [ `DataCoordinate` ]
238 Data IDs to match.
240 Returns
241 -------
242 refs : `Iterator` [ `DatasetRef` ]
243 DatasetRef instances for which ``ref.datasetType == datasetType``
244 and ``ref.dataId`` is in ``dataIds``.
245 """
246 refs = self[datasetType]
247 return (refs[dataId] for dataId in dataIds)
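# --- Illustrative sketch (not part of the original module) ------------------
# A minimal, hedged example of how the `_DatasetDict` helpers above fit
# together. The dataset type name "calexp" is a hypothetical placeholder; any
# dataset type registered in the repository would do.
def _example_dataset_dict_usage(
    registry: Registry,
) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
    calexp = registry.getDatasetType("calexp")
    refs = _DatasetDict.fromDatasetTypes([calexp], universe=registry.dimensions)
    # connectDataIds-style population: data ID -> unresolved DatasetRef.
    for data_id in registry.queryDataIds(calexp.dimensions):
        refs[calexp][data_id] = DatasetRef(calexp, data_id)
    # Flatten to {DatasetType: [DatasetRef, ...]}, as makeQuantum does.
    return refs.unpackMultiRefs()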
250class _QuantumScaffolding:
251 """Helper class aggregating information about a `Quantum`, used when
252 constructing a `QuantumGraph`.
254 See `_PipelineScaffolding` for a top-down description of the full
255 scaffolding data structure.
257 Parameters
258 ----------
259 task : _TaskScaffolding
260 Back-reference to the helper object for the `PipelineTask` this quantum
261 represents an execution of.
262 dataId : `DataCoordinate`
263 Data ID for this quantum.
264 """
266 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
267 self.task = task
268 self.dataId = dataId
269 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
270 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
271 self.prerequisites = _DatasetDict.fromDatasetTypes(
272 task.prerequisites.keys(), universe=dataId.universe
273 )
275 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
277 def __repr__(self) -> str:
278 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
280 task: _TaskScaffolding
281 """Back-reference to the helper object for the `PipelineTask` this quantum
282 represents an execution of.
283 """
285 dataId: DataCoordinate
286 """Data ID for this quantum.
287 """
289 inputs: _DatasetDict
290 """Nested dictionary containing `DatasetRef` inputs to this quantum.
292 This is initialized to map each `DatasetType` to an empty dictionary at
293 construction. Those nested dictionaries are populated (with data IDs as
294 keys) with unresolved `DatasetRef` instances in
295 `_PipelineScaffolding.connectDataIds`.
296 """
298 outputs: _DatasetDict
299 """Nested dictionary containing `DatasetRef` outputs this quantum.
300 """
302 prerequisites: _DatasetDict
303 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
304 quantum.
305 """
307 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
308 """Transform the scaffolding object into a true `Quantum` instance.
310 Parameters
311 ----------
312 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
313 If not `None` then fill datastore records in each generated Quantum
314 using the records from this structure.
316 Returns
317 -------
318 quantum : `Quantum`
319 An actual `Quantum` instance.
320 """
321 allInputs = self.inputs.unpackMultiRefs()
322 allInputs.update(self.prerequisites.unpackMultiRefs())
323 # Give the task's Connections class an opportunity to remove some
324 # inputs, or complain if they are unacceptable.
325 # This will raise if one of the check conditions is not met, which is
326 # the intended behavior.
327 # If it raises NoWorkFound, there is a bug in the QG algorithm
328 # or the adjustQuantum is incorrectly trying to make a prerequisite
329 # input behave like a regular input; adjustQuantum should only raise
330 # NoWorkFound if a regular input is missing, and it shouldn't be
331 # possible for us to have generated ``self`` if that's true.
332 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
333 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
334 initInputs = self.task.initInputs.unpackSingleRefs()
335 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
336 if datastore_records is not None:
337 quantum_records = {}
338 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
339 input_refs += list(initInputs.values())
340 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
341 for datastore_name, records in datastore_records.items():
342 matching_records = records.subset(input_ids)
343 if matching_records is not None:
344 quantum_records[datastore_name] = matching_records
345 return Quantum(
346 taskName=self.task.taskDef.taskName,
347 taskClass=self.task.taskDef.taskClass,
348 dataId=self.dataId,
349 initInputs=initInputs,
350 inputs=helper.inputs,
351 outputs=helper.outputs,
352 datastore_records=quantum_records,
353 )
356@dataclass
357class _TaskScaffolding:
358 """Helper class aggregating information about a `PipelineTask`, used when
359 constructing a `QuantumGraph`.
361 See `_PipelineScaffolding` for a top-down description of the full
362 scaffolding data structure.
364 Parameters
365 ----------
366 taskDef : `TaskDef`
367 Data structure that identifies the task class and its config.
368 parent : `_PipelineScaffolding`
369 The parent data structure that will hold the instance being
370 constructed.
371 datasetTypes : `TaskDatasetTypes`
372 Data structure that categorizes the dataset types used by this task.
373 """
375 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
376 universe = parent.dimensions.universe
377 self.taskDef = taskDef
378 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
379 assert self.dimensions.issubset(parent.dimensions)
380 # Initialize _DatasetDicts as subsets of the one or two
381 # corresponding dicts in the parent _PipelineScaffolding.
382 self.initInputs = _DatasetDict.fromSubset(
383 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
384 )
385 self.initOutputs = _DatasetDict.fromSubset(
386 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
387 )
388 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
389 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
390 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
391 self.dataIds: Set[DataCoordinate] = set()
392 self.quanta = {}
394 def __repr__(self) -> str:
395 # Default dataclass-injected __repr__ gets caught in an infinite loop
396 # because of back-references.
397 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
399 taskDef: TaskDef
400 """Data structure that identifies the task class and its config
401 (`TaskDef`).
402 """
404 dimensions: DimensionGraph
405 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
406 """
408 initInputs: _DatasetDict
409 """Dictionary containing information about datasets used to construct this
410 task (`_DatasetDict`).
411 """
413 initOutputs: _DatasetDict
414 """Dictionary containing information about datasets produced as a
415 side-effect of constructing this task (`_DatasetDict`).
416 """
418 inputs: _DatasetDict
419 """Dictionary containing information about datasets used as regular,
420 graph-constraining inputs to this task (`_DatasetDict`).
421 """
423 outputs: _DatasetDict
424 """Dictionary containing information about datasets produced by this task
425 (`_DatasetDict`).
426 """
428 prerequisites: _DatasetDict
429 """Dictionary containing information about input datasets that must be
430 present in the repository before any Pipeline containing this task is run
431 (`_DatasetDict`).
432 """
434 quanta: Dict[DataCoordinate, _QuantumScaffolding]
435 """Dictionary mapping data ID to a scaffolding object for the Quantum of
436 this task with that data ID.
437 """
439 def makeQuantumSet(
440 self,
441 unresolvedRefs: Optional[Set[DatasetRef]] = None,
442 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
443 ) -> Set[Quantum]:
444 """Create a `set` of `Quantum` from the information in ``self``.
446 Parameters
447 ----------
448 unresolvedRefs : `set` [ `DatasetRef` ], optional
449 Input dataset refs that have not been found.
450 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
If not `None`, fill datastore records in each generated `Quantum`
using the records from this structure.
453 Returns
454 -------
455 nodes : `set` of `Quantum`
456 The `Quantum` elements corresponding to this task.
457 """
458 if unresolvedRefs is None:
459 unresolvedRefs = set()
460 outputs = set()
461 for q in self.quanta.values():
462 try:
463 tmpQuanta = q.makeQuantum(datastore_records)
464 outputs.add(tmpQuanta)
465 except (NoWorkFound, FileNotFoundError) as exc:
466 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
467 if unresolvedRefs.intersection(refs):
468 # This means it is a node that is known to be pruned
469 # later and should be left in even though some follow-up
470 # queries fail. This allows the pruning to start from this
471 # quantum with known issues, and prune other nodes it
472 # touches.
473 inputs = q.inputs.unpackMultiRefs()
474 inputs.update(q.prerequisites.unpackMultiRefs())
475 tmpQuantum = Quantum(
476 taskName=q.task.taskDef.taskName,
477 taskClass=q.task.taskDef.taskClass,
478 dataId=q.dataId,
479 initInputs=q.task.initInputs.unpackSingleRefs(),
480 inputs=inputs,
481 outputs=q.outputs.unpackMultiRefs(),
482 )
483 outputs.add(tmpQuantum)
484 else:
485 raise exc
486 return outputs
489@dataclass
490class _PipelineScaffolding:
491 """A helper data structure that organizes the information involved in
492 constructing a `QuantumGraph` for a `Pipeline`.
494 Parameters
495 ----------
496 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
497 Sequence of tasks from which a graph is to be constructed. Must
498 have nested task classes already imported.
499 universe : `DimensionUniverse`
500 Universe of all possible dimensions.
502 Notes
503 -----
504 The scaffolding data structure contains nested data structures for both
505 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
506 data structures are shared between the pipeline-level structure (which
507 aggregates all datasets and categorizes them from the perspective of the
508 complete pipeline) and the individual tasks that use them as inputs and
509 outputs.
511 `QuantumGraph` construction proceeds in four steps, with each corresponding
512 to a different `_PipelineScaffolding` method:
514 1. When `_PipelineScaffolding` is constructed, we extract and categorize
515 the DatasetTypes used by the pipeline (delegating to
516 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
517 nested `_TaskScaffolding` and `_DatasetDict` objects.
519 2. In `connectDataIds`, we construct and run the "Big Join Query", which
520 returns related tuples of all dimensions used to identify any regular
521 input, output, and intermediate datasets (not prerequisites). We then
522 iterate over these tuples of related dimensions, identifying the subsets
523 that correspond to distinct data IDs for each task and dataset type,
524 and then create `_QuantumScaffolding` objects.
526 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
527 dataset data IDs previously identified, transforming unresolved
528 DatasetRefs into resolved DatasetRefs where appropriate. We then look
529 up prerequisite datasets for all quanta.
531 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
532 per-task `_QuantumScaffolding` objects.
533 """
535 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
536 _LOG.debug("Initializing data structures for QuantumGraph generation.")
537 self.tasks = []
538 # Aggregate and categorize the DatasetTypes in the Pipeline.
539 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
540 # Construct dictionaries that map those DatasetTypes to structures
541 # that will (later) hold additional information about them.
542 for attr in (
543 "initInputs",
544 "initIntermediates",
545 "initOutputs",
546 "inputs",
547 "intermediates",
548 "outputs",
549 "prerequisites",
550 ):
551 setattr(
552 self,
553 attr,
554 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
555 )
556 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints
557 # Aggregate all dimensions for all non-init, non-prerequisite
558 # DatasetTypes. These are the ones we'll include in the big join
559 # query.
560 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
561 # Construct scaffolding nodes for each Task, and add backreferences
562 # to the Task from each DatasetScaffolding node.
563 # Note that there's only one scaffolding node for each DatasetType,
564 # shared by _PipelineScaffolding and all _TaskScaffoldings that
565 # reference it.
566 if isinstance(pipeline, Pipeline):
567 pipeline = pipeline.toExpandedPipeline()
568 self.tasks = [
569 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
570 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
571 ]
573 def __repr__(self) -> str:
574 # Default dataclass-injected __repr__ gets caught in an infinite loop
575 # because of back-references.
576 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
578 tasks: List[_TaskScaffolding]
579 """Scaffolding data structures for each task in the pipeline
580 (`list` of `_TaskScaffolding`).
581 """
583 initInputs: _DatasetDict
584 """Datasets consumed but not produced when constructing the tasks in this
585 pipeline (`_DatasetDict`).
586 """
588 initIntermediates: _DatasetDict
589 """Datasets that are both consumed and produced when constructing the tasks
590 in this pipeline (`_DatasetDict`).
591 """
593 initOutputs: _DatasetDict
594 """Datasets produced but not consumed when constructing the tasks in this
595 pipeline (`_DatasetDict`).
596 """
598 inputs: _DatasetDict
599 """Datasets that are consumed but not produced when running this pipeline
600 (`_DatasetDict`).
601 """
603 intermediates: _DatasetDict
604 """Datasets that are both produced and consumed when running this pipeline
605 (`_DatasetDict`).
606 """
608 outputs: _DatasetDict
609 """Datasets produced but not consumed when when running this pipeline
610 (`_DatasetDict`).
611 """
613 prerequisites: _DatasetDict
614 """Datasets that are consumed when running this pipeline and looked up
615 per-Quantum when generating the graph (`_DatasetDict`).
616 """
618 defaultDatasetQueryConstraints: NamedValueSet[DatasetType]
619 """Datasets that should be used as constraints in the initial query,
620 according to tasks (`NamedValueSet`).
621 """
623 dimensions: DimensionGraph
624 """All dimensions used by any regular input, intermediate, or output
625 (not prerequisite) dataset; the set of dimensions used in the "Big Join
626 Query" (`DimensionGraph`).
628 This is required to be a superset of all task quantum dimensions.
629 """
631 @contextmanager
632 def connectDataIds(
633 self,
634 registry: Registry,
635 collections: Any,
636 userQuery: Optional[str],
637 externalDataId: DataCoordinate,
638 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
639 ) -> Iterator[DataCoordinateQueryResults]:
640 """Query for the data IDs that connect nodes in the `QuantumGraph`.
642 This method populates `_TaskScaffolding.dataIds` and
643 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
645 Parameters
646 ----------
647 registry : `lsst.daf.butler.Registry`
648 Registry for the data repository; used for all data ID queries.
649 collections
650 Expressions representing the collections to search for input
651 datasets. May be any of the types accepted by
652 `lsst.daf.butler.CollectionSearch.fromExpression`.
653 userQuery : `str` or `None`
654 User-provided expression to limit the data IDs processed.
655 externalDataId : `DataCoordinate`
656 Externally-provided data ID that should be used to restrict the
657 results, just as if these constraints had been included via ``AND``
658 in ``userQuery``. This includes (at least) any instrument named
659 in the pipeline definition.
660 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
661 The query constraint variant that should be used to constrain the
662 query based on dataset existence, defaults to
663 `DatasetQueryConstraintVariant.ALL`.
665 Returns
666 -------
667 commonDataIds : \
668 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
669 An interface to a database temporary table containing all data IDs
670 that will appear in this `QuantumGraph`. Returned inside a
671 context manager, which will drop the temporary table at the end of
672 the `with` block in which this method is called.
673 """
674 _LOG.debug("Building query for data IDs.")
675 # Initialization datasets always have empty data IDs.
676 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
677 for datasetType, refs in itertools.chain(
678 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
679 ):
680 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
681 # Run one big query for the data IDs for task dimensions and regular
682 # inputs and outputs. We limit the query to only dimensions that are
683 # associated with the input dataset types, but don't (yet) try to
684 # obtain the dataset_ids for those inputs.
685 _LOG.debug(
686 "Submitting data ID query over dimensions %s and materializing results.",
687 list(self.dimensions.names),
688 )
689 queryArgs: Dict[str, Any] = {
690 "dimensions": self.dimensions,
691 "where": userQuery,
692 "dataId": externalDataId,
693 }
694 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
695 _LOG.debug(
696 "Constraining graph query using default of %s.",
697 list(self.defaultDatasetQueryConstraints.names),
698 )
699 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints)
700 queryArgs["collections"] = collections
701 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
702 _LOG.debug("Not using dataset existence to constrain query.")
703 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
704 constraint = set(datasetQueryConstraint)
705 inputs = {k.name: k for k in self.inputs.keys()}
706 if remainder := constraint.difference(inputs.keys()):
707 raise ValueError(
708 f"{remainder} dataset type(s) specified as a graph constraint, but"
709 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
710 )
711 _LOG.debug("Constraining graph query using %s", constraint)
712 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
713 queryArgs["collections"] = collections
714 else:
715 raise ValueError(
716 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
717 )
719 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
720 _LOG.debug("Expanding data IDs.")
721 commonDataIds = commonDataIds.expanded()
722 _LOG.debug("Iterating over query results to associate quanta with datasets.")
723 # Iterate over query results, populating data IDs for datasets and
724 # quanta and then connecting them to each other.
725 n = -1
726 for n, commonDataId in enumerate(commonDataIds):
727 # Create DatasetRefs for all DatasetTypes from this result row,
728 # noting that we might have created some already.
729 # We remember both those that already existed and those that we
730 # create now.
731 refsForRow = {}
732 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
733 for datasetType, refs in itertools.chain(
734 self.inputs.items(), self.intermediates.items(), self.outputs.items()
735 ):
736 datasetDataId: Optional[DataCoordinate]
737 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
738 datasetDataId = commonDataId.subset(datasetType.dimensions)
739 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
740 ref = refs.get(datasetDataId)
741 if ref is None:
742 ref = DatasetRef(datasetType, datasetDataId)
743 refs[datasetDataId] = ref
744 refsForRow[datasetType.name] = ref
745 # Create _QuantumScaffolding objects for all tasks from this
746 # result row, noting that we might have created some already.
747 for task in self.tasks:
748 quantumDataId = commonDataId.subset(task.dimensions)
749 quantum = task.quanta.get(quantumDataId)
750 if quantum is None:
751 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
752 task.quanta[quantumDataId] = quantum
753 # Whether this is a new quantum or an existing one, we can
754 # now associate the DatasetRefs for this row with it. The
755 # fact that a Quantum data ID and a dataset data ID both
756 # came from the same result row is what tells us they
757 # should be associated.
758 # Many of these associations will be duplicates (because
759 # another query row that differed from this one only in
760 # irrelevant dimensions already added them); the dict
761 # assignments below simply overwrite them with equal values.
762 for datasetType in task.inputs:
763 ref = refsForRow[datasetType.name]
764 quantum.inputs[datasetType.name][ref.dataId] = ref
765 for datasetType in task.outputs:
766 ref = refsForRow[datasetType.name]
767 quantum.outputs[datasetType.name][ref.dataId] = ref
768 if n < 0:
769 emptiness_explained = False
770 for message in commonDataIds.explain_no_results():
771 _LOG.warning(message)
772 emptiness_explained = True
773 if not emptiness_explained:
774 _LOG.warning(
775 "To reproduce this query for debugging purposes, run "
776 "Registry.queryDataIds with these arguments:"
777 )
778 # We could just repr() the queryArgs dict to get something
779 # the user could make sense of, but it's friendlier to
780 # put these args in an easier-to-construct equivalent form
781 # so they can read it more easily and copy and paste into
782 # a Python terminal.
783 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names))
784 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName())
785 if queryArgs["where"]:
786 _LOG.warning(" where=%s,", repr(queryArgs["where"]))
787 if "datasets" in queryArgs:
788 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
789 if "collections" in queryArgs:
790 _LOG.warning(" collections=%s,", list(queryArgs["collections"]))
791 _LOG.debug("Finished processing %d rows from data ID query.", n)
792 yield commonDataIds
794 def resolveDatasetRefs(
795 self,
796 registry: Registry,
797 collections: Any,
798 run: Optional[str],
799 commonDataIds: DataCoordinateQueryResults,
800 *,
801 skipExistingIn: Any = None,
802 clobberOutputs: bool = True,
803 constrainedByAllDatasets: bool = True,
804 ) -> None:
805 """Perform follow up queries for each dataset data ID produced in
806 `fillDataIds`.
808 This method populates `_DatasetScaffolding.refs` (except for those in
809 `prerequisites`).
811 Parameters
812 ----------
813 registry : `lsst.daf.butler.Registry`
814 Registry for the data repository; used for all data ID queries.
815 collections
816 Expressions representing the collections to search for input
817 datasets. May be any of the types accepted by
818 `lsst.daf.butler.CollectionSearch.fromExpression`.
819 run : `str`, optional
820 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
821 output datasets, if it already exists.
822 commonDataIds : \
823 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
824 Result of a previous call to `connectDataIds`.
825 skipExistingIn
826 Expressions representing the collections to search for existing
827 output datasets that should be skipped. May be any of the types
828 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
829 `None` or empty string/sequence disables skipping.
830 clobberOutputs : `bool`, optional
831 If `True` (default), allow quanta to be created even if outputs exist;
832 this requires the same behavior to be enabled when
833 executing. If ``skipExistingIn`` is not `None`, completed quanta
834 (those with metadata, or all outputs if there is no metadata
835 dataset configured) will be skipped rather than clobbered.
836 constrainedByAllDatasets : `bool`, optional
837 Indicates if the commonDataIds were generated with a constraint on
838 all dataset types.
840 Raises
841 ------
842 OutputExistsError
843 Raised if an output dataset already exists in the output run
844 and ``skipExistingIn`` does not include the output run, or if only
845 some outputs are present and ``clobberOutputs`` is `False`.
846 """
847 skipCollections: Optional[CollectionSearch] = None
848 skipExistingInRun = False
849 if skipExistingIn:
850 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
851 if run:
852 # As an optimization, check the explicit list of names first.
853 skipExistingInRun = run in skipCollections.explicitNames()
854 if not skipExistingInRun:
855 # need to flatten it and check again
856 skipExistingInRun = run in registry.queryCollections(
857 skipExistingIn,
858 collectionTypes=CollectionType.RUN,
859 )
861 # Updating constrainedByAllDatasets here is not ideal, but we have a
862 # few different code paths that each transfer different pieces of
863 # information about what dataset query constraints were applied here,
864 # and none of them has the complete picture until we get here. We're
865 # long overdue for a QG generation rewrite that will make this go away
866 # entirely anyway.
867 constrainedByAllDatasets = (
868 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys()
869 )
871 # Look up [init] intermediate and output datasets in the output
872 # collection, if there is an output collection.
873 if run is not None or skipCollections is not None:
874 for datasetType, refs in itertools.chain(
875 self.initIntermediates.items(),
876 self.initOutputs.items(),
877 self.intermediates.items(),
878 self.outputs.items(),
879 ):
880 _LOG.debug(
881 "Resolving %d datasets for intermediate and/or output dataset %s.",
882 len(refs),
883 datasetType.name,
884 )
885 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
886 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
888 # look at RUN collection first
889 if run is not None:
890 resolvedRefQueryResults = subset.findDatasets(
891 datasetType, collections=run, findFirst=True
892 )
893 for resolvedRef in resolvedRefQueryResults:
894 # TODO: we could easily support per-DatasetType
895 # skipExisting and I could imagine that being useful -
896 # it's probably required in order to support writing
897 # initOutputs before QuantumGraph generation.
898 assert resolvedRef.dataId in refs
899 if not (skipExistingInRun or isInit or clobberOutputs):
900 raise OutputExistsError(
901 f"Output dataset {datasetType.name} already exists in "
902 f"output RUN collection '{run}' with data ID"
903 f" {resolvedRef.dataId}."
904 )
906 # Also check skipExistingIn; the case where the RUN collection
907 # is in it was handled above.
908 if skipCollections is not None:
909 resolvedRefQueryResults = subset.findDatasets(
910 datasetType, collections=skipCollections, findFirst=True
911 )
912 for resolvedRef in resolvedRefQueryResults:
913 assert resolvedRef.dataId in refs
914 refs[resolvedRef.dataId] = resolvedRef
916 # Look up input and initInput datasets in the input collection(s).
917 # Container to accumulate unfound refs, if the common data IDs were not
918 # constrained on dataset type existence.
919 self.unfoundRefs = set()
920 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
921 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
922 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets(
923 datasetType, collections=collections, findFirst=True
924 )
925 dataIdsNotFoundYet = set(refs.keys())
926 for resolvedRef in resolvedRefQueryResults:
927 dataIdsNotFoundYet.discard(resolvedRef.dataId)
928 refs[resolvedRef.dataId] = resolvedRef
929 if dataIdsNotFoundYet:
930 if constrainedByAllDatasets:
931 raise RuntimeError(
932 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
933 f"'{datasetType.name}' was/were present in a previous "
934 f"query, but could not be found now."
935 f"This is either a logic bug in QuantumGraph generation "
936 f"or the input collections have been modified since "
937 f"QuantumGraph generation began."
938 )
939 elif not datasetType.dimensions:
940 raise RuntimeError(
941 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in "
942 f"collections {collections}."
943 )
944 else:
945 # If the common data IDs were not constrained using all the
946 # input dataset types, it is possible that some data IDs
947 # found don't correspond to existing datasets and will be
948 # left unresolved. Mark these for later pruning from the
949 # quantum graph.
950 for k in dataIdsNotFoundYet:
951 self.unfoundRefs.add(refs[k])
953 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
954 # replacing the unresolved refs there, and then look up prerequisites.
955 for task in self.tasks:
956 _LOG.debug(
957 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
958 len(task.quanta),
959 task.taskDef.label,
960 )
961 # The way iterConnections is designed makes it impossible to
962 # annotate precisely enough to satisfy MyPy here.
963 lookupFunctions = {
964 c.name: c.lookupFunction # type: ignore
965 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
966 if c.lookupFunction is not None # type: ignore
967 }
968 dataIdsFailed = []
969 dataIdsSucceeded = []
970 for quantum in task.quanta.values():
971 # Process output datasets only if skipExistingIn is not None
972 # or there is a run to look for outputs in and clobberOutputs
973 # is True. Note that if skipExistingIn is None, any output
974 # datasets that already exist would have already caused an
975 # exception to be raised. We never update the DatasetRefs in
976 # the quantum because those should never be resolved.
977 if skipCollections is not None or (run is not None and clobberOutputs):
978 resolvedRefs = []
979 unresolvedRefs = []
980 haveMetadata = False
981 for datasetType, originalRefs in quantum.outputs.items():
982 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
983 if ref.id is not None:
984 resolvedRefs.append(ref)
985 if datasetType.name == task.taskDef.metadataDatasetName:
986 haveMetadata = True
987 else:
988 unresolvedRefs.append(ref)
989 if resolvedRefs:
990 if haveMetadata or not unresolvedRefs:
991 dataIdsSucceeded.append(quantum.dataId)
992 if skipCollections is not None:
993 continue
994 else:
995 dataIdsFailed.append(quantum.dataId)
996 if not clobberOutputs:
997 raise OutputExistsError(
998 f"Quantum {quantum.dataId} of task with label "
999 f"'{quantum.task.taskDef.label}' has some outputs that exist "
1000 f"({resolvedRefs}) "
1001 f"and others that don't ({unresolvedRefs}), with no metadata output, "
1002 "and clobbering outputs was not enabled."
1003 )
1004 # Update the input DatasetRefs to the resolved ones we already
1005 # searched for.
1006 for datasetType, input_refs in quantum.inputs.items():
1007 for ref in task.inputs.extract(datasetType, input_refs.keys()):
1008 input_refs[ref.dataId] = ref
1009 # Look up prerequisite datasets in the input collection(s).
1010 # These may have dimensions that extend beyond those we queried
1011 # for originally, because we want to permit those data ID
1012 # values to differ across quanta and dataset types.
1013 for datasetType in task.prerequisites:
1014 lookupFunction = lookupFunctions.get(datasetType.name)
1015 if lookupFunction is not None:
1016 # PipelineTask has provided its own function to do the
1017 # lookup. This always takes precedence.
1018 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
1019 elif (
1020 datasetType.isCalibration()
1021 and datasetType.dimensions <= quantum.dataId.graph
1022 and quantum.dataId.graph.temporal
1023 ):
1024 # This is a master calibration lookup, which we have to
1025 # handle specially because the query system can't do a
1026 # temporal join on a non-dimension-based timespan yet.
1027 timespan = quantum.dataId.timespan
1028 try:
1029 prereq_refs = [
1030 registry.findDataset(
1031 datasetType, quantum.dataId, collections=collections, timespan=timespan
1032 )
1033 ]
1034 except KeyError:
1035 # This dataset type is not present in the registry,
1036 # which just means there are no datasets here.
1037 prereq_refs = []
1038 else:
1039 # Most general case.
1040 prereq_refs = list(
1041 registry.queryDatasets(
1042 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True
1043 ).expanded()
1044 )
1045 quantum.prerequisites[datasetType].update(
1046 {ref.dataId: ref for ref in prereq_refs if ref is not None}
1047 )
1048 # Actually remove any quanta that we decided to skip above.
1049 if dataIdsSucceeded:
1050 if skipCollections is not None:
1051 _LOG.debug(
1052 "Pruning successful %d quanta for task with label '%s' because all of their "
1053 "outputs exist or metadata was written successfully.",
1054 len(dataIdsSucceeded),
1055 task.taskDef.label,
1056 )
1057 for dataId in dataIdsSucceeded:
1058 del task.quanta[dataId]
1059 elif clobberOutputs:
1060 _LOG.info(
1061 "Found %d successful quanta for task with label '%s' "
1062 "that will need to be clobbered during execution.",
1063 len(dataIdsSucceeded),
1064 task.taskDef.label,
1065 )
1066 else:
1067 raise AssertionError("OutputExistsError should have already been raised.")
1068 if dataIdsFailed:
1069 if clobberOutputs:
1070 _LOG.info(
1071 "Found %d failed/incomplete quanta for task with label '%s' "
1072 "that will need to be clobbered during execution.",
1073 len(dataIdsFailed),
1074 task.taskDef.label,
1075 )
1076 else:
1077 raise AssertionError("OutputExistsError should have already been raised.")
1079 def makeQuantumGraph(
1080 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None
1081 ) -> QuantumGraph:
1082 """Create a `QuantumGraph` from the quanta already present in
1083 the scaffolding data structure.
1085 Parameters
1086 ----------
1087 metadata : Optional Mapping of `str` to primitives
1088 This is an optional parameter of extra data to carry with the
1089 graph. Entries in this mapping should be serializable to JSON.
1091 datastore : `Datastore`, optional
1092 If not `None` then fill datastore records in each generated
1093 Quantum.
1095 Returns
1096 -------
1097 graph : `QuantumGraph`
1098 The full `QuantumGraph`.
1099 """
1101 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1102 """Extract all DatasetRefs from the dictionaries"""
1103 for ref_dict in dataset_dict.values():
1104 yield from ref_dict.values()
1106 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1107 if datastore is not None:
1108 datastore_records = datastore.export_records(
1109 itertools.chain(
1110 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1111 )
1112 )
1114 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1115 for task in self.tasks:
1116 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1117 graphInput[task.taskDef] = qset
1119 graph = QuantumGraph(
1120 graphInput, metadata=metadata, pruneRefs=self.unfoundRefs, universe=self.dimensions.universe
1121 )
1122 return graph
1125# ------------------------
1126# Exported definitions --
1127# ------------------------
1130class GraphBuilderError(Exception):
1131 """Base class for exceptions generated by graph builder."""
1133 pass
1136class OutputExistsError(GraphBuilderError):
1137 """Exception generated when output datasets already exist."""
1139 pass
1142class PrerequisiteMissingError(GraphBuilderError):
1143 """Exception generated when a prerequisite dataset does not exist."""
1145 pass
1148class GraphBuilder:
1149 """GraphBuilder class is responsible for building task execution graph from
1150 a Pipeline.
1152 Parameters
1153 ----------
1154 registry : `~lsst.daf.butler.Registry`
1155 Data butler instance.
1156 skipExistingIn
1157 Expressions representing the collections to search for existing
1158 output datasets that should be skipped. May be any of the types
1159 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
1160 clobberOutputs : `bool`, optional
1161 If `True` (default), allow quanta to be created even if partial outputs
1162 exist; this requires the same behavior to be enabled when
1163 executing.
1164 datastore : `Datastore`, optional
1165 If not `None` then fill datastore records in each generated Quantum.
1166 """
1168 def __init__(
1169 self,
1170 registry: Registry,
1171 skipExistingIn: Any = None,
1172 clobberOutputs: bool = True,
1173 datastore: Optional[Datastore] = None,
1174 ):
1175 self.registry = registry
1176 self.dimensions = registry.dimensions
1177 self.skipExistingIn = skipExistingIn
1178 self.clobberOutputs = clobberOutputs
1179 self.datastore = datastore
1181 def makeGraph(
1182 self,
1183 pipeline: Union[Pipeline, Iterable[TaskDef]],
1184 collections: Any,
1185 run: Optional[str],
1186 userQuery: Optional[str],
1187 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1188 metadata: Optional[Mapping[str, Any]] = None,
1189 ) -> QuantumGraph:
1190 """Create execution graph for a pipeline.
1192 Parameters
1193 ----------
1194 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1195 Pipeline definition, task names/classes and their configs.
1196 collections
1197 Expressions representing the collections to search for input
1198 datasets. May be any of the types accepted by
1199 `lsst.daf.butler.CollectionSearch.fromExpression`.
1200 run : `str`, optional
1201 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1202 output datasets, if it already exists.
1203 userQuery : `str` or `None`
1204 String which defines user-defined selection for registry; should be
1205 empty or `None` if there are no restrictions on data selection.
1206 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1207 The query constraint variant that should be used to constrain the
1208 query based on dataset existence, defaults to
1209 `DatasetQueryConstraintVariant.ALL`.
1210 metadata : Optional Mapping of `str` to primitives
1211 This is an optional parameter of extra data to carry with the
1212 graph. Entries in this mapping should be serializable to JSON.
1215 Returns
1216 -------
1217 graph : `QuantumGraph`
The constructed execution graph.
1219 Raises
1220 ------
1221 UserExpressionError
1222 Raised when user expression cannot be parsed.
1223 OutputExistsError
1224 Raised when output datasets already exist.
1225 Exception
1226 Other exceptions types may be raised by underlying registry
1227 classes.
1228 """
1229 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1230 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1231 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1232 instrument_class: Optional[Any] = None
1233 if isinstance(pipeline, Pipeline):
1234 instrument_class_name = pipeline.getInstrument()
1235 if instrument_class_name is not None:
1236 instrument_class = doImportType(instrument_class_name)
1237 pipeline = list(pipeline.toExpandedPipeline())
1238 if instrument_class is not None:
1239 dataId = DataCoordinate.standardize(
1240 instrument=instrument_class.getName(), universe=self.registry.dimensions
1241 )
1242 else:
1243 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1244 with scaffolding.connectDataIds(
1245 self.registry, collections, userQuery, dataId, datasetQueryConstraint
1246 ) as commonDataIds:
1247 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1248 scaffolding.resolveDatasetRefs(
1249 self.registry,
1250 collections,
1251 run,
1252 commonDataIds,
1253 skipExistingIn=self.skipExistingIn,
1254 clobberOutputs=self.clobberOutputs,
1255 constrainedByAllDatasets=condition,
1256 )
1257 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)
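# --- Illustrative sketch (not part of the original module) ------------------
# A hedged, end-to-end example of driving the public `GraphBuilder` API. The
# repository path, pipeline file, collection names, run name, and user query
# below are hypothetical placeholders.
def _example_build_graph() -> QuantumGraph:
    from lsst.daf.butler import Butler

    butler = Butler("/repo/main", writeable=False)
    pipeline = Pipeline.from_uri("my_pipeline.yaml")
    builder = GraphBuilder(butler.registry, clobberOutputs=True)
    return builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],
        run="u/someone/demo_run",
        userQuery="instrument = 'HSC' AND visit = 12345",
    )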