Coverage for python/lsst/pipe/base/graphBuilder.py: 17%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Union
38from lsst.daf.butler import (
39 CollectionSearch,
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 DimensionGraph,
45 DimensionUniverse,
46 NamedKeyDict,
47 Quantum,
48 Registry,
49)
50from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
51from lsst.utils import doImportType
53from ._datasetQueryConstraints import DatasetQueryConstraintVariant
54from ._status import NoWorkFound
56# -----------------------------
57# Imports for other modules --
58# -----------------------------
59from .connections import AdjustQuantumHelper, iterConnections
60from .graph import QuantumGraph
61from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
63# ----------------------------------
64# Local non-exported definitions --
65# ----------------------------------
67_LOG = logging.getLogger(__name__)
70class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
71 """A custom dictionary that maps `DatasetType` to a nested dictionary of
72 the known `DatasetRef` instances of that type.
74 Parameters
75 ----------
76 args
77 Positional arguments are forwarded to the `dict` constructor.
78 universe : `DimensionUniverse`
79 Universe of all possible dimensions.
80 """
82 def __init__(self, *args: Any, universe: DimensionUniverse):
83 super().__init__(*args)
84 self.universe = universe
86 @classmethod
87 def fromDatasetTypes(
88 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
89 ) -> _DatasetDict:
90 """Construct a dictionary from a flat iterable of `DatasetType` keys.
92 Parameters
93 ----------
94 datasetTypes : `iterable` of `DatasetType`
95 DatasetTypes to use as keys for the dict. Values will be empty
96 dictionaries.
97 universe : `DimensionUniverse`
98 Universe of all possible dimensions.
100 Returns
101 -------
102 dictionary : `_DatasetDict`
103 A new `_DatasetDict` instance.
104 """
105 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
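# A minimal usage sketch (not part of the original module), assuming ``registry``
# is an `lsst.daf.butler.Registry` obtained elsewhere (e.g. ``Butler(repo).registry``)
# and that the "raw" dataset type shown here is purely hypothetical:
#
#     universe = registry.dimensions
#     raw = DatasetType(
#         "raw",
#         dimensions=("instrument", "detector", "exposure"),
#         storageClass="Exposure",
#         universe=universe,
#     )
#     dd = _DatasetDict.fromDatasetTypes([raw], universe=universe)
#     assert dd[raw] == {}         # each value starts as an empty {dataId: ref} dict
#     assert dd["raw"] is dd[raw]  # NamedKeyDict also accepts the string name as key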
107 @classmethod
108 def fromSubset(
109 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
110 ) -> _DatasetDict:
111 """Return a new dictionary by extracting items corresponding to the
112 given keys from one or more existing dictionaries.
114 Parameters
115 ----------
116 datasetTypes : `iterable` of `DatasetType`
117 DatasetTypes to use as keys for the dict. Values will be obtained
118 by lookups against ``first`` and ``rest``.
119 first : `_DatasetDict`
120 Another dictionary from which to extract values.
121 rest
122 Additional dictionaries from which to extract values.
124 Returns
125 -------
126 dictionary : `_DatasetDict`
127 A new dictionary instance.
128 """
129 combined = ChainMap(first, *rest)
131 # Dataset types known to match immediately can be processed
132 # without checks.
133 matches = combined.keys() & set(datasetTypes)
134 _dict = {k: combined[k] for k in matches}
136 if len(_dict) < len(datasetTypes):
137 # Work out which ones are missing.
138 missing_datasetTypes = set(datasetTypes) - _dict.keys()
140 # Get the known names for comparison.
141 combined_by_name = {k.name: k for k in combined}
143 missing = set()
144 incompatible = {}
145 for datasetType in missing_datasetTypes:
146 # The dataset type is not found. It may not be listed
147 # or it may be that it is there with the same name
148 # but different definition.
149 if datasetType.name in combined_by_name:
150 # This implies some inconsistency in definitions
151 # for connections. If there is support for storage
152 # class conversion we can let it slide.
153 # At this point we do not know
154 # where the inconsistency is, but trust that
155 # downstream code will be more explicit about input
156 # vs. output incompatibilities.
157 existing = combined_by_name[datasetType.name]
158 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
159 _LOG.warning(
160 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
161 datasetType,
162 existing,
163 )
164 _dict[datasetType] = combined[existing]
165 else:
166 incompatible[datasetType] = existing
167 else:
168 missing.add(datasetType)
170 if missing or incompatible:
171 reasons = []
172 if missing:
173 reasons.append(
174 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
175 + ", ".join(d.name for d in combined)
176 )
177 if incompatible:
178 for x, y in incompatible.items():
179 reasons.append(f"{x} incompatible with {y}")
180 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
182 return cls(_dict, universe=first.universe)
184 @property
185 def dimensions(self) -> DimensionGraph:
186 """The union of all dimensions used by all dataset types in this
187 dictionary, including implied dependencies (`DimensionGraph`).
188 """
189 base = self.universe.empty
190 if len(self) == 0:
191 return base
192 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
194 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
195 """Unpack nested single-element `DatasetRef` dicts into a new
196 mapping with `DatasetType` keys and `DatasetRef` values.
198 This method assumes that each nested dict contains exactly one item, as
199 is the case for all "init" datasets.
201 Returns
202 -------
203 dictionary : `NamedKeyDict`
204 Dictionary mapping `DatasetType` to `DatasetRef`, with both
205 `DatasetType` instances and string names usable as keys.
206 """
208 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
209 (ref,) = refs.values()
210 return ref
212 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
214 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
215 """Unpack nested multi-element `DatasetRef` dicts into a new
216 mapping with `DatasetType` keys and `set` of `DatasetRef` values.
218 Returns
219 -------
220 dictionary : `NamedKeyDict`
221 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
222 both `DatasetType` instances and string names usable as keys.
223 """
224 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
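# Illustrative sketch (not part of the original module): for "init" datasets every
# nested dict holds exactly one ref keyed by the empty data ID, so the two unpack
# methods flatten the structure in slightly different ways. ``configType`` is a
# hypothetical init-output `DatasetType` and ``universe`` a `DimensionUniverse`:
#
#     emptyDataId = DataCoordinate.makeEmpty(universe)
#     dd = _DatasetDict.fromDatasetTypes([configType], universe=universe)
#     dd[configType][emptyDataId] = DatasetRef(configType, emptyDataId)
#     dd.unpackSingleRefs()  # NamedKeyDict mapping configType -> DatasetRef
#     dd.unpackMultiRefs()   # NamedKeyDict mapping configType -> [DatasetRef]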
226 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
227 """Iterate over the contained `DatasetRef` instances that match the
228 given `DatasetType` and data IDs.
230 Parameters
231 ----------
232 datasetType : `DatasetType`
233 Dataset type to match.
234 dataIds : `Iterable` [ `DataCoordinate` ]
235 Data IDs to match.
237 Returns
238 -------
239 refs : `Iterator` [ `DatasetRef` ]
240 DatasetRef instances for which ``ref.datasetType == datasetType``
241 and ``ref.dataId`` is in ``dataIds``.
242 """
243 refs = self[datasetType]
244 return (refs[dataId] for dataId in dataIds)
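# Illustrative sketch (not part of the original module): ``extract`` is a lazy
# generator over the nested dict, so a data ID that was never added only raises
# KeyError once the iterator is consumed. Continuing the sketch above
# (``missingDataId`` is a hypothetical data ID absent from ``dd``):
#
#     refs = list(dd.extract(configType, [emptyDataId]))  # -> [DatasetRef]
#     gen = dd.extract(configType, [missingDataId])       # no error yet
#     list(gen)                                           # KeyError raised here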
247class _QuantumScaffolding:
248 """Helper class aggregating information about a `Quantum`, used when
249 constructing a `QuantumGraph`.
251 See `_PipelineScaffolding` for a top-down description of the full
252 scaffolding data structure.
254 Parameters
255 ----------
256 task : _TaskScaffolding
257 Back-reference to the helper object for the `PipelineTask` this quantum
258 represents an execution of.
259 dataId : `DataCoordinate`
260 Data ID for this quantum.
261 """
263 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
264 self.task = task
265 self.dataId = dataId
266 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
267 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
268 self.prerequisites = _DatasetDict.fromDatasetTypes(
269 task.prerequisites.keys(), universe=dataId.universe
270 )
272 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
274 def __repr__(self) -> str:
275 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
277 task: _TaskScaffolding
278 """Back-reference to the helper object for the `PipelineTask` this quantum
279 represents an execution of.
280 """
282 dataId: DataCoordinate
283 """Data ID for this quantum.
284 """
286 inputs: _DatasetDict
287 """Nested dictionary containing `DatasetRef` inputs to this quantum.
289 This is initialized to map each `DatasetType` to an empty dictionary at
290 construction. Those nested dictionaries are populated (with data IDs as
291 keys) with unresolved `DatasetRef` instances in
292 `_PipelineScaffolding.connectDataIds`.
293 """
295 outputs: _DatasetDict
296 """Nested dictionary containing `DatasetRef` outputs this quantum.
297 """
299 prerequisites: _DatasetDict
300 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
301 quantum.
302 """
304 def makeQuantum(self) -> Quantum:
305 """Transform the scaffolding object into a true `Quantum` instance.
307 Returns
308 -------
309 quantum : `Quantum`
310 An actual `Quantum` instance.
311 """
312 allInputs = self.inputs.unpackMultiRefs()
313 allInputs.update(self.prerequisites.unpackMultiRefs())
314 # Give the task's Connections class an opportunity to remove some
315 # inputs, or complain if they are unacceptable.
316 # This will raise if one of the check conditions is not met, which is
317 # the intended behavior.
318 # If it raises NoWorkFound, there is a bug in the QG algorithm
319 # or the adjustQuantum is incorrectly trying to make a prerequisite
320 # input behave like a regular input; adjustQuantum should only raise
321 # NoWorkFound if a regular input is missing, and it shouldn't be
322 # possible for us to have generated ``self`` if that's true.
323 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
324 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
325 return Quantum(
326 taskName=self.task.taskDef.taskName,
327 taskClass=self.task.taskDef.taskClass,
328 dataId=self.dataId,
329 initInputs=self.task.initInputs.unpackSingleRefs(),
330 inputs=helper.inputs,
331 outputs=helper.outputs,
332 )
335@dataclass
336class _TaskScaffolding:
337 """Helper class aggregating information about a `PipelineTask`, used when
338 constructing a `QuantumGraph`.
340 See `_PipelineScaffolding` for a top-down description of the full
341 scaffolding data structure.
343 Parameters
344 ----------
345 taskDef : `TaskDef`
346 Data structure that identifies the task class and its config.
347 parent : `_PipelineScaffolding`
348 The parent data structure that will hold the instance being
349 constructed.
350 datasetTypes : `TaskDatasetTypes`
351 Data structure that categorizes the dataset types used by this task.
352 """
354 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
355 universe = parent.dimensions.universe
356 self.taskDef = taskDef
357 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
358 assert self.dimensions.issubset(parent.dimensions)
359 # Initialize _DatasetDicts as subsets of the one or two
360 # corresponding dicts in the parent _PipelineScaffolding.
361 self.initInputs = _DatasetDict.fromSubset(
362 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
363 )
364 self.initOutputs = _DatasetDict.fromSubset(
365 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
366 )
367 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
368 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
369 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
370 self.dataIds: Set[DataCoordinate] = set()
371 self.quanta = {}
373 def __repr__(self) -> str:
374 # Default dataclass-injected __repr__ gets caught in an infinite loop
375 # because of back-references.
376 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
378 taskDef: TaskDef
379 """Data structure that identifies the task class and its config
380 (`TaskDef`).
381 """
383 dimensions: DimensionGraph
384 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
385 """
387 initInputs: _DatasetDict
388 """Dictionary containing information about datasets used to construct this
389 task (`_DatasetDict`).
390 """
392 initOutputs: _DatasetDict
393 """Dictionary containing information about datasets produced as a
394 side-effect of constructing this task (`_DatasetDict`).
395 """
397 inputs: _DatasetDict
398 """Dictionary containing information about datasets used as regular,
399 graph-constraining inputs to this task (`_DatasetDict`).
400 """
402 outputs: _DatasetDict
403 """Dictionary containing information about datasets produced by this task
404 (`_DatasetDict`).
405 """
407 prerequisites: _DatasetDict
408 """Dictionary containing information about input datasets that must be
409 present in the repository before any Pipeline containing this task is run
410 (`_DatasetDict`).
411 """
413 quanta: Dict[DataCoordinate, _QuantumScaffolding]
414 """Dictionary mapping data ID to a scaffolding object for the Quantum of
415 this task with that data ID.
416 """
418 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]:
419 """Create a `set` of `Quantum` from the information in ``self``.
421 Returns
422 -------
423 nodes : `set` of `Quantum`
424 The `Quantum` elements corresponding to this task.
425 """
426 if unresolvedRefs is None:
427 unresolvedRefs = set()
428 outputs = set()
429 for q in self.quanta.values():
430 try:
431 tmpQuantum = q.makeQuantum()
432 outputs.add(tmpQuantum)
433 except (NoWorkFound, FileNotFoundError) as exc:
434 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
435 if unresolvedRefs.intersection(refs):
436 # This means the node is known to be pruned later and
437 # should be left in even though some follow-up queries
438 # failed. This allows the pruning to start from this
439 # quantum with known issues, and to prune the other
440 # nodes it touches.
441 inputs = q.inputs.unpackMultiRefs()
442 inputs.update(q.prerequisites.unpackMultiRefs())
443 tmpQuantum = Quantum(
444 taskName=q.task.taskDef.taskName,
445 taskClass=q.task.taskDef.taskClass,
446 dataId=q.dataId,
447 initInputs=q.task.initInputs.unpackSingleRefs(),
448 inputs=inputs,
449 outputs=q.outputs.unpackMultiRefs(),
450 )
451 outputs.add(tmpQuantum)
452 else:
453 raise exc
454 return outputs
457@dataclass
458class _PipelineScaffolding:
459 """A helper data structure that organizes the information involved in
460 constructing a `QuantumGraph` for a `Pipeline`.
462 Parameters
463 ----------
464 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
465 Sequence of tasks from which a graph is to be constructed. Must
466 have nested task classes already imported.
467 universe : `DimensionUniverse`
468 Universe of all possible dimensions.
470 Notes
471 -----
472 The scaffolding data structure contains nested data structures for both
473 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
474 data structures are shared between the pipeline-level structure (which
475 aggregates all datasets and categorizes them from the perspective of the
476 complete pipeline) and the individual tasks that use them as inputs and
477 outputs.
479 `QuantumGraph` construction proceeds in four steps, with each corresponding
480 to a different `_PipelineScaffolding` method:
482 1. When `_PipelineScaffolding` is constructed, we extract and categorize
483 the DatasetTypes used by the pipeline (delegating to
484 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
485 nested `_TaskScaffolding` and `_DatasetDict` objects.
487 2. In `connectDataIds`, we construct and run the "Big Join Query", which
488 returns related tuples of all dimensions used to identify any regular
489 input, output, and intermediate datasets (not prerequisites). We then
490 iterate over these tuples of related dimensions, identifying the subsets
491 that correspond to distinct data IDs for each task and dataset type,
492 and then create `_QuantumScaffolding` objects.
494 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
495 dataset data IDs previously identified, transforming unresolved
496 DatasetRefs into resolved DatasetRefs where appropriate. We then look
497 up prerequisite datasets for all quanta.
499 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
500 per-task `_QuantumScaffolding` objects.
501 """
503 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
504 _LOG.debug("Initializing data structures for QuantumGraph generation.")
505 self.tasks = []
506 # Aggregate and categorize the DatasetTypes in the Pipeline.
507 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
508 # Construct dictionaries that map those DatasetTypes to structures
509 # that will (later) hold additional information about them.
510 for attr in (
511 "initInputs",
512 "initIntermediates",
513 "initOutputs",
514 "inputs",
515 "intermediates",
516 "outputs",
517 "prerequisites",
518 ):
519 setattr(
520 self,
521 attr,
522 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
523 )
524 # Aggregate all dimensions for all non-init, non-prerequisite
525 # DatasetTypes. These are the ones we'll include in the big join
526 # query.
527 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
528 # Construct scaffolding nodes for each Task, sharing the dataset
529 # dictionaries with the parent _PipelineScaffolding.
530 # Note that there's only one scaffolding node for each DatasetType,
531 # shared by _PipelineScaffolding and all _TaskScaffoldings that
532 # reference it.
533 if isinstance(pipeline, Pipeline):
534 pipeline = pipeline.toExpandedPipeline()
535 self.tasks = [
536 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
537 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
538 ]
540 def __repr__(self) -> str:
541 # Default dataclass-injected __repr__ gets caught in an infinite loop
542 # because of back-references.
543 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
545 tasks: List[_TaskScaffolding]
546 """Scaffolding data structures for each task in the pipeline
547 (`list` of `_TaskScaffolding`).
548 """
550 initInputs: _DatasetDict
551 """Datasets consumed but not produced when constructing the tasks in this
552 pipeline (`_DatasetDict`).
553 """
555 initIntermediates: _DatasetDict
556 """Datasets that are both consumed and produced when constructing the tasks
557 in this pipeline (`_DatasetDict`).
558 """
560 initOutputs: _DatasetDict
561 """Datasets produced but not consumed when constructing the tasks in this
562 pipeline (`_DatasetDict`).
563 """
565 inputs: _DatasetDict
566 """Datasets that are consumed but not produced when running this pipeline
567 (`_DatasetDict`).
568 """
570 intermediates: _DatasetDict
571 """Datasets that are both produced and consumed when running this pipeline
572 (`_DatasetDict`).
573 """
575 outputs: _DatasetDict
576 """Datasets produced but not consumed when when running this pipeline
577 (`_DatasetDict`).
578 """
580 prerequisites: _DatasetDict
581 """Datasets that are consumed when running this pipeline and looked up
582 per-Quantum when generating the graph (`_DatasetDict`).
583 """
585 dimensions: DimensionGraph
586 """All dimensions used by any regular input, intermediate, or output
587 (not prerequisite) dataset; the set of dimensions used in the "Big Join
588 Query" (`DimensionGraph`).
590 This is required to be a superset of all task quantum dimensions.
591 """
593 @contextmanager
594 def connectDataIds(
595 self,
596 registry: Registry,
597 collections: Any,
598 userQuery: Optional[str],
599 externalDataId: DataCoordinate,
600 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
601 ) -> Iterator[DataCoordinateQueryResults]:
602 """Query for the data IDs that connect nodes in the `QuantumGraph`.
604 This method populates `_TaskScaffolding.dataIds` and the data ID keys
605 of each `_DatasetDict` (except for those in `prerequisites`).
607 Parameters
608 ----------
609 registry : `lsst.daf.butler.Registry`
610 Registry for the data repository; used for all data ID queries.
611 collections
612 Expressions representing the collections to search for input
613 datasets. May be any of the types accepted by
614 `lsst.daf.butler.CollectionSearch.fromExpression`.
615 userQuery : `str` or `None`
616 User-provided expression to limit the data IDs processed.
617 externalDataId : `DataCoordinate`
618 Externally-provided data ID that should be used to restrict the
619 results, just as if these constraints had been included via ``AND``
620 in ``userQuery``. This includes (at least) any instrument named
621 in the pipeline definition.
622 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
623 The query constraint variant that should be used to constrain the
624 query based on dataset existence; defaults to
625 `DatasetQueryConstraintVariant.ALL`.
627 Returns
628 -------
629 commonDataIds : \
630 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
631 An interface to a database temporary table containing all data IDs
632 that will appear in this `QuantumGraph`. Returned inside a
633 context manager, which will drop the temporary table at the end of
634 the `with` block in which this method is called.
635 """
636 _LOG.debug("Building query for data IDs.")
637 # Initialization datasets always have empty data IDs.
638 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
639 for datasetType, refs in itertools.chain(
640 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
641 ):
642 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
643 # Run one big query for the data IDs for task dimensions and regular
644 # inputs and outputs. We limit the query to only dimensions that are
645 # associated with the input dataset types, but don't (yet) try to
646 # obtain the dataset_ids for those inputs.
647 _LOG.debug("Submitting data ID query and materializing results.")
648 queryArgs: Dict[str, Any] = {
649 "dimensions": self.dimensions,
650 "where": userQuery,
651 "dataId": externalDataId,
652 }
653 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
654 _LOG.debug("Constraining graph query using all datasets in pipeline.")
655 queryArgs["datasets"] = list(self.inputs)
656 queryArgs["collections"] = collections
657 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
658 _LOG.debug("Not using dataset existence to constrain query.")
659 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
660 constraint = set(datasetQueryConstraint)
661 inputs = {k.name: k for k in self.inputs.keys()}
662 if remainder := constraint.difference(inputs.keys()):
663 raise ValueError(
664 f"{remainder} dataset type(s) specified as a graph constraint, but"
665 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
666 )
667 _LOG.debug("Constraining graph query using %s", constraint)
668 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
669 queryArgs["collections"] = collections
670 else:
671 raise ValueError(
672 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
673 )
675 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
676 _LOG.debug("Expanding data IDs.")
677 commonDataIds = commonDataIds.expanded()
678 _LOG.debug("Iterating over query results to associate quanta with datasets.")
679 # Iterate over query results, populating data IDs for datasets and
680 # quanta and then connecting them to each other.
681 n = -1
682 for n, commonDataId in enumerate(commonDataIds):
683 # Create DatasetRefs for all DatasetTypes from this result row,
684 # noting that we might have created some already.
685 # We remember both those that already existed and those that we
686 # create now.
687 refsForRow = {}
688 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
689 for datasetType, refs in itertools.chain(
690 self.inputs.items(), self.intermediates.items(), self.outputs.items()
691 ):
692 datasetDataId: Optional[DataCoordinate]
693 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
694 datasetDataId = commonDataId.subset(datasetType.dimensions)
695 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
696 ref = refs.get(datasetDataId)
697 if ref is None:
698 ref = DatasetRef(datasetType, datasetDataId)
699 refs[datasetDataId] = ref
700 refsForRow[datasetType.name] = ref
701 # Create _QuantumScaffolding objects for all tasks from this
702 # result row, noting that we might have created some already.
703 for task in self.tasks:
704 quantumDataId = commonDataId.subset(task.dimensions)
705 quantum = task.quanta.get(quantumDataId)
706 if quantum is None:
707 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
708 task.quanta[quantumDataId] = quantum
709 # Whether this is a new quantum or an existing one, we can
710 # now associate the DatasetRefs for this row with it. The
711 # fact that a Quantum data ID and a dataset data ID both
712 # came from the same result row is what tells us they
713 # should be associated.
714 # Many of these associations will be duplicates (because
715 # another query row that differed from this one only in
716 # irrelevant dimensions already added them), and we use
717 # sets to skip.
718 for datasetType in task.inputs:
719 ref = refsForRow[datasetType.name]
720 quantum.inputs[datasetType.name][ref.dataId] = ref
721 for datasetType in task.outputs:
722 ref = refsForRow[datasetType.name]
723 quantum.outputs[datasetType.name][ref.dataId] = ref
724 if n < 0:
725 emptiness_explained = False
726 for message in commonDataIds.explain_no_results():
727 _LOG.warning(message)
728 emptiness_explained = True
729 if not emptiness_explained:
730 _LOG.warning(
731 "To reproduce this query for debugging purposes, run "
732 "Registry.queryDataIds with these arguments:"
733 )
734 # We could just repr() the queryArgs dict to get something
735 # the user could make sense of, but it's friendlier to
736 # put these args in an easier-to-construct equivalent form
737 # so they can read it more easily and copy and paste into
738 # a Python terminal.
739 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names))
740 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName())
741 if queryArgs["where"]:
742 _LOG.warning(" where=%s,", repr(queryArgs["where"]))
743 if "datasets" in queryArgs:
744 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
745 if "collections" in queryArgs:
746 _LOG.warning(" collections=%s,", list(queryArgs["collections"]))
747 _LOG.debug("Finished processing %d rows from data ID query.", n)
748 yield commonDataIds
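# Illustrative sketch (not part of the original module) of the equivalent
# hand-written query that the warning block above suggests running when the
# result set comes back empty; every value shown (dimension names, data ID,
# expression, dataset type, collection) is a placeholder:
#
#     results = registry.queryDataIds(
#         ["instrument", "visit", "detector"],
#         dataId={"instrument": "HSC"},
#         where="visit = 12345",
#         datasets=["raw"],
#         collections=["HSC/raw/all"],
#     )
#     for message in results.explain_no_results():
#         print(message)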
750 def resolveDatasetRefs(
751 self,
752 registry: Registry,
753 collections: Any,
754 run: Optional[str],
755 commonDataIds: DataCoordinateQueryResults,
756 *,
757 skipExistingIn: Any = None,
758 clobberOutputs: bool = True,
759 constrainedByAllDatasets: bool = True,
760 ) -> None:
761 """Perform follow up queries for each dataset data ID produced in
762 `fillDataIds`.
764 This method populates `_DatasetScaffolding.refs` (except for those in
765 `prerequisites`).
767 Parameters
768 ----------
769 registry : `lsst.daf.butler.Registry`
770 Registry for the data repository; used for all data ID queries.
771 collections
772 Expressions representing the collections to search for input
773 datasets. May be any of the types accepted by
774 `lsst.daf.butler.CollectionSearch.fromExpression`.
775 run : `str`, optional
776 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
777 output datasets, if it already exists.
778 commonDataIds : \
779 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
780 Result of a previous call to `connectDataIds`.
781 skipExistingIn
782 Expressions representing the collections to search for existing
783 output datasets that should be skipped. May be any of the types
784 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
785 `None` or empty string/sequence disables skipping.
786 clobberOutputs : `bool`, optional
787 If `True` (default), allow quanta to be created even if outputs exist;
788 this requires the same behavior to be enabled when
789 executing. If ``skipExistingIn`` is not `None`, completed quanta
790 (those with metadata, or all outputs if there is no metadata
791 dataset configured) will be skipped rather than clobbered.
792 constrainedByAllDatasets : `bool`, optional
793 Indicates whether ``commonDataIds`` was generated with a constraint on
794 all dataset types.
796 Raises
797 ------
798 OutputExistsError
799 Raised if an output dataset already exists in the output run
800 and ``skipExistingIn`` does not include the output run, or if only
801 some outputs are present and ``clobberOutputs`` is `False`.
802 """
803 skipCollections: Optional[CollectionSearch] = None
804 skipExistingInRun = False
805 if skipExistingIn:
806 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
807 if run:
808 # As an optimization, check the explicit list of names first.
809 skipExistingInRun = run in skipCollections.explicitNames()
810 if not skipExistingInRun:
811 # need to flatten it and check again
812 skipExistingInRun = run in registry.queryCollections(
813 skipExistingIn,
814 collectionTypes=CollectionType.RUN,
815 )
817 # Look up [init] intermediate and output datasets in the output
818 # collection, if there is an output collection.
819 if run is not None or skipCollections is not None:
820 for datasetType, refs in itertools.chain(
821 self.initIntermediates.items(),
822 self.initOutputs.items(),
823 self.intermediates.items(),
824 self.outputs.items(),
825 ):
826 _LOG.debug(
827 "Resolving %d datasets for intermediate and/or output dataset %s.",
828 len(refs),
829 datasetType.name,
830 )
831 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
832 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
834 # look at RUN collection first
835 if run is not None:
836 resolvedRefQueryResults = subset.findDatasets(
837 datasetType, collections=run, findFirst=True
838 )
839 for resolvedRef in resolvedRefQueryResults:
840 # TODO: we could easily support per-DatasetType
841 # skipExisting and I could imagine that being useful -
842 # it's probably required in order to support writing
843 # initOutputs before QuantumGraph generation.
844 assert resolvedRef.dataId in refs
845 if not (skipExistingInRun or isInit or clobberOutputs):
846 raise OutputExistsError(
847 f"Output dataset {datasetType.name} already exists in "
848 f"output RUN collection '{run}' with data ID"
849 f" {resolvedRef.dataId}."
850 )
852 # Also check skipExistingIn; the case where the RUN collection is
853 # part of it is handled above.
854 if skipCollections is not None:
855 resolvedRefQueryResults = subset.findDatasets(
856 datasetType, collections=skipCollections, findFirst=True
857 )
858 for resolvedRef in resolvedRefQueryResults:
859 assert resolvedRef.dataId in refs
860 refs[resolvedRef.dataId] = resolvedRef
862 # Look up input and initInput datasets in the input collection(s).
863 # Container to accumulate unfound refs, if the common data IDs were not
864 # constrained on dataset type existence.
865 self.unfoundRefs = set()
866 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
867 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
868 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets(
869 datasetType, collections=collections, findFirst=True
870 )
871 dataIdsNotFoundYet = set(refs.keys())
872 for resolvedRef in resolvedRefQueryResults:
873 dataIdsNotFoundYet.discard(resolvedRef.dataId)
874 refs[resolvedRef.dataId] = resolvedRef
875 if dataIdsNotFoundYet:
876 if constrainedByAllDatasets:
877 raise RuntimeError(
878 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
879 f"'{datasetType.name}' was/were present in a previous "
880 f"query, but could not be found now."
881 f"This is either a logic bug in QuantumGraph generation "
882 f"or the input collections have been modified since "
883 f"QuantumGraph generation began."
884 )
885 else:
886 # If the common data IDs were not constrained using all the
887 # input dataset types, it is possible that some data IDs
888 # found don't correspond to existing datasets, and they
889 # will remain unresolved. Mark these for later pruning
890 # from the quantum graph.
891 for k in dataIdsNotFoundYet:
892 self.unfoundRefs.add(refs[k])
894 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
895 # replacing the unresolved refs there, and then look up prerequisites.
896 for task in self.tasks:
897 _LOG.debug(
898 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
899 len(task.quanta),
900 task.taskDef.label,
901 )
902 # The way iterConnections is designed makes it impossible to
903 # annotate precisely enough to satisfy MyPy here.
904 lookupFunctions = {
905 c.name: c.lookupFunction # type: ignore
906 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
907 if c.lookupFunction is not None # type: ignore
908 }
909 dataIdsFailed = []
910 dataIdsSucceeded = []
911 for quantum in task.quanta.values():
912 # Process output datasets only if skipExistingIn is not None
913 # or there is a run to look for outputs in and clobberOutputs
914 # is True. Note that if skipExistingIn is None, any output
915 # datasets that already exist would have already caused an
916 # exception to be raised. We never update the DatasetRefs in
917 # the quantum because those should never be resolved.
918 if skipCollections is not None or (run is not None and clobberOutputs):
919 resolvedRefs = []
920 unresolvedRefs = []
921 haveMetadata = False
922 for datasetType, originalRefs in quantum.outputs.items():
923 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
924 if ref.id is not None:
925 resolvedRefs.append(ref)
926 if datasetType.name == task.taskDef.metadataDatasetName:
927 haveMetadata = True
928 else:
929 unresolvedRefs.append(ref)
930 if resolvedRefs:
931 if haveMetadata or not unresolvedRefs:
932 dataIdsSucceeded.append(quantum.dataId)
933 if skipCollections is not None:
934 continue
935 else:
936 dataIdsFailed.append(quantum.dataId)
937 if not clobberOutputs:
938 raise OutputExistsError(
939 f"Quantum {quantum.dataId} of task with label "
940 f"'{quantum.task.taskDef.label}' has some outputs that exist "
941 f"({resolvedRefs}) "
942 f"and others that don't ({unresolvedRefs}), with no metadata output, "
943 "and clobbering outputs was not enabled."
944 )
945 # Update the input DatasetRefs to the resolved ones we already
946 # searched for.
947 for datasetType, input_refs in quantum.inputs.items():
948 for ref in task.inputs.extract(datasetType, input_refs.keys()):
949 input_refs[ref.dataId] = ref
950 # Look up prerequisite datasets in the input collection(s).
951 # These may have dimensions that extend beyond those we queried
952 # for originally, because we want to permit those data ID
953 # values to differ across quanta and dataset types.
954 for datasetType in task.prerequisites:
955 lookupFunction = lookupFunctions.get(datasetType.name)
956 if lookupFunction is not None:
957 # PipelineTask has provided its own function to do the
958 # lookup. This always takes precedence.
959 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
960 elif (
961 datasetType.isCalibration()
962 and datasetType.dimensions <= quantum.dataId.graph
963 and quantum.dataId.graph.temporal
964 ):
965 # This is a master calibration lookup, which we have to
966 # handle specially because the query system can't do a
967 # temporal join on a non-dimension-based timespan yet.
968 timespan = quantum.dataId.timespan
969 try:
970 prereq_refs = [
971 registry.findDataset(
972 datasetType, quantum.dataId, collections=collections, timespan=timespan
973 )
974 ]
975 except KeyError:
976 # This dataset type is not present in the registry,
977 # which just means there are no datasets here.
978 prereq_refs = []
979 else:
980 # Most general case.
981 prereq_refs = list(
982 registry.queryDatasets(
983 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True
984 ).expanded()
985 )
986 quantum.prerequisites[datasetType].update(
987 {ref.dataId: ref for ref in prereq_refs if ref is not None}
988 )
989 # Actually remove any quanta that we decided to skip above.
990 if dataIdsSucceeded:
991 if skipCollections is not None:
992 _LOG.debug(
993 "Pruning successful %d quanta for task with label '%s' because all of their "
994 "outputs exist or metadata was written successfully.",
995 len(dataIdsSucceeded),
996 task.taskDef.label,
997 )
998 for dataId in dataIdsSucceeded:
999 del task.quanta[dataId]
1000 elif clobberOutputs:
1001 _LOG.info(
1002 "Found %d successful quanta for task with label '%s' "
1003 "that will need to be clobbered during execution.",
1004 len(dataIdsSucceeded),
1005 task.taskDef.label,
1006 )
1007 else:
1008 raise AssertionError("OutputExistsError should have already been raised.")
1009 if dataIdsFailed:
1010 if clobberOutputs:
1011 _LOG.info(
1012 "Found %d failed/incomplete quanta for task with label '%s' "
1013 "that will need to be clobbered during execution.",
1014 len(dataIdsFailed),
1015 task.taskDef.label,
1016 )
1017 else:
1018 raise AssertionError("OutputExistsError should have already been raised.")
1020 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None) -> QuantumGraph:
1021 """Create a `QuantumGraph` from the quanta already present in
1022 the scaffolding data structure.
1024 Parameters
1025 ----------
1026 metadata : `Mapping` of `str` to primitives, optional
1027 This is an optional parameter of extra data to carry with the
1028 graph. Entries in this mapping should be serializable to
1029 JSON.
1031 Returns
1032 -------
1033 graph : `QuantumGraph`
1034 The full `QuantumGraph`.
1035 """
1036 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1037 for task in self.tasks:
1038 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs)
1039 graphInput[task.taskDef] = qset
1041 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs)
1042 return graph
1045# ------------------------
1046# Exported definitions --
1047# ------------------------
1050class GraphBuilderError(Exception):
1051 """Base class for exceptions generated by graph builder."""
1053 pass
1056class OutputExistsError(GraphBuilderError):
1057 """Exception generated when output datasets already exist."""
1059 pass
1062class PrerequisiteMissingError(GraphBuilderError):
1063 """Exception generated when a prerequisite dataset does not exist."""
1065 pass
1068class GraphBuilder(object):
1069 """GraphBuilder class is responsible for building task execution graph from
1070 a Pipeline.
1072 Parameters
1073 ----------
1074 registry : `~lsst.daf.butler.Registry`
1075 Data butler registry instance.
1076 skipExistingIn
1077 Expressions representing the collections to search for existing
1078 output datasets that should be skipped. May be any of the types
1079 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
1080 clobberOutputs : `bool`, optional
1081 If `True` (default), allow quanta to be created even if partial outputs
1082 exist; this requires the same behavior to be enabled when
1083 executing.
1084 """
1086 def __init__(self, registry: Registry, skipExistingIn: Any = None, clobberOutputs: bool = True):
1087 self.registry = registry
1088 self.dimensions = registry.dimensions
1089 self.skipExistingIn = skipExistingIn
1090 self.clobberOutputs = clobberOutputs
1092 def makeGraph(
1093 self,
1094 pipeline: Union[Pipeline, Iterable[TaskDef]],
1095 collections: Any,
1096 run: Optional[str],
1097 userQuery: Optional[str],
1098 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1099 metadata: Optional[Mapping[str, Any]] = None,
1100 ) -> QuantumGraph:
1101 """Create execution graph for a pipeline.
1103 Parameters
1104 ----------
1105 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1106 Pipeline definition, task names/classes and their configs.
1107 collections
1108 Expressions representing the collections to search for input
1109 datasets. May be any of the types accepted by
1110 `lsst.daf.butler.CollectionSearch.fromExpression`.
1111 run : `str`, optional
1112 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1113 output datasets, if it already exists.
1114 userQuery : `str` or `None`
1115 String which defines a user selection for the registry; should be
1116 empty or `None` if there are no restrictions on data selection.
1117 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1118 The query constraint variant that should be used to constrain the
1119 query based on dataset existence; defaults to
1120 `DatasetQueryConstraintVariant.ALL`.
1121 metadata : `Mapping` of `str` to primitives, optional
1122 This is an optional parameter of extra data to carry with the
1123 graph. Entries in this mapping should be serializable to
1124 JSON.
1126 Returns
1127 -------
1128 graph : `QuantumGraph`
1130 Raises
1131 ------
1132 UserExpressionError
1133 Raised when user expression cannot be parsed.
1134 OutputExistsError
1135 Raised when output datasets already exist.
1136 Exception
1137 Other exception types may be raised by underlying registry
1138 classes.
1139 """
1140 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1141 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1142 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1143 instrument_class: Optional[Any] = None
1144 if isinstance(pipeline, Pipeline):
1145 instrument_class_name = pipeline.getInstrument()
1146 if instrument_class_name is not None:
1147 instrument_class = doImportType(instrument_class_name)
1148 pipeline = list(pipeline.toExpandedPipeline())
1149 if instrument_class is not None:
1150 dataId = DataCoordinate.standardize(
1151 instrument=instrument_class.getName(), universe=self.registry.dimensions
1152 )
1153 else:
1154 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1155 with scaffolding.connectDataIds(
1156 self.registry, collections, userQuery, dataId, datasetQueryConstraint
1157 ) as commonDataIds:
1158 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1159 scaffolding.resolveDatasetRefs(
1160 self.registry,
1161 collections,
1162 run,
1163 commonDataIds,
1164 skipExistingIn=self.skipExistingIn,
1165 clobberOutputs=self.clobberOutputs,
1166 constrainedByAllDatasets=condition,
1167 )
1168 return scaffolding.makeQuantumGraph(metadata=metadata)
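# Illustrative sketch (not part of the original module) of end-to-end use of
# `GraphBuilder`; the repository path, pipeline file, collection names, run name,
# and query string are placeholders, and the `Pipeline.fromFile` / `saveUri`
# conveniences are assumed to be available in this version of the stack:
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import Pipeline
#
#     butler = Butler("/repo/main", writeable=False)
#     pipeline = Pipeline.fromFile("pipelines/DRP.yaml")
#     builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/test-run",
#         userQuery="instrument = 'HSC' AND exposure = 12345",
#     )
#     qgraph.saveUri("test.qgraph")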