Coverage for python/lsst/pipe/base/graphBuilder.py: 17%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set
38from lsst.daf.butler import (
39 CollectionSearch,
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 DimensionGraph,
45 DimensionUniverse,
46 NamedKeyDict,
47 Quantum,
48)
49from lsst.utils import doImport
51from ._datasetQueryConstraints import DatasetQueryConstraintVariant
52from ._status import NoWorkFound
54# -----------------------------
55# Imports for other modules --
56# -----------------------------
57from .connections import AdjustQuantumHelper, iterConnections
58from .graph import QuantumGraph
59from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
61# ----------------------------------
62# Local non-exported definitions --
63# ----------------------------------
65_LOG = logging.getLogger(__name__)
68class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
69 """A custom dictionary that maps `DatasetType` to a nested dictionary of
70 the known `DatasetRef` instances of that type.
72 Parameters
73 ----------
74 args
75 Positional arguments are forwarded to the `dict` constructor.
76 universe : `DimensionUniverse`
77 Universe of all possible dimensions.
78 """
80 def __init__(self, *args, universe: DimensionUniverse):
81 super().__init__(*args)
82 self.universe = universe
84 @classmethod
85 def fromDatasetTypes(
86 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
87 ) -> _DatasetDict:
88 """Construct a dictionary from a flat iterable of `DatasetType` keys.
90 Parameters
91 ----------
92 datasetTypes : `iterable` of `DatasetType`
93 DatasetTypes to use as keys for the dict. Values will be empty
94 dictionaries.
95 universe : `DimensionUniverse`
96 Universe of all possible dimensions.
98 Returns
99 -------
100 dictionary : `_DatasetDict`
101 A new `_DatasetDict` instance.
102 """
103 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
105 @classmethod
106 def fromSubset(
107 cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
108 ) -> _DatasetDict:
109 """Return a new dictionary by extracting items corresponding to the
110 given keys from one or more existing dictionaries.
112 Parameters
113 ----------
114 datasetTypes : `iterable` of `DatasetType`
115 DatasetTypes to use as keys for the dict. Values will be obtained
116 by lookups against ``first`` and ``rest``.
117 first : `_DatasetDict`
118 Another dictionary from which to extract values.
119 rest
120 Additional dictionaries from which to extract values.
122 Returns
123 -------
124 dictionary : `_DatasetDict`
125 A new dictionary instance.
126 """
127 combined = ChainMap(first, *rest)
129 # Dataset types known to match immediately can be processed
130 # without checks.
131 matches = combined.keys() & set(datasetTypes)
132 _dict = {k: combined[k] for k in matches}
134 if len(_dict) < len(datasetTypes):
135 # Work out which ones are missing.
136 missing_datasetTypes = set(datasetTypes) - _dict.keys()
138 # Get the known names for comparison.
139 combined_by_name = {k.name: k for k in combined}
141 missing = set()
142 incompatible = {}
143 for datasetType in missing_datasetTypes:
144 # The dataset type is not found. It may not be listed
145 # or it may be that it is there with the same name
146 # but different definition.
147 if datasetType.name in combined_by_name:
148 # This implies some inconsistency in definitions
149 # for connections. If there is support for storage
150 # class conversion we can let it slide.
151 # At this point we do not know where the
152 # inconsistency is, but trust that downstream
153 # code will be more explicit about input
154 # vs output incompatibilities.
155 existing = combined_by_name[datasetType.name]
156 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
157 _LOG.warning(
158 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
159 datasetType,
160 existing,
161 )
162 _dict[datasetType] = combined[existing]
163 else:
164 incompatible[datasetType] = existing
165 else:
166 missing.add(datasetType)
168 if missing or incompatible:
169 reasons = []
170 if missing:
171 reasons.append(
172 "DatasetTypes {'.'.join(missing)} not present in list of known types: "
173 + ", ".join(d.name for d in combined)
174 )
175 if incompatible:
176 for x, y in incompatible.items():
177 reasons.append(f"{x} incompatible with {y}")
178 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
180 return cls(_dict, universe=first.universe)
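    # Illustrative note (not from the original source): the nested per-data-ID
    # dicts returned by ``fromSubset`` are the same objects held by the parent
    # dictionaries, so refs resolved through either view are visible to both.
    # A minimal sketch with hypothetical names:
    #
    #     parent = _DatasetDict.fromDatasetTypes([calexpType], universe=universe)
    #     subset = _DatasetDict.fromSubset([calexpType], parent)
    #     assert subset[calexpType] is parent[calexpType]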
182 @property
183 def dimensions(self) -> DimensionGraph:
184 """The union of all dimensions used by all dataset types in this
185 dictionary, including implied dependencies (`DimensionGraph`).
186 """
187 base = self.universe.empty
188 if len(self) == 0:
189 return base
190 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
192 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
193 """Unpack nested single-element `DatasetRef` dicts into a new
194 mapping with `DatasetType` keys and `DatasetRef` values.
196 This method assumes that each nested dictionary contains exactly one
197 item, as is the case for all "init" datasets.
199 Returns
200 -------
201 dictionary : `NamedKeyDict`
202 Dictionary mapping `DatasetType` to `DatasetRef`, with both
203 `DatasetType` instances and string names usable as keys.
204 """
206 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
207 (ref,) = refs.values()
208 return ref
210 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
212 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
213 """Unpack nested multi-element `DatasetRef` dicts into a new
214 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
216 Returns
217 -------
218 dictionary : `NamedKeyDict`
219 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
220 both `DatasetType` instances and string names usable as keys.
221 """
222 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
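    # Hedged usage sketch (assumes a populated instance ``dd``; not original
    # code): unpackSingleRefs flattens one-ref-per-type nests, as used for
    # init datasets, while unpackMultiRefs keeps every ref of each type.
    #
    #     singles = dd.unpackSingleRefs()  # NamedKeyDict[DatasetType, DatasetRef]
    #     multis = dd.unpackMultiRefs()    # NamedKeyDict[DatasetType, List[DatasetRef]]
    #     refs = multis["calexp"]          # string keys work via NamedKeyDict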
224 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
225 """Iterate over the contained `DatasetRef` instances that match the
226 given `DatasetType` and data IDs.
228 Parameters
229 ----------
230 datasetType : `DatasetType`
231 Dataset type to match.
232 dataIds : `Iterable` [ `DataCoordinate` ]
233 Data IDs to match.
235 Returns
236 -------
237 refs : `Iterator` [ `DatasetRef` ]
238 DatasetRef instances for which ``ref.datasetType == datasetType``
239 and ``ref.dataId`` is in ``dataIds``.
240 """
241 refs = self[datasetType]
242 return (refs[dataId] for dataId in dataIds)
245class _QuantumScaffolding:
246 """Helper class aggregating information about a `Quantum`, used when
247 constructing a `QuantumGraph`.
249 See `_PipelineScaffolding` for a top-down description of the full
250 scaffolding data structure.
252 Parameters
253 ----------
254 task : _TaskScaffolding
255 Back-reference to the helper object for the `PipelineTask` this quantum
256 represents an execution of.
257 dataId : `DataCoordinate`
258 Data ID for this quantum.
259 """
261 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
262 self.task = task
263 self.dataId = dataId
264 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
265 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
266 self.prerequisites = _DatasetDict.fromDatasetTypes(
267 task.prerequisites.keys(), universe=dataId.universe
268 )
270 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
272 def __repr__(self):
273 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
275 task: _TaskScaffolding
276 """Back-reference to the helper object for the `PipelineTask` this quantum
277 represents an execution of.
278 """
280 dataId: DataCoordinate
281 """Data ID for this quantum.
282 """
284 inputs: _DatasetDict
285 """Nested dictionary containing `DatasetRef` inputs to this quantum.
287 This is initialized to map each `DatasetType` to an empty dictionary at
288 construction. Those nested dictionaries are populated (with data IDs as
289 keys) with unresolved `DatasetRef` instances in
290 `_PipelineScaffolding.connectDataIds`.
291 """
293 outputs: _DatasetDict
294 """Nested dictionary containing `DatasetRef` outputs this quantum.
295 """
297 prerequisites: _DatasetDict
298 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
299 quantum.
300 """
302 def makeQuantum(self) -> Quantum:
303 """Transform the scaffolding object into a true `Quantum` instance.
305 Returns
306 -------
307 quantum : `Quantum`
308 An actual `Quantum` instance.
309 """
310 allInputs = self.inputs.unpackMultiRefs()
311 allInputs.update(self.prerequisites.unpackMultiRefs())
312 # Give the task's Connections class an opportunity to remove some
313 # inputs, or complain if they are unacceptable.
314 # This will raise if one of the check conditions is not met, which is
315 # the intended behavior.
316 # If it raises NoWorkFound, there is a bug in the QG algorithm
317 # or the adjustQuantum is incorrectly trying to make a prerequisite
318 # input behave like a regular input; adjustQuantum should only raise
319 # NoWorkFound if a regular input is missing, and it shouldn't be
320 # possible for us to have generated ``self`` if that's true.
321 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
322 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
323 return Quantum(
324 taskName=self.task.taskDef.taskName,
325 taskClass=self.task.taskDef.taskClass,
326 dataId=self.dataId,
327 initInputs=self.task.initInputs.unpackSingleRefs(),
328 inputs=helper.inputs,
329 outputs=helper.outputs,
330 )
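    # Sketch of the expected call pattern (an illustration under the
    # assumption that ``qscaffolding`` is a fully populated
    # _QuantumScaffolding; not part of the original module):
    #
    #     quantum = qscaffolding.makeQuantum()
    #     # quantum.inputs and quantum.outputs reflect any trimming performed
    #     # by the task's connections class via AdjustQuantumHelper; a
    #     # NoWorkFound raised here indicates a graph-generation bug rather
    #     # than a normal "nothing to do" condition.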
333@dataclass
334class _TaskScaffolding:
335 """Helper class aggregating information about a `PipelineTask`, used when
336 constructing a `QuantumGraph`.
338 See `_PipelineScaffolding` for a top-down description of the full
339 scaffolding data structure.
341 Parameters
342 ----------
343 taskDef : `TaskDef`
344 Data structure that identifies the task class and its config.
345 parent : `_PipelineScaffolding`
346 The parent data structure that will hold the instance being
347 constructed.
348 datasetTypes : `TaskDatasetTypes`
349 Data structure that categorizes the dataset types used by this task.
350 """
352 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
353 universe = parent.dimensions.universe
354 self.taskDef = taskDef
355 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
356 assert self.dimensions.issubset(parent.dimensions)
357 # Initialize _DatasetDicts as subsets of the one or two
358 # corresponding dicts in the parent _PipelineScaffolding.
359 self.initInputs = _DatasetDict.fromSubset(
360 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
361 )
362 self.initOutputs = _DatasetDict.fromSubset(
363 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
364 )
365 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
366 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
367 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
368 self.dataIds = set()
369 self.quanta = {}
371 def __repr__(self):
372 # Default dataclass-injected __repr__ gets caught in an infinite loop
373 # because of back-references.
374 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
376 taskDef: TaskDef
377 """Data structure that identifies the task class and its config
378 (`TaskDef`).
379 """
381 dimensions: DimensionGraph
382 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
383 """
385 initInputs: _DatasetDict
386 """Dictionary containing information about datasets used to construct this
387 task (`_DatasetDict`).
388 """
390 initOutputs: _DatasetDict
391 """Dictionary containing information about datasets produced as a
392 side-effect of constructing this task (`_DatasetDict`).
393 """
395 inputs: _DatasetDict
396 """Dictionary containing information about datasets used as regular,
397 graph-constraining inputs to this task (`_DatasetDict`).
398 """
400 outputs: _DatasetDict
401 """Dictionary containing information about datasets produced by this task
402 (`_DatasetDict`).
403 """
405 prerequisites: _DatasetDict
406 """Dictionary containing information about input datasets that must be
407 present in the repository before any Pipeline containing this task is run
408 (`_DatasetDict`).
409 """
411 quanta: Dict[DataCoordinate, _QuantumScaffolding]
412 """Dictionary mapping data ID to a scaffolding object for the Quantum of
413 this task with that data ID.
414 """
416 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]:
417 """Create a `set` of `Quantum` from the information in ``self``.
419 Returns
420 -------
421 nodes : `set` of `Quantum`
422 The `Quantum` elements corresponding to this task.
423 """
424 if unresolvedRefs is None:
425 unresolvedRefs = set()
426 outputs = set()
427 for q in self.quanta.values():
428 try:
429 tmpQuanta = q.makeQuantum()
430 outputs.add(tmpQuanta)
431 except (NoWorkFound, FileNotFoundError) as exc:
432 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
433 if unresolvedRefs.intersection(refs):
434 # This means it is a node that is known to be pruned
435 # later and should be left in even though some follow-up
436 # queries fail. This allows the pruning to start from this
437 # quantum with known issues, and prune other nodes it
438 # touches.
439 inputs = q.inputs.unpackMultiRefs()
440 inputs.update(q.prerequisites.unpackMultiRefs())
441 tmpQuantum = Quantum(
442 taskName=q.task.taskDef.taskName,
443 taskClass=q.task.taskDef.taskClass,
444 dataId=q.dataId,
445 initInputs=q.task.initInputs.unpackSingleRefs(),
446 inputs=inputs,
447 outputs=q.outputs.unpackMultiRefs(),
448 )
449 outputs.add(tmpQuantum)
450 else:
451 raise exc
452 return outputs
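    # Hedged illustration of how ``unresolvedRefs`` is used above (names are
    # assumptions, not original code): quanta whose adjustQuantum fails but
    # whose inputs overlap the unresolved refs are kept unadjusted so later
    # pruning can start from them; any other failure propagates.
    #
    #     quanta = task_scaffolding.makeQuantumSet(unresolvedRefs=unfound_refs)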
455@dataclass
456class _PipelineScaffolding:
457 """A helper data structure that organizes the information involved in
458 constructing a `QuantumGraph` for a `Pipeline`.
460 Parameters
461 ----------
462 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
463 Sequence of tasks from which a graph is to be constructed. Must
464 have nested task classes already imported.
465 universe : `DimensionUniverse`
466 Universe of all possible dimensions.
468 Notes
469 -----
470 The scaffolding data structure contains nested data structures for both
471 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
472 data structures are shared between the pipeline-level structure (which
473 aggregates all datasets and categorizes them from the perspective of the
474 complete pipeline) and the individual tasks that use them as inputs and
475 outputs.
477 `QuantumGraph` construction proceeds in four steps, with each corresponding
478 to a different `_PipelineScaffolding` method:
480 1. When `_PipelineScaffolding` is constructed, we extract and categorize
481 the DatasetTypes used by the pipeline (delegating to
482 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
483 nested `_TaskScaffolding` and `_DatasetDict` objects.
485 2. In `connectDataIds`, we construct and run the "Big Join Query", which
486 returns related tuples of all dimensions used to identify any regular
487 input, output, and intermediate datasets (not prerequisites). We then
488 iterate over these tuples of related dimensions, identifying the subsets
489 that correspond to distinct data IDs for each task and dataset type,
490 and then create `_QuantumScaffolding` objects.
492 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
493 dataset data IDs previously identified, transforming unresolved
494 DatasetRefs into resolved DatasetRefs where appropriate. We then look
495 up prerequisite datasets for all quanta.
497 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
498 per-task `_QuantumScaffolding` objects.
499 """
501 def __init__(self, pipeline, *, registry):
502 _LOG.debug("Initializing data structures for QuantumGraph generation.")
503 self.tasks = []
504 # Aggregate and categorize the DatasetTypes in the Pipeline.
505 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
506 # Construct dictionaries that map those DatasetTypes to structures
507 # that will (later) hold additional information about them.
508 for attr in (
509 "initInputs",
510 "initIntermediates",
511 "initOutputs",
512 "inputs",
513 "intermediates",
514 "outputs",
515 "prerequisites",
516 ):
517 setattr(
518 self,
519 attr,
520 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
521 )
522 # Aggregate all dimensions for all non-init, non-prerequisite
523 # DatasetTypes. These are the ones we'll include in the big join
524 # query.
525 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
526 # Construct scaffolding nodes for each Task, and add backreferences
527 # to the Task from each DatasetScaffolding node.
528 # Note that there's only one scaffolding node for each DatasetType,
529 # shared by _PipelineScaffolding and all _TaskScaffoldings that
530 # reference it.
531 if isinstance(pipeline, Pipeline):
532 pipeline = pipeline.toExpandedPipeline()
533 self.tasks = [
534 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
535 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
536 ]
538 def __repr__(self):
539 # Default dataclass-injected __repr__ gets caught in an infinite loop
540 # because of back-references.
541 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
543 tasks: List[_TaskScaffolding]
544 """Scaffolding data structures for each task in the pipeline
545 (`list` of `_TaskScaffolding`).
546 """
548 initInputs: _DatasetDict
549 """Datasets consumed but not produced when constructing the tasks in this
550 pipeline (`_DatasetDict`).
551 """
553 initIntermediates: _DatasetDict
554 """Datasets that are both consumed and produced when constructing the tasks
555 in this pipeline (`_DatasetDict`).
556 """
558 initOutputs: _DatasetDict
559 """Datasets produced but not consumed when constructing the tasks in this
560 pipeline (`_DatasetDict`).
561 """
563 inputs: _DatasetDict
564 """Datasets that are consumed but not produced when running this pipeline
565 (`_DatasetDict`).
566 """
568 intermediates: _DatasetDict
569 """Datasets that are both produced and consumed when running this pipeline
570 (`_DatasetDict`).
571 """
573 outputs: _DatasetDict
574 """Datasets produced but not consumed when when running this pipeline
575 (`_DatasetDict`).
576 """
578 prerequisites: _DatasetDict
579 """Datasets that are consumed when running this pipeline and looked up
580 per-Quantum when generating the graph (`_DatasetDict`).
581 """
583 dimensions: DimensionGraph
584 """All dimensions used by any regular input, intermediate, or output
585 (not prerequisite) dataset; the set of dimensions used in the "Big Join
586 Query" (`DimensionGraph`).
588 This is required to be a superset of all task quantum dimensions.
589 """
591 @contextmanager
592 def connectDataIds(
593 self,
594 registry,
595 collections,
596 userQuery,
597 externalDataId,
598 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
599 ):
600 """Query for the data IDs that connect nodes in the `QuantumGraph`.
602 This method populates `_TaskScaffolding.dataIds` and
603 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
605 Parameters
606 ----------
607 registry : `lsst.daf.butler.Registry`
608 Registry for the data repository; used for all data ID queries.
609 collections
610 Expressions representing the collections to search for input
611 datasets. May be any of the types accepted by
612 `lsst.daf.butler.CollectionSearch.fromExpression`.
613 userQuery : `str` or `None`
614 User-provided expression to limit the data IDs processed.
615 externalDataId : `DataCoordinate`
616 Externally-provided data ID that should be used to restrict the
617 results, just as if these constraints had been included via ``AND``
618 in ``userQuery``. This includes (at least) any instrument named
619 in the pipeline definition.
620 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
621 The query constraint variant that should be used to constrain the
622 query based on dataset existence, defaults to
623 `DatasetQueryConstraintVariant.ALL`.
625 Returns
626 -------
627 commonDataIds : \
628 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
629 An interface to a database temporary table containing all data IDs
630 that will appear in this `QuantumGraph`. Returned inside a
631 context manager, which will drop the temporary table at the end of
632 the `with` block in which this method is called.
633 """
634 _LOG.debug("Building query for data IDs.")
635 # Initialization datasets always have empty data IDs.
636 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
637 for datasetType, refs in itertools.chain(
638 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
639 ):
640 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
641 # Run one big query for the data IDs for task dimensions and regular
642 # inputs and outputs. We limit the query to only dimensions that are
643 # associated with the input dataset types, but don't (yet) try to
644 # obtain the dataset_ids for those inputs.
645 _LOG.debug("Submitting data ID query and materializing results.")
646 queryArgs = {"dimensions": self.dimensions, "where": userQuery, "dataId": externalDataId}
647 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
648 _LOG.debug("Constraining graph query using all datasets in pipeline.")
649 queryArgs["datasets"] = list(self.inputs)
650 queryArgs["collections"] = collections
651 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
652 _LOG.debug("Not using dataset existence to constrain query.")
653 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
654 constraint = set(datasetQueryConstraint)
655 inputs = {k.name: k for k in self.inputs.keys()}
656 if remainder := constraint.difference(inputs.keys()):
657 raise ValueError(
658 f"{remainder} dataset type(s) specified as a graph constraint, but"
659 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
660 )
661 _LOG.debug(f"Constraining graph query using {constraint}")
662 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
663 queryArgs["collections"] = collections
664 else:
665 raise ValueError(
666 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
667 )
669 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
670 _LOG.debug("Expanding data IDs.")
671 commonDataIds = commonDataIds.expanded()
672 _LOG.debug("Iterating over query results to associate quanta with datasets.")
673 # Iterate over query results, populating data IDs for datasets and
674 # quanta and then connecting them to each other.
675 n = -1
676 for n, commonDataId in enumerate(commonDataIds):
677 # Create DatasetRefs for all DatasetTypes from this result row,
678 # noting that we might have created some already.
679 # We remember both those that already existed and those that we
680 # create now.
681 refsForRow = {}
682 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
683 for datasetType, refs in itertools.chain(
684 self.inputs.items(), self.intermediates.items(), self.outputs.items()
685 ):
686 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)):
687 datasetDataId = commonDataId.subset(datasetType.dimensions)
688 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
689 ref = refs.get(datasetDataId)
690 if ref is None:
691 ref = DatasetRef(datasetType, datasetDataId)
692 refs[datasetDataId] = ref
693 refsForRow[datasetType.name] = ref
694 # Create _QuantumScaffolding objects for all tasks from this
695 # result row, noting that we might have created some already.
696 for task in self.tasks:
697 quantumDataId = commonDataId.subset(task.dimensions)
698 quantum = task.quanta.get(quantumDataId)
699 if quantum is None:
700 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
701 task.quanta[quantumDataId] = quantum
702 # Whether this is a new quantum or an existing one, we can
703 # now associate the DatasetRefs for this row with it. The
704 # fact that a Quantum data ID and a dataset data ID both
705 # came from the same result row is what tells us they
706 # should be associated.
707 # Many of these associations will be duplicates (because
708 # another query row that differed from this one only in
709 # irrelevant dimensions already added them); the data-ID-keyed
710 # dicts deduplicate them.
711 for datasetType in task.inputs:
712 ref = refsForRow[datasetType.name]
713 quantum.inputs[datasetType.name][ref.dataId] = ref
714 for datasetType in task.outputs:
715 ref = refsForRow[datasetType.name]
716 quantum.outputs[datasetType.name][ref.dataId] = ref
717 if n < 0:
718 emptiness_explained = False
719 for message in commonDataIds.explain_no_results():
720 _LOG.warning(message)
721 emptiness_explained = True
722 if not emptiness_explained:
723 _LOG.warning(
724 "To reproduce this query for debugging purposes, run "
725 "Registry.queryDataIds with these arguments:"
726 )
727 # We could just repr() the queryArgs dict to get something
728 # the user could make sense of, but it's friendlier to
729 # put these args in an easier-to-construct equivalent form
730 # so they can read it more easily and copy and paste into
731 # a Python terminal.
732 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names))
733 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName())
734 if queryArgs["where"]:
735 _LOG.warning(" where=%s,", repr(queryArgs["where"]))
736 if "datasets" in queryArgs:
737 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
738 if "collections" in queryArgs:
739 _LOG.warning(" collections=%s,", list(queryArgs["collections"]))
740 _LOG.debug("Finished processing %d rows from data ID query.", n)
741 yield commonDataIds
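    # Usage note (an assumption-based illustration, not original code): the
    # temporary table behind ``commonDataIds`` only exists for the duration of
    # the ``with`` block, so all follow-up queries must happen inside it. The
    # dataset-existence constraint can be relaxed via the variant argument:
    #
    #     with scaffolding.connectDataIds(
    #         registry, collections, userQuery, externalDataId,
    #         DatasetQueryConstraintVariant.OFF,   # no existence constraint
    #     ) as commonDataIds:
    #         scaffolding.resolveDatasetRefs(
    #             registry, collections, run, commonDataIds,
    #             constrainedByAllDatasets=False,
    #         )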
743 def resolveDatasetRefs(
744 self,
745 registry,
746 collections,
747 run,
748 commonDataIds,
749 *,
750 skipExistingIn=None,
751 clobberOutputs=True,
752 constrainedByAllDatasets: bool = True,
753 ):
754 """Perform follow up queries for each dataset data ID produced in
755 `fillDataIds`.
757 This method populates `_DatasetScaffolding.refs` (except for those in
758 `prerequisites`).
760 Parameters
761 ----------
762 registry : `lsst.daf.butler.Registry`
763 Registry for the data repository; used for all data ID queries.
764 collections
765 Expressions representing the collections to search for input
766 datasets. May be any of the types accepted by
767 `lsst.daf.butler.CollectionSearch.fromExpression`.
768 run : `str`, optional
769 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
770 output datasets, if it already exists.
771 commonDataIds : \
772 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
773 Result of a previous call to `connectDataIds`.
774 skipExistingIn
775 Expressions representing the collections to search for existing
776 output datasets that should be skipped. May be any of the types
777 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
778 `None` or empty string/sequence disables skipping.
779 clobberOutputs : `bool`, optional
780 If `True` (default), allow quanta to be created even if outputs
781 exist; this requires the same behavior to be enabled when
782 executing. If ``skipExistingIn`` is not `None`, completed quanta
783 (those with metadata, or all outputs if there is no metadata
784 dataset configured) will be skipped rather than clobbered.
785 constrainedByAllDatasets : `bool`, optional
786 Indicates if the commonDataIds were generated with a constraint on
787 all dataset types.
789 Raises
790 ------
791 OutputExistsError
792 Raised if an output dataset already exists in the output run
793 and ``skipExistingIn`` does not include output run, or if only
794 some outputs are present and ``clobberOutputs`` is `False`.
795 """
796 skipCollections: Optional[CollectionSearch] = None
797 skipExistingInRun = False
798 if skipExistingIn:
799 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
800 if run:
801 # As an optimization, check the explicit list of names first
802 skipExistingInRun = run in skipCollections.explicitNames()
803 if not skipExistingInRun:
804 # need to flatten it and check again
805 skipExistingInRun = run in registry.queryCollections(
806 skipExistingIn,
807 collectionTypes=CollectionType.RUN,
808 )
810 # Look up [init] intermediate and output datasets in the output
811 # collection, if there is an output collection.
812 if run is not None or skipCollections is not None:
813 for datasetType, refs in itertools.chain(
814 self.initIntermediates.items(),
815 self.initOutputs.items(),
816 self.intermediates.items(),
817 self.outputs.items(),
818 ):
819 _LOG.debug(
820 "Resolving %d datasets for intermediate and/or output dataset %s.",
821 len(refs),
822 datasetType.name,
823 )
824 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
825 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
827 # look at RUN collection first
828 if run is not None:
829 resolvedRefQueryResults = subset.findDatasets(
830 datasetType, collections=run, findFirst=True
831 )
832 for resolvedRef in resolvedRefQueryResults:
833 # TODO: we could easily support per-DatasetType
834 # skipExisting and I could imagine that being useful -
835 # it's probably required in order to support writing
836 # initOutputs before QuantumGraph generation.
837 assert resolvedRef.dataId in refs
838 if not (skipExistingInRun or isInit or clobberOutputs):
839 raise OutputExistsError(
840 f"Output dataset {datasetType.name} already exists in "
841 f"output RUN collection '{run}' with data ID"
842 f" {resolvedRef.dataId}."
843 )
845 # Also check skipExistingIn; the case where the RUN
846 # collection is in it is handled above.
847 if skipCollections is not None:
848 resolvedRefQueryResults = subset.findDatasets(
849 datasetType, collections=skipCollections, findFirst=True
850 )
851 for resolvedRef in resolvedRefQueryResults:
852 assert resolvedRef.dataId in refs
853 refs[resolvedRef.dataId] = resolvedRef
855 # Look up input and initInput datasets in the input collection(s).
856 # Container to accumulate unfound refs, used if the common data IDs
857 # were not constrained on dataset type existence.
858 self.unfoundRefs = set()
859 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
860 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
861 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets(
862 datasetType, collections=collections, findFirst=True
863 )
864 dataIdsNotFoundYet = set(refs.keys())
865 for resolvedRef in resolvedRefQueryResults:
866 dataIdsNotFoundYet.discard(resolvedRef.dataId)
867 refs[resolvedRef.dataId] = resolvedRef
868 if dataIdsNotFoundYet:
869 if constrainedByAllDatasets:
870 raise RuntimeError(
871 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
872 f"'{datasetType.name}' was/were present in a previous "
873 f"query, but could not be found now."
874 f"This is either a logic bug in QuantumGraph generation "
875 f"or the input collections have been modified since "
876 f"QuantumGraph generation began."
877 )
878 else:
879 # If the common data IDs were not constrained using all the
880 # input dataset types, it is possible that some data IDs
881 # found don't correspond to any existing dataset and their
882 # refs will remain unresolved. Mark these for later pruning
883 # from the quantum graph.
884 for k in dataIdsNotFoundYet:
885 self.unfoundRefs.add(refs[k])
887 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
888 # replacing the unresolved refs there, and then look up prerequisites.
889 for task in self.tasks:
890 _LOG.debug(
891 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
892 len(task.quanta),
893 task.taskDef.label,
894 )
895 lookupFunctions = {
896 c.name: c.lookupFunction
897 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
898 if c.lookupFunction is not None
899 }
900 dataIdsFailed = []
901 dataIdsSucceeded = []
902 for quantum in task.quanta.values():
903 # Process output datasets only if skipExistingIn is not None
904 # or there is a run to look for outputs in and clobberOutputs
905 # is True. Note that if skipExistingIn is None, any output
906 # datasets that already exist would have already caused an
907 # exception to be raised. We never update the DatasetRefs in
908 # the quantum because those should never be resolved.
909 if skipCollections is not None or (run is not None and clobberOutputs):
910 resolvedRefs = []
911 unresolvedRefs = []
912 haveMetadata = False
913 for datasetType, originalRefs in quantum.outputs.items():
914 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
915 if ref.id is not None:
916 resolvedRefs.append(ref)
917 if datasetType.name == task.taskDef.metadataDatasetName:
918 haveMetadata = True
919 else:
920 unresolvedRefs.append(ref)
921 if resolvedRefs:
922 if haveMetadata or not unresolvedRefs:
923 dataIdsSucceeded.append(quantum.dataId)
924 if skipCollections is not None:
925 continue
926 else:
927 dataIdsFailed.append(quantum.dataId)
928 if not clobberOutputs:
929 raise OutputExistsError(
930 f"Quantum {quantum.dataId} of task with label "
931 f"'{quantum.task.taskDef.label}' has some outputs that exist "
932 f"({resolvedRefs}) "
933 f"and others that don't ({unresolvedRefs}), with no metadata output, "
934 "and clobbering outputs was not enabled."
935 )
936 # Update the input DatasetRefs to the resolved ones we already
937 # searched for.
938 for datasetType, refs in quantum.inputs.items():
939 for ref in task.inputs.extract(datasetType, refs.keys()):
940 refs[ref.dataId] = ref
941 # Look up prerequisite datasets in the input collection(s).
942 # These may have dimensions that extend beyond those we queried
943 # for originally, because we want to permit those data ID
944 # values to differ across quanta and dataset types.
945 for datasetType in task.prerequisites:
946 lookupFunction = lookupFunctions.get(datasetType.name)
947 if lookupFunction is not None:
948 # PipelineTask has provided its own function to do the
949 # lookup. This always takes precedence.
950 refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
951 elif (
952 datasetType.isCalibration()
953 and datasetType.dimensions <= quantum.dataId.graph
954 and quantum.dataId.graph.temporal
955 ):
956 # This is a master calibration lookup, which we have to
957 # handle specially because the query system can't do a
958 # temporal join on a non-dimension-based timespan yet.
959 timespan = quantum.dataId.timespan
960 try:
961 refs = [
962 registry.findDataset(
963 datasetType, quantum.dataId, collections=collections, timespan=timespan
964 )
965 ]
966 except KeyError:
967 # This dataset type is not present in the registry,
968 # which just means there are no datasets here.
969 refs = []
970 else:
971 # Most general case.
972 refs = list(
973 registry.queryDatasets(
974 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True
975 ).expanded()
976 )
977 quantum.prerequisites[datasetType].update(
978 {ref.dataId: ref for ref in refs if ref is not None}
979 )
980 # Actually remove any quanta that we decided to skip above.
981 if dataIdsSucceeded:
982 if skipCollections is not None:
983 _LOG.debug(
984 "Pruning successful %d quanta for task with label '%s' because all of their "
985 "outputs exist or metadata was written successfully.",
986 len(dataIdsSucceeded),
987 task.taskDef.label,
988 )
989 for dataId in dataIdsSucceeded:
990 del task.quanta[dataId]
991 elif clobberOutputs:
992 _LOG.info(
993 "Found %d successful quanta for task with label '%s' "
994 "that will need to be clobbered during execution.",
995 len(dataIdsSucceeded),
996 task.taskDef.label,
997 )
998 else:
999 raise AssertionError("OutputExistsError should have already been raised.")
1000 if dataIdsFailed:
1001 if clobberOutputs:
1002 _LOG.info(
1003 "Found %d failed/incomplete quanta for task with label '%s' "
1004 "that will need to be clobbered during execution.",
1005 len(dataIdsFailed),
1006 task.taskDef.label,
1007 )
1008 else:
1009 raise AssertionError("OutputExistsError should have already been raised.")
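    # Hedged sketch of the two knobs used above when outputs already exist
    # (``run`` and the other arguments are caller-supplied assumptions):
    #
    #     scaffolding.resolveDatasetRefs(
    #         registry, collections, run, commonDataIds,
    #         skipExistingIn=[run],   # skip quanta whose outputs/metadata already exist
    #         clobberOutputs=True,    # otherwise keep them for clobbering at execution
    #     )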
1011 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
1012 """Create a `QuantumGraph` from the quanta already present in
1013 the scaffolding data structure.
1015 Parameters
1016 ----------
1017 metadata : Optional Mapping of `str` to primitives
1018 This is an optional parameter of extra data to carry with the
1019 graph. Entries in this mapping should be able to be serialized in
1020 JSON.
1022 Returns
1023 -------
1024 graph : `QuantumGraph`
1025 The full `QuantumGraph`.
1026 """
1027 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1028 for task in self.tasks:
1029 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs)
1030 graphInput[task.taskDef] = qset
1032 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs)
1033 return graph
1036# ------------------------
1037# Exported definitions --
1038# ------------------------
1041class GraphBuilderError(Exception):
1042 """Base class for exceptions generated by graph builder."""
1044 pass
1047class OutputExistsError(GraphBuilderError):
1048 """Exception generated when output datasets already exist."""
1050 pass
1053class PrerequisiteMissingError(GraphBuilderError):
1054 """Exception generated when a prerequisite dataset does not exist."""
1056 pass
1059class GraphBuilder(object):
1060 """GraphBuilder class is responsible for building task execution graph from
1061 a Pipeline.
1063 Parameters
1064 ----------
1065 registry : `~lsst.daf.butler.Registry`
1066 Registry for the data repository; used for all data ID and dataset queries.
1067 skipExistingIn
1068 Expressions representing the collections to search for existing
1069 output datasets that should be skipped. May be any of the types
1070 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
1071 clobberOutputs : `bool`, optional
1072 If `True` (default), allow quanta to be created even if partial
1073 outputs exist; this requires the same behavior to be enabled when
1074 executing.
1075 """
1077 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
1078 self.registry = registry
1079 self.dimensions = registry.dimensions
1080 self.skipExistingIn = skipExistingIn
1081 self.clobberOutputs = clobberOutputs
1083 def makeGraph(
1084 self,
1085 pipeline,
1086 collections,
1087 run,
1088 userQuery,
1089 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1090 metadata: Optional[Mapping[str, Any]] = None,
1091 ):
1092 """Create execution graph for a pipeline.
1094 Parameters
1095 ----------
1096 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1097 Pipeline definition, task names/classes and their configs.
1098 collections
1099 Expressions representing the collections to search for input
1100 datasets. May be any of the types accepted by
1101 `lsst.daf.butler.CollectionSearch.fromExpression`.
1102 run : `str`, optional
1103 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1104 output datasets, if it already exists.
1105 userQuery : `str`
1106 String which defines the user-provided selection for the registry;
1107 should be empty or `None` if there are no restrictions on data selection.
1108 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1109 The query constraint variant that should be used to constrain the
1110 query based on dataset existence, defaults to
1111 `DatasetQueryConstraintVariant.ALL`.
1112 metadata : Optional Mapping of `str` to primitives
1113 This is an optional parameter of extra data to carry with the
1114 graph. Entries in this mapping should be able to be serialized in
1115 JSON.
1117 Returns
1118 -------
1119 graph : `QuantumGraph`
1121 Raises
1122 ------
1123 UserExpressionError
1124 Raised when user expression cannot be parsed.
1125 OutputExistsError
1126 Raised when output datasets already exist.
1127 Exception
1128 Other exception types may be raised by underlying registry
1129 classes.
1130 """
1131 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1132 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1133 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1134 instrument = None
1135 if isinstance(pipeline, Pipeline):
1136 instrument = pipeline.getInstrument()
1137 if isinstance(instrument, str):
1138 instrument = doImport(instrument)
1139 pipeline = list(pipeline.toExpandedPipeline())
1140 if instrument is not None:
1141 dataId = DataCoordinate.standardize(
1142 instrument=instrument.getName(), universe=self.registry.dimensions
1143 )
1144 else:
1145 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1146 with scaffolding.connectDataIds(
1147 self.registry, collections, userQuery, dataId, datasetQueryConstraint
1148 ) as commonDataIds:
1149 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1150 scaffolding.resolveDatasetRefs(
1151 self.registry,
1152 collections,
1153 run,
1154 commonDataIds,
1155 skipExistingIn=self.skipExistingIn,
1156 clobberOutputs=self.clobberOutputs,
1157 constrainedByAllDatasets=condition,
1158 )
1159 return scaffolding.makeQuantumGraph(metadata=metadata)
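# Usage sketch (illustrative only; ``butler`` and ``pipeline`` are assumed to
# have been created elsewhere, e.g. with lsst.daf.butler.Butler and
# Pipeline.fromFile, and the collection names below are hypothetical):
#
#     builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/example/run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )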