from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

import itertools
import logging

from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List

from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport

_LOG = logging.getLogger(__name__.partition(".")[2])
class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    *args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
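    # Hedged illustration of the two-level structure this factory produces
    # (hypothetical ``rawType`` dataset type and ``dataId``; assumes
    # ``registry.dimensions`` is a standard `DimensionUniverse`):
    #
    #     d = _DatasetDict.fromDatasetTypes([rawType], universe=registry.dimensions)
    #     d[rawType][dataId] = DatasetRef(rawType, dataId)
    #     # outer key: DatasetType; inner key: DataCoordinate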
    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict,
                   *rest: _DatasetDict) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        *rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)
126 """The union of all dimensions used by all dataset types in this
127 dictionary, including implied dependencies (`DimensionGraph`).
132 return base.union(*[datasetType.dimensions
for datasetType
in self.keys()])
135 """Unpack nested single-element `DatasetRef` dicts into a new
136 mapping with `DatasetType` keys and `DatasetRef` values.
138 This method assumes that each nest contains exactly one item, as is the
139 case for all "init" datasets.
143 dictionary : `NamedKeyDict`
144 Dictionary mapping `DatasetType` to `DatasetRef`, with both
145 `DatasetType` instances and string names usable as keys.
147 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
150 return NamedKeyDict({datasetType: getOne(refs)
for datasetType, refs
in self.items()})
153 """Unpack nested multi-element `DatasetRef` dicts into a new
154 mapping with `DatasetType` keys and `set` of `DatasetRef` values.
158 dictionary : `NamedKeyDict`
159 Dictionary mapping `DatasetType` to `DatasetRef`, with both
160 `DatasetType` instances and string names usable as keys.
162 return NamedKeyDict({datasetType: list(refs.values())
for datasetType, refs
in self.items()})
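    # Hedged contrast between the two unpack methods, for a nest holding a
    # single ref (hypothetical ``biasType`` and ``ref``):
    #
    #     d = _DatasetDict({biasType: {ref.dataId: ref}}, universe=universe)
    #     d.unpackSingleRefs()  # -> NamedKeyDict({biasType: ref})
    #     d.unpackMultiRefs()   # -> NamedKeyDict({biasType: [ref]})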
    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
187 """Helper class aggregating information about a `Quantum`, used when
188 constructing a `QuantumGraph`.
190 See `_PipelineScaffolding` for a top-down description of the full
191 scaffolding data structure.
195 task : _TaskScaffolding
196 Back-reference to the helper object for the `PipelineTask` this quantum
197 represents an execution of.
198 dataId : `DataCoordinate`
199 Data ID for this quantum.
201 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
204 self.
inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
205 self.
outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
206 self.
prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
207 universe=dataId.universe)
209 __slots__ = (
"task",
"dataId",
"inputs",
"outputs",
"prerequisites")
212 return f
"_QuantumScaffolding(taskDef={self.taskDef}, dataId={self.dataId}, ...)"
    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """
242 """Transform the scaffolding object into a true `Quantum` instance.
247 An actual `Quantum` instance.
249 allInputs = self.
inputs.unpackMultiRefs()
253 config = self.
task.taskDef.config
254 connections = config.connections.ConnectionsClass(config=config)
257 allInputs = connections.adjustQuantum(allInputs)
259 taskName=self.
task.taskDef.taskName,
260 taskClass=self.
task.taskDef.taskClass,
262 initInputs=self.
task.initInputs.unpackSingleRefs(),
263 predictedInputs=allInputs,
264 outputs=self.
outputs.unpackMultiRefs(),
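    # Hedged sketch of the ``adjustQuantum`` hook used above (hypothetical
    # connections class, shown only to illustrate the call contract that
    # `makeQuantum` relies on: receive the predicted-input mapping, return a
    # possibly trimmed version of it):
    #
    #     class ExampleConnections(...):
    #         def adjustQuantum(self, datasets):
    #             # drop or validate predicted inputs here
    #             return datasets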
270 """Helper class aggregating information about a `PipelineTask`, used when
271 constructing a `QuantumGraph`.
273 See `_PipelineScaffolding` for a top-down description of the full
274 scaffolding data structure.
279 Data structure that identifies the task class and its config.
280 parent : `_PipelineScaffolding`
281 The parent data structure that will hold the instance being
283 datasetTypes : `TaskDatasetTypes`
284 Data structure that categorizes the dataset types used by this task.
286 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
287 universe = parent.dimensions.universe
289 self.
dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
290 assert self.
dimensions.issubset(parent.dimensions)
293 self.
initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
294 parent.initIntermediates)
295 self.
initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
297 self.
inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
298 self.
outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
299 self.
prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
306 return f
"_TaskScaffolding(taskDef={self.taskDef}, ...)"
309 """Data structure that identifies the task class and its config
313 dimensions: DimensionGraph
314 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
317 initInputs: _DatasetDict
318 """Dictionary containing information about datasets used to construct this
319 task (`_DatasetDict`).
322 initOutputs: _DatasetDict
323 """Dictionary containing information about datasets produced as a
324 side-effect of constructing this task (`_DatasetDict`).
328 """Dictionary containing information about datasets used as regular,
329 graph-constraining inputs to this task (`_DatasetDict`).
332 outputs: _DatasetDict
333 """Dictionary containing information about datasets produced by this task
337 prerequisites: _DatasetDict
338 """Dictionary containing information about input datasets that must be
339 present in the repository before any Pipeline containing this task is run
343 quanta: Dict[DataCoordinate, _QuantumScaffolding]
344 """Dictionary mapping data ID to a scaffolding object for the Quantum of
345 this task with that data ID.
349 """Create a `QuantumGraphTaskNodes` instance from the information in
354 nodes : `QuantumGraphTaskNodes`
355 The `QuantumGraph` elements corresponding to this task.
359 quanta=[q.makeQuantum()
for q
in self.
quanta.values()],
360 initInputs=self.
initInputs.unpackSingleRefs(),
367 """A helper data structure that organizes the information involved in
368 constructing a `QuantumGraph` for a `Pipeline`.
372 pipeline : `Pipeline`
373 Sequence of tasks from which a graph is to be constructed. Must
374 have nested task classes already imported.
375 universe : `DimensionUniverse`
376 Universe of all possible dimensions.
380 The scaffolding data structure contains nested data structures for both
381 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
382 data structures are shared between the pipeline-level structure (which
383 aggregates all datasets and categorizes them from the perspective of the
384 complete pipeline) and the individual tasks that use them as inputs and
387 `QuantumGraph` construction proceeds in four steps, with each corresponding
388 to a different `_PipelineScaffolding` method:
390 1. When `_PipelineScaffolding` is constructed, we extract and categorize
391 the DatasetTypes used by the pipeline (delegating to
392 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
393 nested `_TaskScaffolding` and `_DatasetDict` objects.
395 2. In `connectDataIds`, we construct and run the "Big Join Query", which
396 returns related tuples of all dimensions used to identify any regular
397 input, output, and intermediate datasets (not prerequisites). We then
398 iterate over these tuples of related dimensions, identifying the subsets
399 that correspond to distinct data IDs for each task and dataset type,
400 and then create `_QuantumScaffolding` objects.
402 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
403 dataset data IDs previously identified, transforming unresolved
404 DatasetRefs into resolved DatasetRefs where appropriate. We then look
405 up prerequisite datasets for all quanta.
407 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
408 per-task `_QuantumScaffolding` objects.
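    A compact sketch of the intended calling sequence, mirroring
    `GraphBuilder.makeGraph` (``registry``, ``collections``, ``run``, and
    ``userQuery`` are caller-provided)::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        graph = scaffolding.makeQuantumGraph()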
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the "Big Join
        # Query".
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, sharing the (single)
        # `_DatasetDict` nested dictionaries between the pipeline and the
        # tasks that use them.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """
490 """Query for the data IDs that connect nodes in the `QuantumGraph`.
492 This method populates `_TaskScaffolding.dataIds` and
493 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
497 registry : `lsst.daf.butler.Registry`
498 Registry for the data repository; used for all data ID queries.
499 collections : `lsst.daf.butler.CollectionSearch`
500 Object representing the collections to search for input datasets.
501 userQuery : `str`, optional
502 User-provided expression to limit the data IDs processed.
507 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
508 An interface to a database temporary table containing all data IDs
509 that will appear in this `QuantumGraph`. Returned inside a
510 context manager, which will drop the temporary table at the end of
511 the `with` block in which this method is called.
513 _LOG.debug(
"Building query for data IDs.")
515 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
516 for datasetType, refs
in itertools.chain(self.initInputs.items(),
517 self.initIntermediates.items(),
518 self.initOutputs.items()):
519 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run the query and materialize the results into an in-database
        # temporary table, so we can iterate over them multiple times without
        # re-executing the query.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets
            # and quanta and then connecting them to each other.
            n = -1  # guard against the query returning no rows
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(),
                                                         self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a quantum data ID and a dataset data ID came
                    # from the same result row is what associates them.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n + 1)
            yield commonDataIds
576 """Perform follow up queries for each dataset data ID produced in
579 This method populates `_DatasetScaffolding.refs` (except for those in
584 registry : `lsst.daf.butler.Registry`
585 Registry for the data repository; used for all data ID queries.
586 collections : `lsst.daf.butler.CollectionSearch`
587 Object representing the collections to search for input datasets.
588 run : `str`, optional
589 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
590 output datasets, if it already exists.
592 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
593 Result of a previous call to `connectDataIds`.
594 skipExisting : `bool`, optional
595 If `True` (default), a Quantum is not created if all its outputs
596 already exist in ``run``. Ignored if ``run`` is `None`.
601 Raised if an output dataset already exists in the output run
602 and ``skipExisting`` is `False`. The case where some but not all
603 of a quantum's outputs are present and ``skipExisting`` is `True`
604 cannot be identified at this stage, and is handled by `fillQuanta`
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is one.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True,
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True,
                )
                for resolvedRef in resolvedRefQueryResults:
                    # Existing refs were created unresolved in connectDataIds,
                    # so every resolved data ID should already be present.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True,
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True,
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta), task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process outputs first: if skipExisting is True, we may want
                # to skip this quantum because all of its outputs already
                # exist.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{task.taskDef.label}' has some outputs that exist ({resolvedRefs}) "
                                f"and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum entirely.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # have.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # The task has provided its own function to do the
                        # lookup; this takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their "
                           "outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]
736 """Create a `QuantumGraph` from the quanta already present in
737 the scaffolding data structure.
741 graph : `QuantumGraph`
742 The full `QuantumGraph`.
745 graph.initInputs = self.initInputs.unpackSingleRefs()
746 graph.initOutputs = self.initOutputs.unpackSingleRefs()
747 graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
752 """Implementation of TreeVisitor which looks for instrument name
754 Instrument should be specified as a boolean expression
756 instrument = 'string'
757 'string' = instrument
759 so we only need to find a binary operator where operator is "=",
760 one side is a string literal and other side is an identifier.
761 All visit methods return tuple of (type, value), non-useful nodes
762 return None for both type and value.
773 return (
"str", value)
784 if name.lower() ==
"instrument":
785 return (
"id",
"instrument")
794 if lhs == (
"id",
"instrument")
and rhs[0] ==
"str":
796 elif rhs == (
"id",
"instrument")
and lhs[0] ==
"str":
def _findInstruments(queryStr):
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments
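# Hedged illustration of the helper above (hypothetical query strings):
#
#     _findInstruments("instrument = 'HSC' AND visit > 100")  # -> ['HSC']
#     _findInstruments("skymap = 'rings'")                    # -> []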
826 """Base class for exceptions generated by graph builder.
831 class OutputExistsError(GraphBuilderError):
832 """Exception generated when output datasets already exist.
838 """Exception generated when a prerequisite dataset does not exist.
844 """GraphBuilder class is responsible for building task execution graph from
849 registry : `~lsst.daf.butler.Registry`
850 Data butler instance.
851 skipExisting : `bool`, optional
852 If `True` (default), a Quantum is not created if all its outputs
    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry, should
            be empty or `None` if there are no restrictions on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        ValueError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()
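    # Hedged usage sketch for the method above (hypothetical ``butler``,
    # ``pipeline``, and collections; the RUN name and query are placeholders):
    #
    #     builder = GraphBuilder(butler.registry, skipExisting=True)
    #     qgraph = builder.makeGraph(pipeline, collections=collections,
    #                                run="u/example/run",
    #                                userQuery="visit = 903334")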
    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have one,
        and verify that if given an instrument name that there are no other
        instrument restrictions in the query.

        Parameters
        ----------
        instrumentName : `str`
            The name of the instrument that should appear in the query.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            If the pipeline names an instrument and the query contains more
            than one instrument or the name of the instrument in the query
            does not match the instrument named by the pipeline.
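
        For example (illustrative only; assumes the pipeline names the
        hypothetical instrument ``HSC``)::

            _verifyInstrumentRestriction("HSC", "visit = 12")
            # returns "instrument = 'HSC' AND (visit = 12)"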
        """
        if not instrumentName:
            return query
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query "
                               "must have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query; add it.
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug(f"Adding restriction \"{restriction}\" to query.")
            query = f"{restriction} AND ({query})" if query else restriction
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument named by the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\")")
        return query