22 """Module defining GraphBuilder class and related methods. 25 __all__ = [
'GraphBuilder']
31 from collections
import namedtuple
32 from itertools
import chain
38 from .graph
import QuantumGraphTaskNodes, QuantumGraph
39 from lsst.daf.butler
import Quantum, DatasetRef
40 from lsst.daf.butler.exprParser
import ParserYacc, ParserYaccError
46 _LOG = logging.getLogger(__name__.partition(
".")[2])
55 _TaskDatasetTypes = namedtuple(
"_TaskDatasetTypes",
"taskDef inputs outputs initInputs initOutputs")
# NOTE(review): extraction-mangled fragment.  It contains (a) the docstring of
# the exception base class GraphBuilderError (its `class ...(Exception):`
# header line is missing from this view) and (b) the body of a derived
# exception's __init__(self, expr, exc) — signature detached to the end of the
# file — which formats a "failed to parse user expression" message and
# delegates to GraphBuilderError.__init__.  The derived class's name and
# header are not visible here; confirm against the full file.  Leading
# integers ("59"/"65"/"69"/"70") are original line numbers fused into the
# text by extraction — not code.
59 """Base class for exceptions generated by graph builder. 65 """Exception generated by graph builder for error in user expression. 69 msg =
"Failed to parse user expression `{}' ({})".format(expr, exc)
70 GraphBuilderError.__init__(self, msg)
# NOTE(review): extraction-mangled fragment — the docstring and
# __init__(self, taskName, refs) body of an exception raised when output
# datasets already exist (class name/header missing from this view; the
# signature was detached to the end of the file).  It joins the string forms
# of the offending dataset references into one message and delegates to
# GraphBuilderError.__init__.  Leading integers are original line numbers
# fused into the text — not code.
74 """Exception generated when output datasets already exist. 78 refs =
', '.join(str(ref)
for ref
in refs)
79 msg =
"Output datasets already exist for task {}: {}".format(taskName, refs)
80 GraphBuilderError.__init__(self, msg)
# NOTE(review): extraction-mangled fragment.  It holds: the GraphBuilder class
# docstring (parameters: taskFactory, registry, skipExisting=True); the
# __init__ signature — its body (presumably attribute assignments) is missing
# from this view, confirm against the full file; and the beginning of the
# static method _parseUserQuery(userQuery), which builds a ParserYacc, parses
# the user expression, and logs the parsed tree.  The body of the
# `except ParserYaccError as exc:` handler and the method's return statement
# are also missing from this view — the docstring says it returns an
# `exprTree.Node`.  Leading integers are original line numbers fused into the
# text — not code.
90 GraphBuilder class is responsible for building task execution graph from 95 taskFactory : `TaskFactory` 96 Factory object used to load/instantiate PipelineTasks 97 registry : `~lsst.daf.butler.Registry` 99 skipExisting : `bool`, optional 100 If ``True`` (default) then Quantum is not created if all its outputs 101 already exist, otherwise exception is raised. 104 def __init__(self, taskFactory, registry, skipExisting=True):
111 def _parseUserQuery(userQuery):
117 User expression string specifying data selection. 121 `exprTree.Node` instance representing parsed expression tree. 123 parser = ParserYacc()
126 tree = parser.parse(userQuery)
127 _LOG.debug(
"parsed expression: %s", tree)
128 except ParserYaccError
as exc:
132 def _loadTaskClass(self, taskDef):
133 """Make sure task class is loaded. 135 Load task class, update task name to make sure it is fully-qualified, 136 do not update original taskDef in a Pipeline though. 144 `TaskDef` instance, may be the same as parameter if task class is 147 if taskDef.taskClass
is None:
148 tClass, tName = self.
taskFactory.loadTaskClass(taskDef.taskName)
149 taskDef = copy.copy(taskDef)
150 taskDef.taskClass = tClass
151 taskDef.taskName = tName
# NOTE(review): extraction-mangled fragment of makeGraph(self, pipeline,
# originInfo, userQuery) — the signature was detached to the end of the file.
# Visible logic: for each task, fetch the four get{Input,Output,InitInput,
# InitOutput}DatasetTypes class getters, collect their dataset types, and
# build a _TaskDatasetTypes record; then delegate to self._makeGraph.
# Missing from this view (original lines 184-186, 189, 195-199): the
# taskList/taskDatasets/taskIo initializations and whatever computes
# `inputs`/`outputs`/`initInputs`/`initOutputs` before the final call —
# presumably via self._makeFullIODatasetTypes; confirm against the full
# file.  Leading integers are original line numbers fused into the text —
# not code.
155 """Create execution graph for a pipeline. 159 pipeline : `Pipeline` 160 Pipeline definition, task names/classes and their configs. 161 originInfo : `~lsst.daf.butler.DatasetOriginInfo` 162 Object which provides names of the input/output collections. 164 String which defines user-defined selection for registry, should be 165 empty or `None` if there is no restrictions on data selection. 169 graph : `QuantumGraph` 174 Raised when user expression cannot be parsed. 176 Raised when output datasets already exist. 178 Other exceptions types may be raised by underlying registry 187 for taskDef
in taskList:
188 taskClass = taskDef.taskClass
# One pass per I/O category; the getters are looked up by f-string name.
190 for attr
in (
"Input",
"Output",
"InitInput",
"InitOutput"):
191 getter = getattr(taskClass, f
"get{attr}DatasetTypes")
# A getter may return None/empty; `or {}` normalizes to an empty mapping.
192 ioObject = getter(taskDef.config)
or {}
193 taskIo.append([dsTypeDescr.datasetType
for dsTypeDescr
in ioObject.values()])
194 taskDatasets.append(_TaskDatasetTypes(taskDef, *taskIo))
200 return self.
_makeGraph(taskDatasets, inputs, outputs, initInputs, initOutputs,
201 originInfo, userQuery)
# NOTE(review): extraction-mangled fragment.  Visible logic: accumulate
# dataset-type NAMES into four sets (inputs/outputs/initInputs/initOutputs)
# while recording name -> DatasetType in allDatasetTypes; subtract
# init-outputs from init-inputs (datasets produced at init are not external
# inputs); finally map each name set back to sets of DatasetType objects and
# return the four sets.  Missing from this view (original lines 221-228,
# 235-238): the initializations of the four sets and allDatasetTypes, and
# possibly a matching `inputs -= outputs` subtraction — confirm against the
# full file.  Leading integers are original line numbers fused into the
# text — not code.
203 def _makeFullIODatasetTypes(self, taskDatasets):
204 """Returns full set of input and output dataset types for all tasks. 208 taskDatasets : sequence of `_TaskDatasetTypes` 209 Tasks with their inputs, outputs, initInputs and initOutputs. 213 inputs : `set` of `butler.DatasetType` 214 Datasets used as inputs by the pipeline. 215 outputs : `set` of `butler.DatasetType` 216 Datasets produced by the pipeline. 217 initInputs : `set` of `butler.DatasetType` 218 Datasets used as init method inputs by the pipeline. 219 initOutputs : `set` of `butler.DatasetType` 220 Datasets used as init method outputs by the pipeline. 229 for taskDs
in taskDatasets:
230 for ioType, ioSet
in zip((
"inputs",
"outputs",
"initInputs",
"initOutputs"),
231 (inputs, outputs, initInputs, initOutputs)):
232 for dsType
in getattr(taskDs, ioType):
233 ioSet.add(dsType.name)
234 allDatasetTypes[dsType.name] = dsType
239 initInputs -= initOutputs
# Names were used for de-duplication above; convert back to DatasetType.
241 inputs = set(allDatasetTypes[name]
for name
in inputs)
242 outputs = set(allDatasetTypes[name]
for name
in outputs)
243 initInputs = set(allDatasetTypes[name]
for name
in initInputs)
244 initOutputs = set(allDatasetTypes[name]
for name
in initOutputs)
245 return inputs, outputs, initInputs, initOutputs
# NOTE(review): extraction-mangled fragment of _makeGraph.  Visible logic:
# select dimension rows from the registry for the user expression; record
# input/output dataset types on the graph; resolve init-input refs from the
# input collections and create empty-dataId refs for init-outputs; then, per
# task, group the selected rows into quanta keyed by the task's quantum
# dimension link values, collecting de-duplicated input/output DatasetRefs
# per quantum; finally build Quantum objects, optionally skipping quanta
# whose outputs all already exist (self.skipExisting).  Missing from this
# view: the `parsedQuery` assignment (presumably self._parseUserQuery),
# dimensionVerse/qgraph/quanta initializations, the dimension lookup feeding
# `qlinks`, the raise of the output-exists error, and the final return —
# confirm against the full file.  Leading integers are original line numbers
# fused into the text — not code.
247 def _makeGraph(self, taskDatasets, inputs, outputs, initInputs, initOutputs, originInfo, userQuery):
248 """Make QuantumGraph instance. 252 taskDatasets : sequence of `_TaskDatasetTypes` 253 Tasks with their inputs and outputs. 254 inputs : `set` of `DatasetType` 255 Datasets which should already exist in input repository 256 outputs : `set` of `DatasetType` 257 Datasets which will be created by tasks 258 initInputs : `set` of `DatasetType` 259 Datasets which should exist in input repository, and will be used 260 in task initialization 261 initOutputs : `set` of `DatasetType` 262 Datasets which will be created in task initialization 263 originInfo : `DatasetOriginInfo` 264 Object which provides names of the input/output collections. 266 String which defines user-defined selection for registry, should be 267 empty or `None` if there is no restrictions on data selection. 271 `QuantumGraph` instance. 274 expr =
None if parsedQuery
is None else str(parsedQuery)
275 rows = self.
registry.selectDimensions(originInfo, expr, inputs, outputs)
281 _LOG.debug(
"row: %s", row)
282 dimensionVerse.append(row)
286 qgraph._inputDatasetTypes = inputs
287 qgraph._outputDatasetTypes = outputs
# Resolve each init-input against the input collections; first hit wins.
288 for dsType
in initInputs:
289 for collection
in originInfo.getInputCollections(dsType.name):
290 result = self.
registry.find(collection, dsType)
291 if result
is not None:
292 qgraph.initInputs.append(result)
# Init-outputs get unresolved refs with an empty dataId.
297 for dsType
in initOutputs:
298 qgraph.initOutputs.append(DatasetRef(dsType, {}))
300 for taskDss
in taskDatasets:
301 taskQuantaInputs = {}
302 taskQuantaOutputs = {}
# Quantum key is the tuple of this task's quantum-dimension link values.
304 for dimensionName
in taskDss.taskDef.config.quantum.dimensions:
306 qlinks += dimension.links()
307 _LOG.debug(
"task %s qdimensions: %s", taskDss.taskDef.label, qlinks)
311 for row
in dimensionVerse:
312 qkey = tuple((col, row.dataId[col])
for col
in qlinks)
313 _LOG.debug(
"qkey: %s", qkey)
# Sorted dataId items give a stable, hashable de-duplication key.
315 def _dataRefKey(dataRef):
316 return tuple(sorted(dataRef.dataId.items()))
318 qinputs = taskQuantaInputs.setdefault(qkey, {})
319 for dsType
in taskDss.inputs:
320 dataRefs = qinputs.setdefault(dsType, {})
321 dataRef = row.datasetRefs[dsType]
322 dataRefs[_dataRefKey(dataRef)] = dataRef
323 _LOG.debug(
"add input dataRef: %s %s", dsType.name, dataRef)
325 qoutputs = taskQuantaOutputs.setdefault(qkey, {})
326 for dsType
in taskDss.outputs:
327 dataRefs = qoutputs.setdefault(dsType, {})
328 dataRef = row.datasetRefs[dsType]
329 dataRefs[_dataRefKey(dataRef)] = dataRef
330 _LOG.debug(
"add output dataRef: %s %s", dsType.name, dataRef)
# Turn the grouped refs into Quantum objects.
334 for qkey
in taskQuantaInputs:
336 _LOG.debug(
"make quantum for qkey: %s", qkey)
337 quantum = Quantum(run=
None, task=
None)
340 outputs = list(chain.from_iterable(dataRefs.values()
341 for dataRefs
in taskQuantaOutputs[qkey].values()))
343 _LOG.debug(
"add output: %s", ref)
# A resolved ref (ref.id is not None) means the output already exists.
344 if self.
skipExisting and all(ref.id
is not None for ref
in outputs):
345 _LOG.debug(
"all output dataRefs already exist, skip quantum")
347 if any(ref.id
is not None for ref
in outputs):
351 quantum.addOutput(ref)
354 for dataRefs
in taskQuantaInputs[qkey].values():
355 for ref
in dataRefs.values():
356 quantum.addPredictedInput(ref)
357 _LOG.debug(
"add input: %s", ref)
359 quanta.append(quantum)
# NOTE(review): extraction artifact — these are the method signatures that
# were detached from their bodies earlier in this file (the exception
# __init__s, GraphBuilder.__init__, _parseUserQuery, _loadTaskClass,
# makeGraph, _makeFullIODatasetTypes, _makeGraph).  They are not additional
# definitions; re-attach each to its body when restoring the file.
def __init__(self, taskName, refs)
def makeGraph(self, pipeline, originInfo, userQuery)
def _makeGraph(self, taskDatasets, inputs, outputs, initInputs, initOutputs, originInfo, userQuery)
def _makeFullIODatasetTypes(self, taskDatasets)
def _parseUserQuery(userQuery)
def _loadTaskClass(self, taskDef)
def __init__(self, taskFactory, registry, skipExisting=True)
def __init__(self, expr, exc)