22 """Module defining GraphBuilder class and related methods. 25 __all__ = [
'GraphBuilder']
31 from collections
import namedtuple
32 from itertools
import chain
38 from .graph
import QuantumGraphTaskNodes, QuantumGraph
39 from lsst.daf.butler
import Quantum, DatasetRef, DimensionSet
# Module-level logger.  ``__name__.partition(".")[2]`` drops everything up to
# and including the first dot, so the logger name omits the top-level package
# component (e.g. "lsst.pipe.base.x" -> "pipe.base.x").
_LOG = logging.getLogger(__name__.partition(".")[2])
58 _TaskDatasetTypes = namedtuple(
"_TaskDatasetTypes", (
"taskDef",
"inputs",
"outputs",
59 "initInputs",
"initOutputs",
60 "perDatasetTypeDimensions",
"prerequisite"))
64 """Base class for exceptions generated by graph builder. 70 """Exception generated when output datasets already exist. 74 refs =
', '.join(str(ref)
for ref
in refs)
75 msg =
"Output datasets already exist for task {}: {}".format(taskName, refs)
76 GraphBuilderError.__init__(self, msg)
80 """Exception generated when a prerequisite dataset does not exist. 85 class GraphBuilder(object):
87 GraphBuilder class is responsible for building task execution graph from 92 taskFactory : `TaskFactory` 93 Factory object used to load/instantiate PipelineTasks 94 registry : `~lsst.daf.butler.Registry` 96 skipExisting : `bool`, optional 97 If ``True`` (default) then Quantum is not created if all its outputs 98 already exist, otherwise exception is raised. 101 def __init__(self, taskFactory, registry, skipExisting=True):
107 def _loadTaskClass(self, taskDef):
108 """Make sure task class is loaded. 110 Load task class, update task name to make sure it is fully-qualified, 111 do not update original taskDef in a Pipeline though. 119 `TaskDef` instance, may be the same as parameter if task class is 122 if taskDef.taskClass
is None:
123 tClass, tName = self.
taskFactory.loadTaskClass(taskDef.taskName)
124 taskDef = copy.copy(taskDef)
125 taskDef.taskClass = tClass
126 taskDef.taskName = tName
130 """Create execution graph for a pipeline. 134 pipeline : `Pipeline` 135 Pipeline definition, task names/classes and their configs. 136 originInfo : `~lsst.daf.butler.DatasetOriginInfo` 137 Object which provides names of the input/output collections. 139 String which defunes user-defined selection for registry, should be 140 empty or `None` if there is no restrictions on data selection. 144 graph : `QuantumGraph` 149 Raised when user expression cannot be parsed. 151 Raised when output datasets already exist. 153 Other exceptions types may be raised by underlying registry 162 for taskDef
in taskList:
163 taskClass = taskDef.taskClass
164 inputs = {k: v.makeDatasetType(self.
registry.dimensions)
165 for k, v
in taskClass.getInputDatasetTypes(taskDef.config).items()}
166 prerequisite = set(inputs[k]
for k
in taskClass.getPrerequisiteDatasetTypes(taskDef.config))
167 taskIo = [inputs.values()]
168 for attr
in (
"Output",
"InitInput",
"InitOutput"):
169 getter = getattr(taskClass, f
"get{attr}DatasetTypes")
170 ioObject = getter(taskDef.config)
or {}
171 taskIo.append(set(dsTypeDescr.makeDatasetType(self.
registry.dimensions)
172 for dsTypeDescr
in ioObject.values()))
173 perDatasetTypeDimensions = DimensionSet(self.
registry.dimensions,
174 taskClass.getPerDatasetTypeDimensions(taskDef.config))
175 taskDatasets.append(_TaskDatasetTypes(taskDef, *taskIo, prerequisite=prerequisite,
176 perDatasetTypeDimensions=perDatasetTypeDimensions))
184 return self.
_makeGraph(taskDatasets, required, optional, prerequisite, initInputs, initOutputs,
185 originInfo, userQuery, perDatasetTypeDimensions=perDatasetTypeDimensions)
187 def _extractPerDatasetTypeDimensions(self, taskDatasets):
188 """Return the complete set of all per-DatasetType dimensions declared 191 Per-DatasetType dimensions are those that need not have the same values 192 for different Datasets within a Quantum. 196 taskDatasets : sequence of `_TaskDatasetTypes` 197 Information for each task in the pipeline. 201 perDatasetTypeDimensions : `~lsst.daf.butler.DimensionSet` 202 All per-DatasetType dimensions. 207 Raised if tasks disagree on whether a dimension is declared 212 noDimensions = DimensionSet(self.
registry.dimensions, ())
215 perDatasetTypeDimensions = noDimensions.union(
216 *[taskDs.perDatasetTypeDimensions
for taskDs
in taskDatasets]
220 for taskDs
in taskDatasets:
221 allTaskDimensions = noDimensions.union(
222 *[datasetType.dimensions
for datasetType
in chain(taskDs.inputs, taskDs.outputs)]
224 commonTaskDimensions = allTaskDimensions - taskDs.perDatasetTypeDimensions
225 if not commonTaskDimensions.isdisjoint(perDatasetTypeDimensions):
226 overlap = commonTaskDimensions.intersections(perDatasetTypeDimensions)
228 f
"Task {taskDs.taskDef.taskName} uses dimensions {overlap} without declaring them " 229 f
"per-DatasetType, but they are declared per-DatasetType by another task." 231 return perDatasetTypeDimensions
233 def _makeFullIODatasetTypes(self, taskDatasets):
234 """Returns full set of input and output dataset types for all tasks. 238 taskDatasets : sequence of `_TaskDatasetTypes` 239 Tasks with their inputs, outputs, initInputs and initOutputs. 243 required : `set` of `~lsst.daf.butler.DatasetType` 244 Datasets that must exist in the repository in order to generate 245 a QuantumGraph node that consumes them. 246 optional : `set` of `~lsst.daf.butler.DatasetType` 247 Datasets that will be produced by the graph, but may exist in the 248 repository. If ``self.skipExisting`` is `True` and all outputs of 249 a particular node already exist, it will be skipped. Otherwise 250 pre-existing datasets of these types will cause 251 `OutputExistsError` to be raised. 252 prerequisite : `set` of `~lsst.daf.butler.DatasetType` 253 Datasets that must exist in the repository, but whose absence 254 should cause `PrerequisiteMissingError` to be raised if they 255 are needed by any graph node that would otherwise be created. 256 initInputs : `set` of `~lsst.daf.butler.DatasetType` 257 Datasets used as init method inputs by the pipeline. 258 initOutputs : `set` of `~lsst.daf.butler.DatasetType` 259 Datasets used as init method outputs by the pipeline. 269 for taskDs
in taskDatasets:
270 for ioType, ioSet
in zip((
"inputs",
"outputs",
"prerequisite",
"initInputs",
"initOutputs"),
271 (required, optional, prerequisite, initInputs, initOutputs)):
272 for dsType
in getattr(taskDs, ioType):
273 ioSet.add(dsType.name)
274 allDatasetTypes[dsType.name] = dsType
278 prerequisite -= optional
281 initInputs -= initOutputs
283 required = set(allDatasetTypes[name]
for name
in required)
284 optional = set(allDatasetTypes[name]
for name
in optional)
285 prerequisite = set(allDatasetTypes[name]
for name
in prerequisite)
286 initInputs = set(allDatasetTypes[name]
for name
in initInputs)
287 initOutputs = set(allDatasetTypes[name]
for name
in initOutputs)
288 return required, optional, prerequisite, initInputs, initOutputs
    def _makeGraph(self, taskDatasets, required, optional, prerequisite,
                   initInputs, initOutputs, originInfo, userQuery,
                   perDatasetTypeDimensions=()):
        """Make QuantumGraph instance.

        Parameters
        ----------
        taskDatasets : sequence of `_TaskDatasetTypes`
            Tasks with their inputs and outputs.
        required : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that must exist in the repository in order to generate
            a QuantumGraph node that consumes them.
        optional : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that will be produced by the graph, but may exist in
            the repository.  If ``self.skipExisting`` and all outputs of a
            particular node already exist, it will be skipped.  Otherwise
            pre-existing datasets of these types will cause
            `OutputExistsError` to be raised.
        prerequisite : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that must exist in the repository, but whose absence
            should cause `PrerequisiteMissingError` to be raised if they
            are needed by any graph node that would otherwise be created.
        initInputs : `set` of `DatasetType`
            Datasets which should exist in input repository, and will be used
            in task initialization.
        initOutputs : `set` of `DatasetType`
            Datasets which will be created in task initialization.
        originInfo : `DatasetOriginInfo`
            Object which provides names of the input/output collections.
        userQuery : `str`
            String which defines user-defined selection for registry, should
            be empty or `None` if there is no restrictions on data selection.
        perDatasetTypeDimensions : iterable of `Dimension` or `str`
            Dimensions (or names thereof) that may have different values for
            different dataset types within the same quantum.

        Returns
        -------
        `QuantumGraph` instance.
        """
        # Single registry query covering every dataset type at once.
        rows = self.registry.selectMultipleDatasetTypes(
            originInfo, userQuery,
            required=required, optional=optional, prerequisite=prerequisite,
            perDatasetTypeDimensions=perDatasetTypeDimensions
        # NOTE(review): the extraction dropped the lines that close this call
        # and begin iterating over ``rows`` (presumably inside a ``try``,
        # given the ``except`` below) accumulating them in ``dimensionVerse``.
        # Reconstruct from VCS history.
                _LOG.debug("row: %s", row)
                dimensionVerse.append(row)
        except LookupError as err:
        # NOTE(review): handler body missing from the extracted text;
        # presumably re-raised as `PrerequisiteMissingError` -- confirm.  The
        # construction of ``qgraph`` (a `QuantumGraph`) is also missing here.
        qgraph._inputDatasetTypes = (required | prerequisite)
        qgraph._outputDatasetTypes = optional
        # Resolve init-input datasets against the input collections.
        for dsType in initInputs:
            for collection in originInfo.getInputCollections(dsType.name):
                result = self.registry.find(collection, dsType)
                if result is not None:
                    qgraph.initInputs.append(result)
        # NOTE(review): original lines 355-358 are missing here (likely
        # loop-exit / error handling for an unresolved initInput).
        for dsType in initOutputs:
            # Init outputs do not exist yet, hence the empty dataId.
            qgraph.initOutputs.append(DatasetRef(dsType, {}))

        for taskDss in taskDatasets:
            # Refs grouped by this task's quantum data ID; keys are tuples
            # of (dimension link name, value) built below as ``qkey``.
            taskQuantaInputs = {}
            taskQuantaOutputs = {}
            for dimensionName in taskDss.taskDef.config.quantum.dimensions:
        # NOTE(review): the statement binding ``dimension`` (presumably a
        # registry dimension lookup) and the initialization of ``qlinks``
        # are missing from the extracted text.
                qlinks += dimension.links()
            _LOG.debug("task %s qdimensions: %s", taskDss.taskDef.label, qlinks)

            for row in dimensionVerse:
                qkey = tuple((col, row.dataId[col]) for col in qlinks)
                _LOG.debug("qkey: %s", qkey)

                def _datasetRefKey(datasetRef):
                    # Hashable identity for a ref: its sorted dataId items.
                    return tuple(sorted(datasetRef.dataId.items()))

                qinputs = taskQuantaInputs.setdefault(qkey, {})
                for dsType in taskDss.inputs:
                    datasetRefs = qinputs.setdefault(dsType, {})
                    datasetRef = row.datasetRefs[dsType]
                    # Keyed by dataId to de-duplicate refs repeated in rows.
                    datasetRefs[_datasetRefKey(datasetRef)] = datasetRef
                    _LOG.debug("add input datasetRef: %s %s", dsType.name, datasetRef)

                qoutputs = taskQuantaOutputs.setdefault(qkey, {})
                for dsType in taskDss.outputs:
                    datasetRefs = qoutputs.setdefault(dsType, {})
                    datasetRef = row.datasetRefs[dsType]
                    datasetRefs[_datasetRefKey(datasetRef)] = datasetRef
                    _LOG.debug("add output datasetRef: %s %s", dsType.name, datasetRef)

            # Build one Quantum per distinct quantum data ID.
            for qkey in taskQuantaInputs:
                _LOG.debug("make quantum for qkey: %s", qkey)
                quantum = Quantum(run=None, task=None)

                outputs = list(chain.from_iterable(datasetRefs.values()
                                                   for datasetRefs in taskQuantaOutputs[qkey].values()))
                # NOTE(review): a loop header over ``outputs`` is missing
                # before this debug call in the extracted text.
                    _LOG.debug("add output: %s", ref)
                # A non-None ref.id indicates the dataset already exists.
                if self.skipExisting and all(ref.id is not None for ref in outputs):
                    _LOG.debug("all output datasetRefs already exist, skip quantum")
                # NOTE(review): the statement ending this branch (likely
                # ``continue``) is missing from the extracted text.
                if any(ref.id is not None for ref in outputs):
                # NOTE(review): branch body missing; per the docstring this
                # is where `OutputExistsError` would be raised.  A loop
                # header over ``outputs`` is also missing before addOutput.
                    quantum.addOutput(ref)

                for datasetRefs in taskQuantaInputs[qkey].values():
                    for ref in datasetRefs.values():
                        quantum.addPredictedInput(ref)
                        _LOG.debug("add input: %s", ref)

                quanta.append(quantum)
        # NOTE(review): initialization of ``quanta`` and the method's tail
        # (appending per-task nodes to ``qgraph`` and returning it) are
        # missing from the extracted text -- reconstruct from VCS history.
def __init__(self, taskName, refs)
def makeGraph(self, pipeline, originInfo, userQuery)
def _makeFullIODatasetTypes(self, taskDatasets)
def _makeGraph(self, taskDatasets, required, optional, prerequisite, initInputs, initOutputs, originInfo, userQuery, perDatasetTypeDimensions=())
def _loadTaskClass(self, taskDef)
def __init__(self, taskFactory, registry, skipExisting=True)
def _extractPerDatasetTypeDimensions(self, taskDatasets)