22 """Module defining few methods to manipulate or query pipelines. 26 __all__ = [
"isPipelineOrdered",
"orderPipeline"]
35 from .pipeline
import Pipeline
42 def _loadTaskClass(taskDef, taskFactory):
43 """Import task class if necessary. 47 `ImportError` is raised when task class cannot be imported. 48 `MissingTaskFactoryError` is raised when TaskFactory is needed but not provided. 50 taskClass = taskDef.taskClass
54 "factory instance is not provided")
55 taskClass = taskFactory.loadTaskClass(taskDef.taskName)
64 """Exception raised when client fails to provide TaskFactory instance. 69 class DuplicateOutputError(Exception):
70 """Exception raised when Pipeline has more than one task for the same 77 """Exception raised when Pipeline has data dependency cycle. 83 """Checks whether tasks in pipeline are correctly ordered. 85 Pipeline is correctly ordered if for any DatasetType produced by a task 86 in a pipeline all its consumer tasks are located after producer. 90 pipeline : `pipe.base.Pipeline` 92 taskFactory: `pipe.base.TaskFactory`, optional 93 Instance of an object which knows how to import task classes. It is only 94 used if pipeline task definitions do not define task classes. 98 True for correctly ordered pipeline, False otherwise. 102 `ImportError` is raised when task class cannot be imported. 103 `DuplicateOutputError` is raised when there is more than one producer for a 105 `MissingTaskFactoryError` is raised when TaskFactory is needed but not 110 for idx, taskDef
in enumerate(pipeline):
113 taskDef.taskClass = _loadTaskClass(taskDef, taskFactory)
116 outputs = taskDef.taskClass.getOutputDatasetTypes(taskDef.config)
117 for dsTypeDescr
in outputs.values():
118 if dsTypeDescr.name
in producerIndex:
119 raise DuplicateOutputError(
"DatasetType `{}' appears more than " 120 "once as output".format(dsTypeDescr.name))
121 producerIndex[dsTypeDescr.name] = idx
124 for idx, taskDef
in enumerate(pipeline):
127 inputs = taskDef.taskClass.getInputDatasetTypes(taskDef.config)
128 for dsTypeDescr
in inputs.values():
130 prodIdx = producerIndex.get(dsTypeDescr.name, -1)
139 """Re-order tasks in pipeline to satisfy data dependencies. 141 When possible new ordering keeps original relative order of the tasks. 145 pipeline : `pipe.base.Pipeline` 146 Pipeline description. 147 taskFactory: `pipe.base.TaskFactory`, optional 148 Instance of an object which knows how to import task classes. It is only 149 used if pipeline task definitions do not define task classes. 153 Correctly ordered pipeline (`pipe.base.Pipeline` instance). 157 `ImportError` is raised when task class cannot be imported. 158 `DuplicateOutputError` is raised when there is more than one producer for a 160 `PipelineDataCycleError` is also raised when pipeline has dependency cycles. 161 `MissingTaskFactoryError` is raised when TaskFactory is needed but not 172 for idx, taskDef
in enumerate(pipeline):
175 taskClass = _loadTaskClass(taskDef, taskFactory)
178 dsMap = taskClass.getOutputDatasetTypes(taskDef.config)
179 for dsTypeDescr
in dsMap.values():
180 if dsTypeDescr.name
in allOutputs:
182 "once as output".format(dsTypeDescr.name))
183 outputs[idx] = set(dsTypeDescr.name
for dsTypeDescr
in dsMap.values())
184 allOutputs.update(outputs[idx])
187 dsMap = taskClass.getInputDatasetTypes(taskDef.config)
188 inputs[idx] = set(dsTypeDescr.name
for dsTypeDescr
in dsMap.values())
189 allInputs.update(inputs[idx])
193 preExisting = allInputs - allOutputs
194 outputs[-1] = preExisting
207 thisTaskOutputs = outputs.get(idx, set())
208 for taskInputs
in inputs.values():
209 taskInputs -= thisTaskOutputs
212 topNodes = [key
for key, value
in inputs.items()
if not value]
224 for idx, inputNames
in inputs.items():
225 taskName = pipeline[idx].label
226 outputNames = outputs[idx]
227 edge =
" {} -> {} -> {}".format(inputNames, taskName, outputNames)
231 return Pipeline(pipeline[idx]
for idx
in result)