22 """This module defines PipelineTask class and related methods. 25 __all__ = [
"PipelineTask"]
from lsst.daf.butler import DatasetType, StorageClassFactory

from .config import (InputDatasetConfig, OutputDatasetConfig,
                     InitInputDatasetConfig, InitOutputDatasetConfig)
from .task import Task
34 """Exception raised when dataset type is configured as scalar 35 but there are multiple DataIds in a Quantum for that dataset. 40 Name of the configuration field for dataset type. 42 Actual number of DataIds in a Quantum for this dataset type. 45 super().
__init__((
"Expected scalar for output dataset field {}, " 46 "received {} DataIds").format(key, numDataIds))
50 """Base class for all pipeline tasks. 52 This is an abstract base class for PipelineTasks which represents an 53 algorithm executed by framework(s) on data which comes from data butler, 54 resulting data is also stored in a data butler. 56 PipelineTask inherits from a `pipe.base.Task` and uses the same 57 configuration mechanism based on `pex.config`. PipelineTask sub-class 58 typically implements `run()` method which receives Python-domain data 59 objects and returns `pipe.base.Struct` object with resulting data. 60 `run()` method is not supposed to perform any I/O, it operates entirely 61 on in-memory objects. `runQuantum()` is the method (can be re-implemented 62 in sub-class) where all necessary I/O is performed, it reads all input 63 data from data butler into memory, calls `run()` method with that data, 64 examines returned `Struct` object and saves some or all of that data back 65 to data butler. `runQuantum()` method receives `daf.butler.Quantum` 66 instance which defines all input and output datasets for a single 67 invocation of PipelineTask. 69 Subclasses must be constructable with exactly the arguments taken by the 70 PipelineTask base class constructor, but may support other signatures as 75 canMultiprocess : bool, True by default (class attribute) 76 This class attribute is checked by execution framework, sub-classes 77 can set it to ``False`` in case task does not support multiprocessing. 81 config : `pex.config.Config`, optional 82 Configuration for this task (an instance of ``self.ConfigClass``, 83 which is a task-specific subclass of `PipelineTaskConfig`). 84 If not specified then it defaults to `self.ConfigClass()`. 85 log : `lsst.log.Log`, optional 86 Logger instance whose name is used as a log name prefix, or ``None`` 88 initInputs : `dict`, optional 89 A dictionary of objects needed to construct this PipelineTask, with 90 keys matching the keys of the dictionary returned by 91 `getInitInputDatasetTypes` and values equivalent to what would be 92 obtained by calling `Butler.get` with those DatasetTypes and no data 93 IDs. While it is optional for the base class, subclasses are 94 permitted to require this argument. 97 canMultiprocess =
    """

    canMultiprocess = True

    def __init__(self, *, config=None, log=None, initInputs=None, **kwargs):
        super().__init__(config=config, log=log, **kwargs)
103 """Return persistable outputs that are available immediately after 104 the task has been constructed. 106 Subclasses that operate on catalogs should override this method to 107 return the schema(s) of the catalog(s) they produce. 109 It is not necessary to return the PipelineTask's configuration or 110 other provenance information in order for it to be persisted; that is 111 the responsibility of the execution system. 116 Dictionary with keys that match those of the dict returned by 117 `getInitOutputDatasetTypes` values that can be written by calling 118 `Butler.put` with those DatasetTypes and no data IDs. An empty 119 `dict` should be returned by tasks that produce no initialization 126 """Return input dataset types for this task. 128 Default implementation finds all fields of type `InputDatasetConfig` 129 in configuration (non-recursively) and uses them for constructing 130 `DatasetType` instances. The keys of these fields are used as keys 131 in returned dictionary. Subclasses can override this behavior. 136 Configuration for this task. Typically datasets are defined in 137 a task configuration. 141 Dictionary where key is the name (arbitrary) of the input dataset 142 and value is the `butler.DatasetType` instance. Default 143 implementation uses configuration field name as dictionary key. 149 """Return output dataset types for this task. 151 Default implementation finds all fields of type `OutputDatasetConfig` 152 in configuration (non-recursively) and uses them for constructing 153 `DatasetType` instances. The keys of these fields are used as keys 154 in returned dictionary. Subclasses can override this behavior. 159 Configuration for this task. Typically datasets are defined in 160 a task configuration. 164 Dictionary where key is the name (arbitrary) of the output dataset 165 and value is the `butler.DatasetType` instance. Default 166 implementation uses configuration field name as dictionary key. 172 """Return dataset types that can be used to retrieve the 173 ``initInputs`` constructor argument. 175 Datasets used in initialization may not be associated with any 176 DataUnits (i.e. their data IDs must be empty dictionaries). 178 Default implementation finds all fields of type 179 `InitInputInputDatasetConfig` in configuration (non-recursively) and 180 uses them for constructing `DatasetType` instances. The keys of these 181 fields are used as keys in returned dictionary. Subclasses can 182 override this behavior. 187 Configuration for this task. Typically datasets are defined in 188 a task configuration. 192 Dictionary where key is the name (arbitrary) of the input dataset 193 and value is the `butler.DatasetType` instance. Default 194 implementation uses configuration field name as dictionary key. 196 When the task requires no initialization inputs, should return an 203 """Return dataset types that can be used to write the objects 204 returned by `getOutputDatasets`. 206 Datasets used in initialization may not be associated with any 207 DataUnits (i.e. their data IDs must be empty dictionaries). 209 Default implementation finds all fields of type 210 `InitOutputDatasetConfig` in configuration (non-recursively) and uses 211 them for constructing `DatasetType` instances. The keys of these 212 fields are used as keys in returned dictionary. Subclasses can 213 override this behavior. 218 Configuration for this task. Typically datasets are defined in 219 a task configuration. 

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the output dataset
        and value is the `butler.DatasetType` instance. Default
        implementation uses configuration field name as dictionary key.

        When the task produces no initialization outputs, should return an
        empty `dict`.
        """
        return cls.getDatasetTypes(config, InitOutputDatasetConfig)

    @classmethod
    def getDatasetTypes(cls, config, configClass):
        """Return dataset types defined in task configuration.

        This method can be used by other methods that need to extract dataset
        types from task configuration (e.g. :py:meth:`getInputDatasetTypes`
        or sub-class methods).

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        configClass : `type`
            Class of the configuration object which defines dataset type.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the dataset
        and value is the `butler.DatasetType` instance. Default
        implementation uses configuration field name as dictionary key.
        If the configuration has no fields of the given ``configClass``,
        an empty `dict` is returned.
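
        For illustration, assuming a config with a single `InputDatasetConfig`
        field named ``input`` whose dataset type name is ``"calexp"`` (both
        names are hypothetical), this method returns a one-entry dictionary
        keyed by the field name::

            types = ExampleTask.getDatasetTypes(config, InputDatasetConfig)
            # types == {"input": DatasetType("calexp", ...)}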
        """
        dsTypes = {}
        for key, value in config.items():
            if isinstance(value, configClass):
                # the configuration field name becomes the key, the field
                # value describes the DatasetType to construct
                dsTypes[key] = cls.makeDatasetType(value)
        return dsTypes
264 """Run task algorithm on in-memory data. 266 This method is called by `runQuantum` to operate on input in-memory 267 data and produce coressponding output in-memory data. It receives 268 arguments which are dictionaries with input data and input/output 269 DataIds. Many simple tasks do not need to know DataIds so default 270 implementation of this method calls `run` method passing input data 271 objects as keyword arguments. Most simple tasks will implement `run` 272 method, more complex tasks that need to know about output DataIds 273 will override this method instead. 275 All three arguments to this method are dictionaries with keys equal 276 to the name of the configuration fields for dataset type. If dataset 277 type is configured with ``scalar`` fiels set to ``True`` then it is 278 expected that only one dataset appears on input or output for that 279 dataset type and dictionary value will be a single data object or 280 DataId. Otherwise if ``scalar`` is ``False`` (default) then value 281 will be a list (even if only one item is in the list). 283 The method returns `Struct` instance with attributes matching the 284 configuration fields for output dataset types. Values stored in 285 returned struct are single object if ``scalar`` is ``True`` or 286 list of objects otherwise. If tasks produces more than one object 287 for some dataset type then data objects returned in ``struct`` must 288 match in count and order corresponding DataIds in ``outputDataIds``. 293 Dictionary whose keys are the names of the configuration fields 294 describing input dataset types and values are Python-domain data 295 objects (or lists of objects) retrieved from data butler. 296 inputDataIds : `dict` 297 Dictionary whose keys are the names of the configuration fields 298 describing input dataset types and values are DataIds (or lists 299 of DataIds) that task consumes for corresponding dataset type. 300 DataIds are guaranteed to match data objects in ``inputData`` 301 outputDataIds : `dict` 302 Dictionary whose keys are the names of the configuration fields 303 describing output dataset types and values are DataIds (or lists 304 of DataIds) that task is to produce for corresponding dataset 310 Standard convention is that this method should return `Struct` 311 instance containing all output data. Struct attribute names 312 should correspond to the names of the configuration fields 313 describing task output dataset types. If something different 314 is returned then `saveStruct` method has to be re-implemented 317 return self.
        """
        return self.run(**inputData)
320 """Run task algorithm on in-memory data. 322 This method should be implemented in a subclass unless tasks overrides 323 `adaptArgsAndRun` to do something different from its default 324 implementation. With default implementation of `adaptArgsAndRun` this 325 method will receive keyword arguments whose names will be the same as 326 names of configuration fields describing input dataset types. Argument 327 values will be data objects retrieved from data butler. If a dataset 328 type is configured with ``scalar`` field set to ``True`` then argument 329 value will be a single object, otherwise it will be a list of objects. 331 If the task needs to know its input or output DataIds then it has to 332 override `adaptArgsAndRun` method instead. 337 See description of `adaptArgsAndRun` method. 341 Typical implementation of this method may look like:: 343 def run(self, input, calib): 344 # "input", "calib", and "output" are the names of the config fields 346 # Assuming that input/calib datasets are `scalar` they are simple objects, 347 # do something with inputs and calibs, produce output image. 348 image = self.makeImage(input, calib) 350 # If output dataset is `scalar` then return object, not list 351 return Struct(output=image) 354 raise NotImplementedError(
"run() is not implemented")
357 """Execute PipelineTask algorithm on single quantum of data. 359 Typical implementation of this method will use inputs from quantum 360 to retrieve Python-domain objects from data butler and call 361 `adaptArgsAndRun` method on that data. On return from 362 `adaptArgsAndRun` this method will extract data from returned 363 `Struct` instance and save that data to butler. 365 The `Struct` returned from `adaptArgsAndRun` is expected to contain 366 data attributes with the names equal to the names of the 367 configuration fields defining output dataset types. The values of 368 the data attributes must be data objects corresponding to 369 the DataIds of output dataset types. All data objects will be 370 saved in butler using DataRefs from Quantum's output dictionary. 372 This method does not return anything to the caller, on errors 373 corresponding exception is raised. 378 Object describing input and output corresponding to this 379 invocation of PipelineTask instance. 381 Data butler instance. 385 `ScalarError` if a dataset type is configured as scalar but receives 386 multiple DataIds in `quantum`. Any exceptions that happen in data 387 butler or in `adaptArgsAndRun` method. 392 for key, value
        """
        # get all data (and DataIds) for inputs from butler, keyed by the
        # config field name
        inputDataIds = {}
        inputData = {}
        for key, value in self.config.items():
            if isinstance(value, InputDatasetConfig):
                dataRefs = quantum.predictedInputs[value.name]
                dataIds = [dataRef.dataId for dataRef in dataRefs]
                data = [butler.get(dataRef) for dataRef in dataRefs]
                if value.scalar:
                    # for scalar datasets unpack the single-item lists
                    if len(dataRefs) != 1:
                        raise ScalarError(key, len(dataRefs))
                    data = data[0]
                    dataIds = dataIds[0]
                inputDataIds[key] = dataIds
                inputData[key] = data

        # DataRefs/DataIds for outputs, keyed by the config field name
        outputDataRefs = {}
        outputDataIds = {}
        for key, value in self.config.items():
            if isinstance(value, OutputDatasetConfig):
                dataRefs = quantum.outputs[value.name]
                dataIds = [dataRef.dataId for dataRef in dataRefs]
                if value.scalar:
                    # for scalar datasets unpack the single-item lists
                    if len(dataRefs) != 1:
                        raise ScalarError(key, len(dataRefs))
                    dataRefs = dataRefs[0]
                    dataIds = dataIds[0]
                outputDataRefs[key] = dataRefs
                outputDataIds[key] = dataIds

        # run the algorithm on in-memory data
        struct = self.adaptArgsAndRun(inputData, inputDataIds, outputDataIds)

        # store produced data in butler
        self.saveStruct(struct, outputDataRefs, butler)
429 """Save data in butler. 431 Convention is that struct returned from ``run()`` method has data 432 field(s) with the same names as the config fields defining 433 output DatasetTypes. Subclasses may override this method to implement 434 different convention for `Struct` content or in case any 435 post-processing of data may be needed. 440 Data produced by the task packed into `Struct` instance 441 outputDataRefs : `dict` 442 Dictionary whose keys are the names of the configuration fields 443 describing output dataset types and values are lists of DataRefs. 444 DataRefs must match corresponding data objects in ``struct`` in 447 Data butler instance. 449 structDict = struct.getDict()
        """
        structDict = struct.getDict()
        for key, value in self.config.items():
            if isinstance(value, OutputDatasetConfig):
                dataList = structDict[key]
                dataRefs = outputDataRefs[key]
                if not isinstance(dataRefs, list):
                    # scalar outputs, turn them into lists again
                    dataRefs = [dataRefs]
                    dataList = [dataList]

                for dataRef, data in zip(dataRefs, dataList):
                    butler.put(data, dataRef.datasetType.name, dataRef.dataId)
464 """Create new instance of the `DatasetType` from task config. 468 dsConfig : `pexConfig.Config` 469 Instance of `InputDatasetConfig`, `OutputDatasetConfig`, 470 `InitInputDatasetConfig`, or `InitOutputDatasetConfig`. 474 `butler.DatasetType` instance. 477 storageClass = StorageClassFactory().getStorageClass(dsConfig.storageClass)
        """
        # map storage class name from config to a StorageClass instance
        storageClass = StorageClassFactory().getStorageClass(dsConfig.storageClass)
        return DatasetType(name=dsConfig.name,
                           dataUnits=dsConfig.units,
                           storageClass=storageClass)
484 """Return resource configuration for this task. 488 Object of type `~config.ResourceConfig` or ``None`` if resource 489 configuration is not defined for this task. 491 return getattr(self.
config,
"resources",
None)