"""This module defines PipelineTask class and related methods.
"""

__all__ = ["DatasetTypeDescriptor", "PipelineTask"]
from lsst.daf.butler import DatasetType, StorageClassFactory

from .config import (InputDatasetConfig, OutputDatasetConfig,
                     InitInputDatasetConfig, InitOutputDatasetConfig)
from .task import Task


class ScalarError(TypeError):
    """Exception raised when a dataset type is configured as scalar
    but there are multiple DataIds in a Quantum for that dataset.

    Parameters
    ----------
    key : `str`
        Name of the configuration field for the dataset type.
    numDataIds : `int`
        Actual number of DataIds in a Quantum for this dataset type.
    """
    def __init__(self, key, numDataIds):
        super().__init__(("Expected scalar for output dataset field {}, "
                          "received {} DataIds").format(key, numDataIds))


class DatasetTypeDescriptor:
    """Describe DatasetType and its options for PipelineTask.

    This class contains DatasetType and all relevant options that are used by
    PipelineTask. Typically this is derived from configuration classes, but
    sub-classes of PipelineTask can also define additional DatasetTypes that
    are not part of the task configuration.

    Parameters
    ----------
    datasetType : `DatasetType`
    scalar : `bool`
        `True` if this is a scalar dataset.
    """

    def __init__(self, datasetType, scalar):
        self._datasetType = datasetType
        self._scalar = scalar

    @classmethod
    def fromConfig(cls, datasetConfig):
        """Make DatasetTypeDescriptor instance from configuration object.

        Parameters
        ----------
        datasetConfig : `lsst.pex.config.Config`
            Instance of one of the `InputDatasetConfig`, `OutputDatasetConfig`,
            `InitInputDatasetConfig`, or `InitOutputDatasetConfig` types.

        Returns
        -------
        descriptor : `DatasetTypeDescriptor`
        """
        storageClass = StorageClassFactory().getStorageClass(datasetConfig.storageClass)
        datasetType = DatasetType(name=datasetConfig.name,
                                  dataUnits=datasetConfig.units,
                                  storageClass=storageClass)
        # not every dataset config class defines ``scalar``; default to True
        # when the field is absent
        scalar = getattr(datasetConfig, 'scalar', True)
        return cls(datasetType=datasetType, scalar=scalar)
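
    # Illustrative sketch of how `fromConfig` is used; the field values below
    # ("calexp", the unit names, "ExposureF") are hypothetical, not part of
    # this module:
    #
    #     config = InputDatasetConfig()
    #     config.name = "calexp"
    #     config.units = ["Visit", "Detector"]
    #     config.storageClass = "ExposureF"
    #     config.scalar = True
    #     descriptor = DatasetTypeDescriptor.fromConfig(config)
    #     descriptor.datasetType.name    # -> "calexp"
    #     descriptor.scalar              # -> True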

    @property
    def datasetType(self):
        """`DatasetType` instance.
        """
        return self._datasetType

    @property
    def scalar(self):
        """`True` if this is a scalar dataset.
        """
        return self._scalar


class PipelineTask(Task):
    """Base class for all pipeline tasks.

    This is an abstract base class for PipelineTasks which represents an
    algorithm executed by framework(s) on data which comes from a data butler;
    resulting data is also stored in a data butler.

    PipelineTask inherits from a `pipe.base.Task` and uses the same
    configuration mechanism based on `pex.config`. A PipelineTask sub-class
    typically implements the `run()` method which receives Python-domain data
    objects and returns a `pipe.base.Struct` object with resulting data.
    The `run()` method is not supposed to perform any I/O; it operates entirely
    on in-memory objects. `runQuantum()` is the method (which can be
    re-implemented in a sub-class) where all necessary I/O is performed: it
    reads all input data from the data butler into memory, calls the `run()`
    method with that data, examines the returned `Struct` object and saves some
    or all of that data back to the data butler. The `runQuantum()` method
    receives a `daf.butler.Quantum` instance which defines all input and output
    datasets for a single invocation of PipelineTask.

    Subclasses must be constructable with exactly the arguments taken by the
    PipelineTask base class constructor, but may support other signatures as
    well; a minimal example subclass is sketched in a comment after
    `__init__` below.

    Attributes
    ----------
    canMultiprocess : bool, True by default (class attribute)
        This class attribute is checked by the execution framework; sub-classes
        can set it to ``False`` in case the task does not support
        multiprocessing.

    Parameters
    ----------
    config : `pex.config.Config`, optional
        Configuration for this task (an instance of ``self.ConfigClass``,
        which is a task-specific subclass of `PipelineTaskConfig`).
        If not specified then it defaults to `self.ConfigClass()`.
    log : `lsst.log.Log`, optional
        Logger instance whose name is used as a log name prefix, or ``None``
        for no prefix.
    initInputs : `dict`, optional
        A dictionary of objects needed to construct this PipelineTask, with
        keys matching the keys of the dictionary returned by
        `getInitInputDatasetTypes` and values equivalent to what would be
        obtained by calling `Butler.get` with those DatasetTypes and no data
        IDs. While it is optional for the base class, subclasses are
        permitted to require this argument.
    """

    canMultiprocess = True

    def __init__(self, *, config=None, log=None, initInputs=None, **kwargs):
        super().__init__(config=config, log=log, **kwargs)
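
    # A minimal illustrative sketch of a concrete subclass, relying on the
    # default `adaptArgsAndRun` so that `run` receives input data objects as
    # keyword arguments and returns a `Struct` whose attribute names match the
    # output config fields. ``ExampleTaskConfig``, ``calexp``, ``coadd`` and
    # ``doSomething`` are hypothetical names, not defined in this module:
    #
    #     class ExampleTask(PipelineTask):
    #         ConfigClass = ExampleTaskConfig
    #         _DefaultName = "exampleTask"
    #
    #         def run(self, calexp):
    #             # "calexp" (scalar input) and "coadd" (scalar output) are
    #             # names of config fields
    #             coadd = doSomething(calexp)
    #             return Struct(coadd=coadd)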

    def getInitOutputDatasets(self):
        """Return persistable outputs that are available immediately after
        the task has been constructed.

        Subclasses that operate on catalogs should override this method to
        return the schema(s) of the catalog(s) they produce.

        It is not necessary to return the PipelineTask's configuration or
        other provenance information in order for it to be persisted; that is
        the responsibility of the execution system.

        Returns
        -------
        datasets : `dict`
            Dictionary with keys that match those of the dict returned by
            `getInitOutputDatasetTypes` and values that can be written by
            calling `Butler.put` with those DatasetTypes and no data IDs. An
            empty `dict` should be returned by tasks that produce no
            initialization outputs.
        """
        return {}

    @classmethod
    def getInputDatasetTypes(cls, config):
        """Return input dataset type descriptors for this task.

        Default implementation finds all fields of type `InputDatasetConfig`
        in configuration (non-recursively) and uses them for constructing
        `DatasetTypeDescriptor` instances. The names of these fields are used
        as keys in the returned dictionary. Subclasses can override this
        behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        descriptors : `dict`
            Dictionary where key is the name (arbitrary) of the input dataset
            and value is the `DatasetTypeDescriptor` instance. Default
            implementation uses configuration field name as dictionary key.
        """
        return cls.getDatasetTypes(config, InputDatasetConfig)

    @classmethod
    def getOutputDatasetTypes(cls, config):
        """Return output dataset type descriptors for this task.

        Default implementation finds all fields of type `OutputDatasetConfig`
        in configuration (non-recursively) and uses them for constructing
        `DatasetTypeDescriptor` instances. The names of these fields are used
        as keys in the returned dictionary. Subclasses can override this
        behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        descriptors : `dict`
            Dictionary where key is the name (arbitrary) of the output dataset
            and value is the `DatasetTypeDescriptor` instance. Default
            implementation uses configuration field name as dictionary key.
        """
        return cls.getDatasetTypes(config, OutputDatasetConfig)

    @classmethod
    def getInitInputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to retrieve the
        ``initInputs`` constructor argument.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitInputDatasetConfig` in configuration (non-recursively) and
        uses them for constructing `DatasetTypeDescriptor` instances. The
        names of these fields are used as keys in the returned dictionary.
        Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        descriptors : `dict`
            Dictionary where key is the name (arbitrary) of the input dataset
            and value is the `DatasetTypeDescriptor` instance. Default
            implementation uses configuration field name as dictionary key.

            When the task requires no initialization inputs, should return an
            empty `dict`.
        """
        return cls.getDatasetTypes(config, InitInputDatasetConfig)

    @classmethod
    def getInitOutputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to write the
        objects returned by `getInitOutputDatasets`.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitOutputDatasetConfig` in configuration (non-recursively) and uses
        them for constructing `DatasetTypeDescriptor` instances. The names of
        these fields are used as keys in the returned dictionary. Subclasses
        can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        Returns
        -------
        descriptors : `dict`
            Dictionary where key is the name (arbitrary) of the output dataset
            and value is the `DatasetTypeDescriptor` instance. Default
            implementation uses configuration field name as dictionary key.

            When the task produces no initialization outputs, should return an
            empty `dict`.
        """
        return cls.getDatasetTypes(config, InitOutputDatasetConfig)

    @classmethod
    def getDatasetTypes(cls, config, configClass):
        """Return dataset type descriptors defined in task configuration.

        This method can be used by other methods that need to extract dataset
        types from task configuration (e.g. `getInputDatasetTypes` or
        `getOutputDatasetTypes`).

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        configClass : `type`
            Class of the configuration object which defines dataset type.

        Returns
        -------
        descriptors : `dict`
            Dictionary where key is the name (arbitrary) of the dataset
            and value is the `DatasetTypeDescriptor` instance. Default
            implementation uses configuration field name as dictionary key.
            Returns empty dict if configuration has no fields with the
            specified ``configClass``.
        """
        dsTypes = {}
        for key, value in config.items():
            if isinstance(value, configClass):
                dsTypes[key] = DatasetTypeDescriptor.fromConfig(value)
        return dsTypes
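
    # Illustrative sketch (hypothetical field names): if ``config`` contains an
    # `InputDatasetConfig` sub-config named "calexp" and an `OutputDatasetConfig`
    # sub-config named "coadd", the default implementations above return
    #
    #     PipelineTask.getInputDatasetTypes(config)
    #     # -> {"calexp": DatasetTypeDescriptor(...)}
    #     PipelineTask.getOutputDatasetTypes(config)
    #     # -> {"coadd": DatasetTypeDescriptor(...)}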

    def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
        """Run task algorithm on in-memory data.

        This method is called by `runQuantum` to operate on input in-memory
        data and produce corresponding output in-memory data. It receives
        arguments which are dictionaries with input data and input/output
        DataIds. Many simple tasks do not need to know DataIds, so the default
        implementation of this method calls the `run` method passing input
        data objects as keyword arguments. Most simple tasks will implement
        the `run` method; more complex tasks that need to know about output
        DataIds will override this method instead.

        All three arguments to this method are dictionaries with keys equal
        to the name of the configuration fields for dataset type. If dataset
        type is configured with ``scalar`` field set to ``True`` then it is
        expected that only one dataset appears on input or output for that
        dataset type and dictionary value will be a single data object or
        DataId. Otherwise if ``scalar`` is ``False`` (default) then value
        will be a list (even if only one item is in the list).

        The method returns `Struct` instance with attributes matching the
        configuration fields for output dataset types. Values stored in
        returned struct are single object if ``scalar`` is ``True`` or
        list of objects otherwise. If a task produces more than one object
        for some dataset type then data objects returned in ``struct`` must
        match in count and order corresponding DataIds in ``outputDataIds``.

        Parameters
        ----------
        inputData : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are Python-domain data
            objects (or lists of objects) retrieved from data butler.
        inputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are DataIds (or lists
            of DataIds) that task consumes for corresponding dataset type.
            DataIds are guaranteed to match data objects in ``inputData``.
        outputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are DataIds (or lists
            of DataIds) that task is to produce for corresponding dataset
            type.

        Returns
        -------
        struct : `Struct`
            Standard convention is that this method should return `Struct`
            instance containing all output data. Struct attribute names
            should correspond to the names of the configuration fields
            describing task output dataset types. If something different
            is returned then `saveStruct` method has to be re-implemented.
        """
        return self.run(**inputData)
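
    # Illustrative sketch of overriding this method in a subclass that needs
    # to know output DataIds ("calexp" and "coadd" are hypothetical config
    # field names, ``makeCoadd`` a hypothetical helper):
    #
    #     def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
    #         # both fields configured with scalar=True, so values are single objects
    #         coadd = self.makeCoadd(inputData["calexp"], outputDataIds["coadd"])
    #         return Struct(coadd=coadd)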

    def run(self, **kwargs):
        """Run task algorithm on in-memory data.

        This method should be implemented in a subclass unless the task
        overrides `adaptArgsAndRun` to do something different from its default
        implementation. With the default implementation of `adaptArgsAndRun`
        this method will receive keyword arguments whose names will be the
        same as the names of configuration fields describing input dataset
        types. Argument values will be data objects retrieved from the data
        butler. If a dataset type is configured with ``scalar`` field set to
        ``True`` then the argument value will be a single object, otherwise it
        will be a list of objects.

        If the task needs to know its input or output DataIds then it has to
        override the `adaptArgsAndRun` method instead.

        Returns
        -------
        struct : `Struct`
            See description of `adaptArgsAndRun` method.

        Notes
        -----
        Typical implementation of this method may look like::

            def run(self, input, calib):
                # "input", "calib", and "output" are the names of the config fields

                # Assuming that input/calib datasets are `scalar` they are simple objects,
                # do something with inputs and calibs, produce output image.
                image = self.makeImage(input, calib)

                # If output dataset is `scalar` then return object, not list
                return Struct(output=image)
        """
        raise NotImplementedError("run() is not implemented")

    def runQuantum(self, quantum, butler):
        """Execute PipelineTask algorithm on single quantum of data.

        Typical implementation of this method will use inputs from quantum
        to retrieve Python-domain objects from data butler and call
        `adaptArgsAndRun` method on that data. On return from
        `adaptArgsAndRun` this method will extract data from returned
        `Struct` instance and save that data to butler.

        The `Struct` returned from `adaptArgsAndRun` is expected to contain
        data attributes with the names equal to the names of the
        configuration fields defining output dataset types. The values of
        the data attributes must be data objects corresponding to
        the DataIds of output dataset types. All data objects will be
        saved in butler using DataRefs from Quantum's output dictionary.

        This method does not return anything to the caller; on errors a
        corresponding exception is raised.

        Parameters
        ----------
        quantum : `daf.butler.Quantum`
            Object describing input and output corresponding to this
            invocation of PipelineTask instance.
        butler : `Butler`
            Data butler instance.

        Raises
        ------
        `ScalarError` if a dataset type is configured as scalar but receives
        multiple DataIds in `quantum`. Any exceptions that happen in data
        butler or in `adaptArgsAndRun` method.
        """

        def makeDataRefs(descriptors, refMap):
            """Generate map of DatasetRefs and DataIds.

            Given a map of DatasetTypeDescriptor and a map of Quantum
            DatasetRefs makes maps of DataIds and DatasetRefs.
            For scalar dataset types unpacks DatasetRefs and DataIds.

            Parameters
            ----------
            descriptors : `dict`
                Map of (dataset key, DatasetTypeDescriptor).
            refMap : `dict`
                Map of (dataset type name, DatasetRefs).

            Returns
            -------
            dataIds : `dict`
                Map of (dataset key, DataIds).
            dataRefs : `dict`
                Map of (dataset key, DatasetRefs).

            Raises
            ------
            ScalarError
                Raised if dataset type is configured as scalar but more than
                one DatasetRef exists for it.
            """
            dataIds = {}
            dataRefs = {}
            for key, descriptor in descriptors.items():
                keyDataRefs = refMap[descriptor.datasetType.name]
                keyDataIds = [dataRef.dataId for dataRef in keyDataRefs]
                if descriptor.scalar:
                    # unpack single-item lists
                    if len(keyDataRefs) != 1:
                        raise ScalarError(key, len(keyDataRefs))
                    keyDataRefs = keyDataRefs[0]
                    keyDataIds = keyDataIds[0]
                dataIds[key] = keyDataIds
                dataRefs[key] = keyDataRefs
            return dataIds, dataRefs
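
        # Illustrative note: with a hypothetical scalar "calexp" input field
        # the maps returned by makeDataRefs would look like
        #     dataIds  = {"calexp": <DataId>}
        #     dataRefs = {"calexp": <DatasetRef>}
        # while non-scalar fields map to lists of DataIds/DatasetRefs.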

        # lookup all input dataset types and corresponding DatasetRefs
        descriptors = self.getInputDatasetTypes(self.config)
        inputDataIds, inputDataRefs = makeDataRefs(descriptors, quantum.predictedInputs)

        # retrieve all input data from butler
        inputs = {}
        for key, dataRefs in inputDataRefs.items():
            if isinstance(dataRefs, list):
                inputs[key] = [butler.get(dataRef) for dataRef in dataRefs]
            else:
                inputs[key] = butler.get(dataRefs)

        # lookup all output dataset types and corresponding DatasetRefs
        descriptors = self.getOutputDatasetTypes(self.config)
        outputDataIds, outputDataRefs = makeDataRefs(descriptors, quantum.outputs)

        # call run method with keyword arguments
        struct = self.adaptArgsAndRun(inputs, inputDataIds, outputDataIds)

        # store produced output data
        self.saveStruct(struct, outputDataRefs, butler)

    def saveStruct(self, struct, outputDataRefs, butler):
        """Save data in butler.

        Convention is that struct returned from ``run()`` method has data
        field(s) with the same names as the config fields defining
        output DatasetTypes. Subclasses may override this method to implement
        a different convention for `Struct` content or in case any
        post-processing of data may be needed.

        Parameters
        ----------
        struct : `Struct`
            Data produced by the task packed into `Struct` instance.
        outputDataRefs : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are lists of DataRefs.
            DataRefs must match corresponding data objects in ``struct`` in
            count and order.
        butler : `Butler`
            Data butler instance.
        """
        structDict = struct.getDict()
        descriptors = self.getOutputDatasetTypes(self.config)
        for key in descriptors.keys():
            dataList = structDict[key]
            dataRefs = outputDataRefs[key]
            if not isinstance(dataRefs, list):
                # scalar outputs, wrap them in lists again
                dataRefs = [dataRefs]
                dataList = [dataList]
            # iterate over DataRefs and matching data objects, save each to butler
            for dataRef, data in zip(dataRefs, dataList):
                butler.put(data, dataRef.datasetType.name, dataRef.dataId)

    def getResourceConfig(self):
        """Return resource configuration for this task.

        Returns
        -------
        Object of type `~config.ResourceConfig` or ``None`` if resource
        configuration is not defined for this task.
        """
        return getattr(self.config, "resources", None)
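
# Illustrative sketch of how an execution framework is expected to drive a
# PipelineTask; ``ExampleTask``, ``config``, ``initInputs``, ``quantum`` and
# ``butler`` stand for objects provided elsewhere and are not defined in this
# module:
#
#     task = ExampleTask(config=config, initInputs=initInputs)
#     initOutputs = task.getInitOutputDatasets()   # persisted by the framework
#     task.runQuantum(quantum, butler)             # reads inputs, runs, stores outputs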