22 """This module defines PipelineTask class and related methods. 25 __all__ = [
"DatasetTypeDescriptor",
"PipelineTask"]
from lsst.daf.butler import DatasetType

from .config import (InputDatasetConfig, OutputDatasetConfig,
                     InitInputDatasetConfig, InitOutputDatasetConfig)
from .task import Task
34 """Exception raised when dataset type is configured as scalar 35 but there are multiple DataIds in a Quantum for that dataset. 40 Name of the configuration field for dataset type. 42 Actual number of DataIds in a Quantum for this dataset type. 45 super().
__init__((
"Expected scalar for output dataset field {}, " 46 "received {} DataIds").format(key, numDataIds))
50 """Describe DatasetType and its options for PipelineTask. 52 This class contains DatasetType and all relevant options that are used by 53 PipelineTask. Typically this is derived from configuration classes but 54 sub-classes of PipelineTask can also define additional DatasetTypes that 55 are not part of the task configuration. 59 datasetType : `DatasetType` 61 `True` if this is a scalar dataset. 70 """Make DatasetTypeDescriptor instance from configuration object. 74 datasetConfig : `lsst.pex.config.Config` 75 Instance of one the `InputDatasetConfig`, `OutputDatasetConfig`, 76 `InitInputDatasetConfig`, or `InitOutputDatasetConfig` types 80 descriptor : `DatasetTypeDescriptor` 82 datasetType = DatasetType(name=datasetConfig.name,

        Returns
        -------
        descriptor : `DatasetTypeDescriptor`
        """
        datasetType = DatasetType(name=datasetConfig.name,
                                  dimensions=datasetConfig.dimensions,
                                  storageClass=datasetConfig.storageClass)
        # Configs that lack a `scalar` field are treated as scalar by default.
        scalar = getattr(datasetConfig, 'scalar', True)
        return cls(datasetType=datasetType, scalar=scalar)
91 """`DatasetType` instance. 97 """`True` if this is a scalar dataset. 103 """Base class for all pipeline tasks. 105 This is an abstract base class for PipelineTasks which represents an 106 algorithm executed by framework(s) on data which comes from data butler, 107 resulting data is also stored in a data butler. 109 PipelineTask inherits from a `pipe.base.Task` and uses the same 110 configuration mechanism based on `pex.config`. PipelineTask sub-class 111 typically implements `run()` method which receives Python-domain data 112 objects and returns `pipe.base.Struct` object with resulting data. 113 `run()` method is not supposed to perform any I/O, it operates entirely 114 on in-memory objects. `runQuantum()` is the method (can be re-implemented 115 in sub-class) where all necessary I/O is performed, it reads all input 116 data from data butler into memory, calls `run()` method with that data, 117 examines returned `Struct` object and saves some or all of that data back 118 to data butler. `runQuantum()` method receives `daf.butler.Quantum` 119 instance which defines all input and output datasets for a single 120 invocation of PipelineTask. 122 Subclasses must be constructable with exactly the arguments taken by the 123 PipelineTask base class constructor, but may support other signatures as 128 canMultiprocess : bool, True by default (class attribute) 129 This class attribute is checked by execution framework, sub-classes 130 can set it to ``False`` in case task does not support multiprocessing. 134 config : `pex.config.Config`, optional 135 Configuration for this task (an instance of ``self.ConfigClass``, 136 which is a task-specific subclass of `PipelineTaskConfig`). 137 If not specified then it defaults to `self.ConfigClass()`. 138 log : `lsst.log.Log`, optional 139 Logger instance whose name is used as a log name prefix, or ``None`` 141 initInputs : `dict`, optional 142 A dictionary of objects needed to construct this PipelineTask, with 143 keys matching the keys of the dictionary returned by 144 `getInitInputDatasetTypes` and values equivalent to what would be 145 obtained by calling `Butler.get` with those DatasetTypes and no data 146 IDs. While it is optional for the base class, subclasses are 147 permitted to require this argument. 150 canMultiprocess =

    Attributes
    ----------
    canMultiprocess : bool, True by default (class attribute)
        This class attribute is checked by the execution framework;
        sub-classes can set it to ``False`` if the task does not support
        multiprocessing.

    Parameters
    ----------
    config : `pex.config.Config`, optional
        Configuration for this task (an instance of ``self.ConfigClass``,
        which is a task-specific subclass of `PipelineTaskConfig`).
        If not specified then it defaults to `self.ConfigClass()`.
    log : `lsst.log.Log`, optional
        Logger instance whose name is used as a log name prefix, or ``None``
        for no prefix.
    initInputs : `dict`, optional
        A dictionary of objects needed to construct this PipelineTask, with
        keys matching the keys of the dictionary returned by
        `getInitInputDatasetTypes` and values equivalent to what would be
        obtained by calling `Butler.get` with those DatasetTypes and no data
        IDs. While it is optional for the base class, subclasses are
        permitted to require this argument.
    """

    canMultiprocess = True

    def __init__(self, *, config=None, log=None, initInputs=None, **kwargs):
        super().__init__(config=config, log=log, **kwargs)
156 """Return persistable outputs that are available immediately after 157 the task has been constructed. 159 Subclasses that operate on catalogs should override this method to 160 return the schema(s) of the catalog(s) they produce. 162 It is not necessary to return the PipelineTask's configuration or 163 other provenance information in order for it to be persisted; that is 164 the responsibility of the execution system. 169 Dictionary with keys that match those of the dict returned by 170 `getInitOutputDatasetTypes` values that can be written by calling 171 `Butler.put` with those DatasetTypes and no data IDs. An empty 172 `dict` should be returned by tasks that produce no initialization 179 """Return input dataset type descriptors for this task. 181 Default implementation finds all fields of type `InputDatasetConfig` 182 in configuration (non-recursively) and uses them for constructing 183 `DatasetTypeDescriptor` instances. The names of these fields are used 184 as keys in returned dictionary. Subclasses can override this behavior. 189 Configuration for this task. Typically datasets are defined in 190 a task configuration. 194 Dictionary where key is the name (arbitrary) of the input dataset 195 and value is the `DatasetTypeDescriptor` instance. Default 196 implementation uses configuration field name as dictionary key. 202 """Return output dataset type descriptors for this task. 204 Default implementation finds all fields of type `OutputDatasetConfig` 205 in configuration (non-recursively) and uses them for constructing 206 `DatasetTypeDescriptor` instances. The keys of these fields are used 207 as keys in returned dictionary. Subclasses can override this behavior. 212 Configuration for this task. Typically datasets are defined in 213 a task configuration. 217 Dictionary where key is the name (arbitrary) of the output dataset 218 and value is the `DatasetTypeDescriptor` instance. Default 219 implementation uses configuration field name as dictionary key. 225 """Return dataset type descriptors that can be used to retrieve the 226 ``initInputs`` constructor argument. 228 Datasets used in initialization may not be associated with any 229 Dimension (i.e. their data IDs must be empty dictionaries). 231 Default implementation finds all fields of type 232 `InitInputInputDatasetConfig` in configuration (non-recursively) and 233 uses them for constructing `DatasetTypeDescriptor` instances. The 234 names of these fields are used as keys in returned dictionary. 235 Subclasses can override this behavior. 240 Configuration for this task. Typically datasets are defined in 241 a task configuration. 245 Dictionary where key is the name (arbitrary) of the input dataset 246 and value is the `DatasetTypeDescriptor` instance. Default 247 implementation uses configuration field name as dictionary key. 249 When the task requires no initialization inputs, should return an 256 """Return dataset type descriptors that can be used to write the 257 objects returned by `getOutputDatasets`. 259 Datasets used in initialization may not be associated with any 260 Dimension (i.e. their data IDs must be empty dictionaries). 262 Default implementation finds all fields of type 263 `InitOutputDatasetConfig` in configuration (non-recursively) and uses 264 them for constructing `DatasetTypeDescriptor` instances. The names of 265 these fields are used as keys in returned dictionary. Subclasses can 266 override this behavior. 271 Configuration for this task. Typically datasets are defined in 272 a task configuration. 

        Returns
        -------
        `dict`
            Dictionary where key is the name (arbitrary) of the output dataset
            and value is the `DatasetTypeDescriptor` instance. Default
            implementation uses configuration field name as dictionary key.

            When the task produces no initialization outputs, should return an
            empty `dict`.
        """
        return cls.getDatasetTypes(config, InitOutputDatasetConfig)

    @classmethod
    def getDatasetTypes(cls, config, configClass):
        """Return dataset type descriptors defined in task configuration.

        This method can be used by other methods that need to extract dataset
        types from task configuration (e.g. `getInputDatasetTypes` or
        sub-class methods).

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        configClass : `type`
            Class of the configuration object which defines dataset type.

        Returns
        -------
        `dict`
            Dictionary where key is the name (arbitrary) of the dataset
            and value is the `DatasetTypeDescriptor` instance. Default
            implementation uses configuration field name as dictionary key.
            Returns empty dict if configuration has no fields with the
            specified ``configClass``.
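
            For example, for a config that has a single `InputDatasetConfig`
            field named ``calexp`` (an illustrative name), the call
            ``getDatasetTypes(config, InputDatasetConfig)`` returns a dict
            equivalent to::

                {"calexp": DatasetTypeDescriptor.fromConfig(config.calexp)}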
        """
        dsTypes = {}
        for key, value in config.items():
            if isinstance(value, configClass):
                dsTypes[key] = DatasetTypeDescriptor.fromConfig(value)
        return dsTypes
316 """Run task algorithm on in-memory data. 318 This method is called by `runQuantum` to operate on input in-memory 319 data and produce coressponding output in-memory data. It receives 320 arguments which are dictionaries with input data and input/output 321 DataIds. Many simple tasks do not need to know DataIds so default 322 implementation of this method calls `run` method passing input data 323 objects as keyword arguments. Most simple tasks will implement `run` 324 method, more complex tasks that need to know about output DataIds 325 will override this method instead. 327 All three arguments to this method are dictionaries with keys equal 328 to the name of the configuration fields for dataset type. If dataset 329 type is configured with ``scalar`` fiels set to ``True`` then it is 330 expected that only one dataset appears on input or output for that 331 dataset type and dictionary value will be a single data object or 332 DataId. Otherwise if ``scalar`` is ``False`` (default) then value 333 will be a list (even if only one item is in the list). 335 The method returns `Struct` instance with attributes matching the 336 configuration fields for output dataset types. Values stored in 337 returned struct are single object if ``scalar`` is ``True`` or 338 list of objects otherwise. If tasks produces more than one object 339 for some dataset type then data objects returned in ``struct`` must 340 match in count and order corresponding DataIds in ``outputDataIds``. 345 Dictionary whose keys are the names of the configuration fields 346 describing input dataset types and values are Python-domain data 347 objects (or lists of objects) retrieved from data butler. 348 inputDataIds : `dict` 349 Dictionary whose keys are the names of the configuration fields 350 describing input dataset types and values are DataIds (or lists 351 of DataIds) that task consumes for corresponding dataset type. 352 DataIds are guaranteed to match data objects in ``inputData`` 353 outputDataIds : `dict` 354 Dictionary whose keys are the names of the configuration fields 355 describing output dataset types and values are DataIds (or lists 356 of DataIds) that task is to produce for corresponding dataset 362 Standard convention is that this method should return `Struct` 363 instance containing all output data. Struct attribute names 364 should correspond to the names of the configuration fields 365 describing task output dataset types. If something different 366 is returned then `saveStruct` method has to be re-implemented 369 return self.

        Parameters
        ----------
        inputData : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are Python-domain data
            objects (or lists of objects) retrieved from data butler.
        inputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are DataIds (or lists
            of DataIds) that the task consumes for corresponding dataset
            type. DataIds are guaranteed to match data objects in
            ``inputData``.
        outputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are DataIds (or lists
            of DataIds) that the task is to produce for corresponding dataset
            type.

        Returns
        -------
        struct : `Struct`
            Standard convention is that this method should return a `Struct`
            instance containing all output data. Struct attribute names
            should correspond to the names of the configuration fields
            describing task output dataset types. If something different
            is returned then the `saveStruct` method has to be re-implemented
            accordingly.
        """
        return self.run(**inputData)
372 """Run task algorithm on in-memory data. 374 This method should be implemented in a subclass unless tasks overrides 375 `adaptArgsAndRun` to do something different from its default 376 implementation. With default implementation of `adaptArgsAndRun` this 377 method will receive keyword arguments whose names will be the same as 378 names of configuration fields describing input dataset types. Argument 379 values will be data objects retrieved from data butler. If a dataset 380 type is configured with ``scalar`` field set to ``True`` then argument 381 value will be a single object, otherwise it will be a list of objects. 383 If the task needs to know its input or output DataIds then it has to 384 override `adaptArgsAndRun` method instead. 389 See description of `adaptArgsAndRun` method. 393 Typical implementation of this method may look like:: 395 def run(self, input, calib): 396 # "input", "calib", and "output" are the names of the config fields 398 # Assuming that input/calib datasets are `scalar` they are simple objects, 399 # do something with inputs and calibs, produce output image. 400 image = self.makeImage(input, calib) 402 # If output dataset is `scalar` then return object, not list 403 return Struct(output=image) 406 raise NotImplementedError(
"run() is not implemented")
409 """Execute PipelineTask algorithm on single quantum of data. 411 Typical implementation of this method will use inputs from quantum 412 to retrieve Python-domain objects from data butler and call 413 `adaptArgsAndRun` method on that data. On return from 414 `adaptArgsAndRun` this method will extract data from returned 415 `Struct` instance and save that data to butler. 417 The `Struct` returned from `adaptArgsAndRun` is expected to contain 418 data attributes with the names equal to the names of the 419 configuration fields defining output dataset types. The values of 420 the data attributes must be data objects corresponding to 421 the DataIds of output dataset types. All data objects will be 422 saved in butler using DataRefs from Quantum's output dictionary. 424 This method does not return anything to the caller, on errors 425 corresponding exception is raised. 430 Object describing input and output corresponding to this 431 invocation of PipelineTask instance. 433 Data butler instance. 437 `ScalarError` if a dataset type is configured as scalar but receives 438 multiple DataIds in `quantum`. Any exceptions that happen in data 439 butler or in `adaptArgsAndRun` method. 442 def makeDataRefs(descriptors, refMap):
443 """Generate map of DatasetRefs and DataIds. 445 Given a map of DatasetTypeDescriptor and a map of Quantum 446 DatasetRefs makes maps of DataIds and and DatasetRefs. 447 For scalar dataset types unpacks DatasetRefs and DataIds. 452 Map of (dataset key, DatasetTypeDescriptor). 454 Map of (dataset type name, DatasetRefs). 459 Map of (dataset key, DataIds) 461 Map of (dataset key, DatasetRefs) 466 Raised if dataset type is configured as scalar but more than 467 one DatasetRef exists for it. 471 for key, descriptor
in descriptors.items():
472 keyDataRefs = refMap[descriptor.datasetType.name]
473 keyDataIds = [dataRef.dataId
for dataRef
in keyDataRefs]
474 if descriptor.scalar:
476 if len(keyDataRefs) != 1:
478 keyDataRefs = keyDataRefs[0]
479 keyDataIds = keyDataIds[0]
480 dataIds[key] = keyDataIds
481 dataRefs[key] = keyDataRefs
482 return dataIds, dataRefs

        # lookup input dataset types and DataRefs defined by the quantum
        descriptors = self.getInputDatasetTypes(self.config)
        inputDataIds, inputDataRefs = makeDataRefs(descriptors, quantum.predictedInputs)

        # retrieve all input data objects from butler
        inputs = {}
        for key, dataRefs in inputDataRefs.items():
            if isinstance(dataRefs, list):
                inputs[key] = [butler.get(dataRef) for dataRef in dataRefs]
            else:
                inputs[key] = butler.get(dataRefs)

        # lookup output dataset types and DataRefs defined by the quantum
        descriptors = self.getOutputDatasetTypes(self.config)
        outputDataIds, outputDataRefs = makeDataRefs(descriptors, quantum.outputs)

        # run the algorithm on in-memory data and store the results in butler
        struct = self.adaptArgsAndRun(inputs, inputDataIds, outputDataIds)
        self.saveStruct(struct, outputDataRefs, butler)
508 """Save data in butler. 510 Convention is that struct returned from ``run()`` method has data 511 field(s) with the same names as the config fields defining 512 output DatasetTypes. Subclasses may override this method to implement 513 different convention for `Struct` content or in case any 514 post-processing of data may be needed. 519 Data produced by the task packed into `Struct` instance 520 outputDataRefs : `dict` 521 Dictionary whose keys are the names of the configuration fields 522 describing output dataset types and values are lists of DataRefs. 523 DataRefs must match corresponding data objects in ``struct`` in 526 Data butler instance. 528 structDict = struct.getDict()

        Parameters
        ----------
        struct : `Struct`
            Data produced by the task packed into a `Struct` instance.
        outputDataRefs : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are lists of DataRefs.
            DataRefs must match corresponding data objects in ``struct`` in
            count and order.
        butler : `Butler`
            Data butler instance.
        """
        structDict = struct.getDict()
        descriptors = self.getOutputDatasetTypes(self.config)
        for key in descriptors.keys():
            dataList = structDict[key]
            dataRefs = outputDataRefs[key]
            if not isinstance(dataRefs, list):
                # scalar outputs are stored as single items; make lists again
                dataRefs = [dataRefs]
                dataList = [dataList]
            for dataRef, data in zip(dataRefs, dataList):
                butler.put(data, dataRef.datasetType.name, dataRef.dataId)
542 """Return resource configuration for this task. 546 Object of type `~config.ResourceConfig` or ``None`` if resource 547 configuration is not defined for this task. 549 return getattr(self.
config,
"resources",
None)