22 """This module defines PipelineTask class and related methods. 25 __all__ = [
"DatasetTypeDescriptor",
"PipelineTask"]
27 from lsst.daf.butler
import DatasetType
28 from .config
import (InputDatasetConfig, OutputDatasetConfig,
29 InitInputDatasetConfig, InitOutputDatasetConfig)
30 from .task
import Task
34 """Exception raised when dataset type is configured as scalar 35 but there are multiple DataIds in a Quantum for that dataset. 40 Name of the configuration field for dataset type. 42 Actual number of DataIds in a Quantum for this dataset type. 45 super().
__init__((
"Expected scalar for output dataset field {}, " 46 "received {} DataIds").format(key, numDataIds))
50 """Description of an unnormalized proto-DatasetType and its relationship to 53 This class contains the information needed to construct a `DatasetType` 54 (once a `DimensionUniverse` is available) and all relevant options that are 55 used by PipelineTask. Typically this is derived from configuration classes, 56 but sub-classes of PipelineTask can also define additional DatasetTypes 57 that are not part of the task configuration. 62 Name of the dataset type. 63 dimensionNames: `~collections.abc.Set` of `str` 64 Names of the dimensions used to identify datasets of this type. 65 storageClassName: `str` 66 Name of the `~lsst.daf.butler.StorageClass` for this dataset type. 68 `True` if this is a scalar dataset. 70 `True` if this dataset will be manually loaded by a concrete 71 `PipelineTask` instead of loaded automatically by the base class. 74 def __init__(self, name, dimensionNames, storageClassName, scalar, manualLoad):
83 """Make DatasetTypeDescriptor instance from configuration object. 87 datasetConfig : `lsst.pex.config.Config` 88 Instance of one the `InputDatasetConfig`, `OutputDatasetConfig`, 89 `InitInputDatasetConfig`, or `InitOutputDatasetConfig` types 93 descriptor : `DatasetTypeDescriptor` 96 scalar = getattr(datasetConfig,
'scalar',
True)
97 manualLoad = getattr(datasetConfig,
'manualLoad',
False)
98 return cls(name=datasetConfig.name, dimensionNames=datasetConfig.dimensions,
99 storageClassName=datasetConfig.storageClass, scalar=scalar, manualLoad=manualLoad)
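
    # Illustrative sketch only: a descriptor is typically built from one of a
    # task's dataset config fields. The field name ``coadd`` and the ``butler``
    # object here are hypothetical, not part of this module.
    #
    #     descriptor = DatasetTypeDescriptor.fromConfig(config.coadd)
    #     datasetType = descriptor.makeDatasetType(butler.registry.dimensions)
    #     print(descriptor.name, descriptor.scalar, descriptor.manualLoad)
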
102 """Construct a true `DatasetType` instance with normalized dimensions. 106 universe : `lsst.daf.butler.DimensionUniverse` 107 Set of all known dimensions to be used to normalize the dimension 108 names specified in config. 112 datasetType : `DatasetType` 113 The `DatasetType` defined by this descriptor. 115 return DatasetType(self.
_name,
121 """Name of the dataset type (`str`). 127 """`True` if this is a scalar dataset. 133 """`True` if the task will handle loading the data 139 """Base class for all pipeline tasks. 141 This is an abstract base class for PipelineTasks which represents an 142 algorithm executed by framework(s) on data which comes from data butler, 143 resulting data is also stored in a data butler. 145 PipelineTask inherits from a `pipe.base.Task` and uses the same 146 configuration mechanism based on `pex.config`. PipelineTask sub-class 147 typically implements `run()` method which receives Python-domain data 148 objects and returns `pipe.base.Struct` object with resulting data. 149 `run()` method is not supposed to perform any I/O, it operates entirely 150 on in-memory objects. `runQuantum()` is the method (can be re-implemented 151 in sub-class) where all necessary I/O is performed, it reads all input 152 data from data butler into memory, calls `run()` method with that data, 153 examines returned `Struct` object and saves some or all of that data back 154 to data butler. `runQuantum()` method receives `daf.butler.Quantum` 155 instance which defines all input and output datasets for a single 156 invocation of PipelineTask. 158 Subclasses must be constructable with exactly the arguments taken by the 159 PipelineTask base class constructor, but may support other signatures as 164 canMultiprocess : bool, True by default (class attribute) 165 This class attribute is checked by execution framework, sub-classes 166 can set it to ``False`` in case task does not support multiprocessing. 170 config : `pex.config.Config`, optional 171 Configuration for this task (an instance of ``self.ConfigClass``, 172 which is a task-specific subclass of `PipelineTaskConfig`). 173 If not specified then it defaults to `self.ConfigClass()`. 174 log : `lsst.log.Log`, optional 175 Logger instance whose name is used as a log name prefix, or ``None`` 177 initInputs : `dict`, optional 178 A dictionary of objects needed to construct this PipelineTask, with 179 keys matching the keys of the dictionary returned by 180 `getInitInputDatasetTypes` and values equivalent to what would be 181 obtained by calling `Butler.get` with those DatasetTypes and no data 182 IDs. While it is optional for the base class, subclasses are 183 permitted to require this argument. 186 canMultiprocess =
True 188 def __init__(self, *, config=None, log=None, initInputs=None, **kwargs):
189 super().
__init__(config=config, log=log, **kwargs)
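
    # The class docstring above describes the usual division of labour: a
    # concrete task implements run() and leaves I/O to runQuantum(). A minimal
    # sketch of such a subclass, assuming a hypothetical config class
    # ``ExampleConfig`` (a `PipelineTaskConfig` subclass) whose input and
    # output dataset fields are named ``calexp`` and ``catalog``:
    #
    #     class ExampleTask(PipelineTask):
    #         ConfigClass = ExampleConfig
    #         _DefaultName = "example"
    #
    #         def run(self, calexp):
    #             # "calexp" is configured as scalar, so it arrives as a single object
    #             catalog = self.detectSources(calexp)  # hypothetical helper
    #             # attribute name matches the output config field name
    #             return Struct(catalog=catalog)
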
192 """Return persistable outputs that are available immediately after 193 the task has been constructed. 195 Subclasses that operate on catalogs should override this method to 196 return the schema(s) of the catalog(s) they produce. 198 It is not necessary to return the PipelineTask's configuration or 199 other provenance information in order for it to be persisted; that is 200 the responsibility of the execution system. 205 Dictionary with keys that match those of the dict returned by 206 `getInitOutputDatasetTypes` values that can be written by calling 207 `Butler.put` with those DatasetTypes and no data IDs. An empty 208 `dict` should be returned by tasks that produce no initialization 215 """Return input dataset type descriptors for this task. 217 Default implementation finds all fields of type `InputDatasetConfig` 218 in configuration (non-recursively) and uses them for constructing 219 `DatasetTypeDescriptor` instances. The names of these fields are used 220 as keys in returned dictionary. Subclasses can override this behavior. 225 Configuration for this task. Typically datasets are defined in 226 a task configuration. 230 Dictionary where key is the name (arbitrary) of the input dataset 231 and value is the `DatasetTypeDescriptor` instance. Default 232 implementation uses configuration field name as dictionary key. 238 """Return output dataset type descriptors for this task. 240 Default implementation finds all fields of type `OutputDatasetConfig` 241 in configuration (non-recursively) and uses them for constructing 242 `DatasetTypeDescriptor` instances. The keys of these fields are used 243 as keys in returned dictionary. Subclasses can override this behavior. 248 Configuration for this task. Typically datasets are defined in 249 a task configuration. 253 Dictionary where key is the name (arbitrary) of the output dataset 254 and value is the `DatasetTypeDescriptor` instance. Default 255 implementation uses configuration field name as dictionary key. 261 """Return the local names of input dataset types that should be 262 assumed to exist instead of constraining what data to process with 265 Usually, when running a `PipelineTask`, the presence of input datasets 266 constrains the processing to be done (as defined by the `QuantumGraph` 267 generated during "preflight"). "Prerequisites" are special input 268 datasets that do not constrain that graph, but instead cause a hard 269 failure when missing. Calibration products and reference catalogs 270 are examples of dataset types that should usually be marked as 276 Configuration for this task. Typically datasets are defined in 277 a task configuration. 281 prerequisite : `~collections.abc.Set` of `str` 282 The keys in the dictionary returned by `getInputDatasetTypes` that 283 represent dataset types that should be considered prerequisites. 284 Names returned here that are not keys in that dictionary are 285 ignored; that way, if a config option removes an input dataset type 286 only `getInputDatasetTypes` needs to be updated. 292 """Return dataset type descriptors that can be used to retrieve the 293 ``initInputs`` constructor argument. 295 Datasets used in initialization may not be associated with any 296 Dimension (i.e. their data IDs must be empty dictionaries). 298 Default implementation finds all fields of type 299 `InitInputInputDatasetConfig` in configuration (non-recursively) and 300 uses them for constructing `DatasetTypeDescriptor` instances. The 301 names of these fields are used as keys in returned dictionary. 
302 Subclasses can override this behavior. 307 Configuration for this task. Typically datasets are defined in 308 a task configuration. 312 Dictionary where key is the name (arbitrary) of the input dataset 313 and value is the `DatasetTypeDescriptor` instance. Default 314 implementation uses configuration field name as dictionary key. 316 When the task requires no initialization inputs, should return an 323 """Return dataset type descriptors that can be used to write the 324 objects returned by `getOutputDatasets`. 326 Datasets used in initialization may not be associated with any 327 Dimension (i.e. their data IDs must be empty dictionaries). 329 Default implementation finds all fields of type 330 `InitOutputDatasetConfig` in configuration (non-recursively) and uses 331 them for constructing `DatasetTypeDescriptor` instances. The names of 332 these fields are used as keys in returned dictionary. Subclasses can 333 override this behavior. 338 Configuration for this task. Typically datasets are defined in 339 a task configuration. 343 Dictionary where key is the name (arbitrary) of the output dataset 344 and value is the `DatasetTypeDescriptor` instance. Default 345 implementation uses configuration field name as dictionary key. 347 When the task produces no initialization outputs, should return an 354 """Return dataset type descriptors defined in task configuration. 356 This method can be used by other methods that need to extract dataset 357 types from task configuration (e.g. `getInputDatasetTypes` or 363 Configuration for this task. Typically datasets are defined in 364 a task configuration. 366 Class of the configuration object which defines dataset type. 370 Dictionary where key is the name (arbitrary) of the output dataset 371 and value is the `DatasetTypeDescriptor` instance. Default 372 implementation uses configuration field name as dictionary key. 373 Returns empty dict if configuration has no fields with the specified 377 for key, value
in config.items():
378 if isinstance(value, configClass):
379 dsTypes[key] = DatasetTypeDescriptor.fromConfig(value)
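
    # A sketch of how the default implementations above relate to each other,
    # assuming a hypothetical task config with two `InputDatasetConfig` fields
    # named ``calexp`` and ``refcat``:
    #
    #     descriptors = ExampleTask.getInputDatasetTypes(config)
    #     # equivalent to ExampleTask.getDatasetTypes(config, InputDatasetConfig)
    #     assert set(descriptors) == {"calexp", "refcat"}
    #     # each value is a DatasetTypeDescriptor built via fromConfig()
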
384 """Return any Dimensions that are permitted to have different values 385 for different DatasetTypes within the same quantum. 390 Configuration for this task. 394 dimensions : `~collections.abc.Set` of `Dimension` or `str` 395 The dimensions or names thereof that should be considered 400 Any Dimension declared to be per-DatasetType by a PipelineTask must 401 also be declared to be per-DatasetType by other PipelineTasks in the 404 The classic example of a per-DatasetType dimension is the 405 ``CalibrationLabel`` dimension that maps to a validity range for 406 master calibrations. When running Instrument Signature Removal, one 407 does not care that different dataset types like flat, bias, and dark 408 have different validity ranges, as long as those validity ranges all 409 overlap the relevant observation. 414 """Run task algorithm on in-memory data. 416 This method is called by `runQuantum` to operate on input in-memory 417 data and produce coressponding output in-memory data. It receives 418 arguments which are dictionaries with input data and input/output 419 DataIds. Many simple tasks do not need to know DataIds so default 420 implementation of this method calls `run` method passing input data 421 objects as keyword arguments. Most simple tasks will implement `run` 422 method, more complex tasks that need to know about output DataIds 423 will override this method instead. 425 All three arguments to this method are dictionaries with keys equal 426 to the name of the configuration fields for dataset type. If dataset 427 type is configured with ``scalar`` fiels set to ``True`` then it is 428 expected that only one dataset appears on input or output for that 429 dataset type and dictionary value will be a single data object or 430 DataId. Otherwise if ``scalar`` is ``False`` (default) then value 431 will be a list (even if only one item is in the list). 433 The method returns `Struct` instance with attributes matching the 434 configuration fields for output dataset types. Values stored in 435 returned struct are single object if ``scalar`` is ``True`` or 436 list of objects otherwise. If tasks produces more than one object 437 for some dataset type then data objects returned in ``struct`` must 438 match in count and order corresponding DataIds in ``outputDataIds``. 443 Dictionary whose keys are the names of the configuration fields 444 describing input dataset types and values are Python-domain data 445 objects (or lists of objects) retrieved from data butler. 446 inputDataIds : `dict` 447 Dictionary whose keys are the names of the configuration fields 448 describing input dataset types and values are DataIds (or lists 449 of DataIds) that task consumes for corresponding dataset type. 450 DataIds are guaranteed to match data objects in ``inputData`` 451 outputDataIds : `dict` 452 Dictionary whose keys are the names of the configuration fields 453 describing output dataset types and values are DataIds (or lists 454 of DataIds) that task is to produce for corresponding dataset 460 Standard convention is that this method should return `Struct` 461 instance containing all output data. Struct attribute names 462 should correspond to the names of the configuration fields 463 describing task output dataset types. If something different 464 is returned then `saveStruct` method has to be re-implemented 467 return self.
run(**inputData)
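
    # Tasks that need the output DataIds (e.g. to attach them to the products
    # they make) override adaptArgsAndRun instead of run. A minimal sketch,
    # assuming a scalar input field ``calexp``, a scalar output field
    # ``catalog``, and a hypothetical helper ``makeCatalog``:
    #
    #     def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds, butler):
    #         catalogId = outputDataIds["catalog"]   # DataId of the output dataset
    #         catalog = self.makeCatalog(inputData["calexp"], catalogId)
    #         return Struct(catalog=catalog)
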
470 """Run task algorithm on in-memory data. 472 This method should be implemented in a subclass unless tasks overrides 473 `adaptArgsAndRun` to do something different from its default 474 implementation. With default implementation of `adaptArgsAndRun` this 475 method will receive keyword arguments whose names will be the same as 476 names of configuration fields describing input dataset types. Argument 477 values will be data objects retrieved from data butler. If a dataset 478 type is configured with ``scalar`` field set to ``True`` then argument 479 value will be a single object, otherwise it will be a list of objects. 481 If the task needs to know its input or output DataIds then it has to 482 override `adaptArgsAndRun` method instead. 487 See description of `adaptArgsAndRun` method. 491 Typical implementation of this method may look like:: 493 def run(self, input, calib): 494 # "input", "calib", and "output" are the names of the config fields 496 # Assuming that input/calib datasets are `scalar` they are simple objects, 497 # do something with inputs and calibs, produce output image. 498 image = self.makeImage(input, calib) 500 # If output dataset is `scalar` then return object, not list 501 return Struct(output=image) 504 raise NotImplementedError(
"run() is not implemented")
507 """Execute PipelineTask algorithm on single quantum of data. 509 Typical implementation of this method will use inputs from quantum 510 to retrieve Python-domain objects from data butler and call 511 `adaptArgsAndRun` method on that data. On return from 512 `adaptArgsAndRun` this method will extract data from returned 513 `Struct` instance and save that data to butler. 515 The `Struct` returned from `adaptArgsAndRun` is expected to contain 516 data attributes with the names equal to the names of the 517 configuration fields defining output dataset types. The values of 518 the data attributes must be data objects corresponding to 519 the DataIds of output dataset types. All data objects will be 520 saved in butler using DataRefs from Quantum's output dictionary. 522 This method does not return anything to the caller, on errors 523 corresponding exception is raised. 528 Object describing input and output corresponding to this 529 invocation of PipelineTask instance. 531 Data butler instance. 535 `ScalarError` if a dataset type is configured as scalar but receives 536 multiple DataIds in `quantum`. Any exceptions that happen in data 537 butler or in `adaptArgsAndRun` method. 540 def makeDataRefs(descriptors, refMap):
541 """Generate map of DatasetRefs and DataIds. 543 Given a map of DatasetTypeDescriptor and a map of Quantum 544 DatasetRefs makes maps of DataIds and and DatasetRefs. 545 For scalar dataset types unpacks DatasetRefs and DataIds. 550 Map of (dataset key, DatasetTypeDescriptor). 552 Map of (dataset type name, DatasetRefs). 557 Map of (dataset key, DataIds) 559 Map of (dataset key, DatasetRefs) 564 Raised if dataset type is configured as scalar but more than 565 one DatasetRef exists for it. 569 for key, descriptor
in descriptors.items():
570 datasetType = descriptor.makeDatasetType(butler.registry.dimensions)
571 keyDataRefs = refMap[datasetType.name]
572 keyDataIds = [dataRef.dataId
for dataRef
in keyDataRefs]
573 if descriptor.scalar:
575 if len(keyDataRefs) != 1:
577 keyDataRefs = keyDataRefs[0]
578 keyDataIds = keyDataIds[0]
579 dataIds[key] = keyDataIds
580 if not descriptor.manualLoad:
581 dataRefs[key] = keyDataRefs
582 return dataIds, dataRefs
        # lookup all input dataset types and their DataIds/DataRefs
        descriptors = self.getInputDatasetTypes(self.config)
        inputDataIds, inputDataRefs = makeDataRefs(descriptors, quantum.predictedInputs)

        # get all input data from the butler
        inputs = {}
        for key, dataRefs in inputDataRefs.items():
            if isinstance(dataRefs, list):
                inputs[key] = [butler.get(dataRef) for dataRef in dataRefs]
            else:
                inputs[key] = butler.get(dataRefs)

        # lookup DataIds and DataRefs for output datasets
        descriptors = self.getOutputDatasetTypes(self.config)
        outputDataIds, outputDataRefs = makeDataRefs(descriptors, quantum.outputs)

        # run the algorithm on in-memory data
        struct = self.adaptArgsAndRun(inputs, inputDataIds, outputDataIds, butler)

        # store produced output data
        self.saveStruct(struct, outputDataRefs, butler)
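
    # A sketch of how an execution framework might drive a task through this
    # method, assuming ``quantum`` is a `~lsst.daf.butler.Quantum` whose
    # ``predictedInputs`` and ``outputs`` map dataset type names to lists of
    # DatasetRefs, and ``ExampleTask``/``config`` are hypothetical:
    #
    #     task = ExampleTask(config=config)
    #     task.runQuantum(quantum, butler)
    #     # inputs are read with butler.get(), run() is called via
    #     # adaptArgsAndRun(), and the returned Struct is written back
    #     # through saveStruct()/butler.put()
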
608 """Save data in butler. 610 Convention is that struct returned from ``run()`` method has data 611 field(s) with the same names as the config fields defining 612 output DatasetTypes. Subclasses may override this method to implement 613 different convention for `Struct` content or in case any 614 post-processing of data may be needed. 619 Data produced by the task packed into `Struct` instance 620 outputDataRefs : `dict` 621 Dictionary whose keys are the names of the configuration fields 622 describing output dataset types and values are lists of DataRefs. 623 DataRefs must match corresponding data objects in ``struct`` in 626 Data butler instance. 628 structDict = struct.getDict()
630 for key
in descriptors.keys():
631 dataList = structDict[key]
632 dataRefs = outputDataRefs[key]
633 if not isinstance(dataRefs, list):
635 dataRefs = [dataRefs]
636 dataList = [dataList]
638 for dataRef, data
in zip(dataRefs, dataList):
639 butler.put(data, dataRef.datasetType.name, dataRef.dataId)
642 """Return resource configuration for this task. 646 Object of type `~config.ResourceConfig` or ``None`` if resource 647 configuration is not defined for this task. 649 return getattr(self.
config,
"resources",
None)