lsst.pipe.base  16.0-15-gb461e1a+5
pipelineTask.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""This module defines the PipelineTask class and related methods.
"""

__all__ = ["DatasetTypeDescriptor", "PipelineTask"]  # Classes in this module

from lsst.daf.butler import DatasetType, StorageClassFactory
from .config import (InputDatasetConfig, OutputDatasetConfig,
                     InitInputDatasetConfig, InitOutputDatasetConfig)
from .task import Task


class ScalarError(TypeError):
    """Exception raised when a dataset type is configured as scalar
    but there are multiple DataIds in a Quantum for that dataset.

    Parameters
    ----------
    key : `str`
        Name of the configuration field for the dataset type.
    numDataIds : `int`
        Actual number of DataIds in a Quantum for this dataset type.
    """
    def __init__(self, key, numDataIds):
        super().__init__(("Expected scalar for output dataset field {}, "
                          "received {} DataIds").format(key, numDataIds))


class DatasetTypeDescriptor:
    """Describe DatasetType and its options for PipelineTask.

    This class contains DatasetType and all relevant options that are used by
    PipelineTask. Typically this is derived from configuration classes, but
    sub-classes of PipelineTask can also define additional DatasetTypes that
    are not part of the task configuration.

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` instance being described.
    scalar : `bool`
        `True` if this is a scalar dataset.
    """

    def __init__(self, datasetType, scalar):
        self._datasetType = datasetType
        self._scalar = scalar

    @classmethod
    def fromConfig(cls, datasetConfig):
        """Make DatasetTypeDescriptor instance from configuration object.

        Parameters
        ----------
        datasetConfig : `lsst.pex.config.Config`
            Instance of one of the `InputDatasetConfig`, `OutputDatasetConfig`,
            `InitInputDatasetConfig`, or `InitOutputDatasetConfig` types.

        Returns
        -------
        descriptor : `DatasetTypeDescriptor`
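
        Examples
        --------
        A minimal sketch of typical use, assuming an `InputDatasetConfig`
        whose ``name``, ``units``, and ``storageClass`` fields have been
        filled in (the field values shown are illustrative only)::

            inputConfig = InputDatasetConfig()
            inputConfig.name = "calexp"
            inputConfig.units = ["Instrument", "Visit", "Detector"]
            inputConfig.storageClass = "ExposureF"

            descriptor = DatasetTypeDescriptor.fromConfig(inputConfig)
            # descriptor.datasetType and descriptor.scalar reflect the config
            print(descriptor.datasetType.name, descriptor.scalar)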
        """
        # map storage class name to storage class
        storageClass = StorageClassFactory().getStorageClass(datasetConfig.storageClass)

        datasetType = DatasetType(name=datasetConfig.name,
                                  dataUnits=datasetConfig.units,
                                  storageClass=storageClass)
        # Use scalar=True for Init dataset types
        scalar = getattr(datasetConfig, 'scalar', True)
        return cls(datasetType=datasetType, scalar=scalar)

    @property
    def datasetType(self):
        """`DatasetType` instance.
        """
        return self._datasetType

    @property
    def scalar(self):
        """`True` if this is a scalar dataset.
        """
        return self._scalar


class PipelineTask(Task):
    """Base class for all pipeline tasks.

    This is an abstract base class for PipelineTasks, which represent
    algorithms executed by a framework on data that comes from a data
    butler; the resulting data is also stored in a data butler.

    PipelineTask inherits from a `pipe.base.Task` and uses the same
    configuration mechanism based on `pex.config`. A PipelineTask sub-class
    typically implements a `run()` method which receives Python-domain data
    objects and returns a `pipe.base.Struct` object with resulting data.
    The `run()` method is not supposed to perform any I/O; it operates
    entirely on in-memory objects. `runQuantum()` is the method (which can
    be re-implemented in a sub-class) where all necessary I/O is performed:
    it reads all input data from the data butler into memory, calls the
    `run()` method with that data, examines the returned `Struct` object,
    and saves some or all of that data back to the data butler. The
    `runQuantum()` method receives a `daf.butler.Quantum` instance which
    defines all input and output datasets for a single invocation of
    PipelineTask.

    Subclasses must be constructable with exactly the arguments taken by the
    PipelineTask base class constructor, but may support other signatures as
    well.

    Attributes
    ----------
    canMultiprocess : bool, True by default (class attribute)
        This class attribute is checked by the execution framework;
        sub-classes can set it to ``False`` if the task does not support
        multiprocessing.

    Parameters
    ----------
    config : `pex.config.Config`, optional
        Configuration for this task (an instance of ``self.ConfigClass``,
        which is a task-specific subclass of `PipelineTaskConfig`).
        If not specified then it defaults to `self.ConfigClass()`.
    log : `lsst.log.Log`, optional
        Logger instance whose name is used as a log name prefix, or ``None``
        for no prefix.
    initInputs : `dict`, optional
        A dictionary of objects needed to construct this PipelineTask, with
        keys matching the keys of the dictionary returned by
        `getInitInputDatasetTypes` and values equivalent to what would be
        obtained by calling `Butler.get` with those DatasetTypes and no data
        IDs. While it is optional for the base class, subclasses are
        permitted to require this argument.
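
    Examples
    --------
    A minimal sketch of a concrete sub-class, assuming a hypothetical
    ``ExampleTaskConfig`` that defines a scalar input dataset field named
    ``input`` and a scalar output dataset field named ``output`` (all names
    are illustrative only)::

        class ExampleTask(PipelineTask):
            ConfigClass = ExampleTaskConfig
            _DefaultName = "exampleTask"

            def run(self, input):
                # makeOutput() is a stand-in for the task's real algorithm
                image = self.makeOutput(input)
                # attribute name matches the output config field name
                return Struct(output=image)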
    """

    canMultiprocess = True

    def __init__(self, *, config=None, log=None, initInputs=None, **kwargs):
        super().__init__(config=config, log=log, **kwargs)

    def getInitOutputDatasets(self):
        """Return persistable outputs that are available immediately after
        the task has been constructed.

        Subclasses that operate on catalogs should override this method to
        return the schema(s) of the catalog(s) they produce.

        It is not necessary to return the PipelineTask's configuration or
        other provenance information in order for it to be persisted; that is
        the responsibility of the execution system.

        Returns
        -------
        datasets : `dict`
            Dictionary with keys that match those of the dict returned by
            `getInitOutputDatasetTypes` and values that can be written by
            calling `Butler.put` with those DatasetTypes and no data IDs.
            An empty `dict` should be returned by tasks that produce no
            initialization outputs.
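
        Examples
        --------
        A minimal sketch for a catalog-producing task, assuming a
        hypothetical init-output config field named ``outputSchema`` and a
        ``self.schema`` attribute created in the task constructor::

            def getInitOutputDatasets(self):
                return {"outputSchema": self.schema}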
        """
        return {}

    @classmethod
    def getInputDatasetTypes(cls, config):
        """Return input dataset type descriptors for this task.

        Default implementation finds all fields of type `InputDatasetConfig`
        in configuration (non-recursively) and uses them for constructing
        `DatasetTypeDescriptor` instances. The names of these fields are used
        as keys in returned dictionary. Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the input dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses configuration field name as dictionary key.
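
        Examples
        --------
        A sketch of the default behavior, assuming a hypothetical task
        config with a single `InputDatasetConfig` field named ``input``::

            descriptors = ExampleTask.getInputDatasetTypes(config)
            # default implementation keys the result by the field name
            inputDescriptor = descriptors["input"]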
        """
        return cls.getDatasetTypes(config, InputDatasetConfig)

    @classmethod
    def getOutputDatasetTypes(cls, config):
        """Return output dataset type descriptors for this task.

        Default implementation finds all fields of type `OutputDatasetConfig`
        in configuration (non-recursively) and uses them for constructing
        `DatasetTypeDescriptor` instances. The names of these fields are used
        as keys in returned dictionary. Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the output dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses configuration field name as dictionary key.
        """
        return cls.getDatasetTypes(config, OutputDatasetConfig)

    @classmethod
    def getInitInputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to retrieve the
        ``initInputs`` constructor argument.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitInputDatasetConfig` in configuration (non-recursively) and
        uses them for constructing `DatasetTypeDescriptor` instances. The
        names of these fields are used as keys in returned dictionary.
        Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the input dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses configuration field name as dictionary key.

        When the task requires no initialization inputs, an empty dict
        should be returned.
        """
        return cls.getDatasetTypes(config, InitInputDatasetConfig)

    @classmethod
    def getInitOutputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to write the
        objects returned by `getInitOutputDatasets`.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitOutputDatasetConfig` in configuration (non-recursively) and uses
        them for constructing `DatasetTypeDescriptor` instances. The names of
        these fields are used as keys in returned dictionary. Subclasses can
        override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the output dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses configuration field name as dictionary key.

        When the task produces no initialization outputs, an empty dict
        should be returned.
        """
        return cls.getDatasetTypes(config, InitOutputDatasetConfig)

    @classmethod
    def getDatasetTypes(cls, config, configClass):
        """Return dataset type descriptors defined in task configuration.

        This method can be used by other methods that need to extract dataset
        types from task configuration (e.g. `getInputDatasetTypes` or
        sub-class methods).

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        configClass : `type`
            Class of the configuration object which defines dataset type.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses configuration field name as dictionary key.
        Returns empty dict if configuration has no fields with the specified
        ``configClass``.
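
        Examples
        --------
        A sketch of how a sub-class method could reuse this helper for a
        hypothetical ``CalibrationDatasetConfig`` field class (the class
        name is illustrative only)::

            @classmethod
            def getCalibrationDatasetTypes(cls, config):
                # collect every CalibrationDatasetConfig field, keyed by
                # the config field name
                return cls.getDatasetTypes(config, CalibrationDatasetConfig)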
        """
        dsTypes = {}
        for key, value in config.items():
            if isinstance(value, configClass):
                dsTypes[key] = DatasetTypeDescriptor.fromConfig(value)
        return dsTypes

    def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
        """Run task algorithm on in-memory data.

        This method is called by `runQuantum` to operate on input in-memory
        data and produce corresponding output in-memory data. It receives
        arguments which are dictionaries with input data and input/output
        DataIds. Many simple tasks do not need to know DataIds, so the
        default implementation of this method calls the `run` method passing
        input data objects as keyword arguments. Most simple tasks will
        implement the `run` method; more complex tasks that need to know
        about output DataIds will override this method instead.

        All three arguments to this method are dictionaries with keys equal
        to the names of the configuration fields for dataset types. If a
        dataset type is configured with the ``scalar`` field set to ``True``
        then it is expected that only one dataset appears on input or output
        for that dataset type and the dictionary value will be a single data
        object or DataId. Otherwise if ``scalar`` is ``False`` (default) then
        the value will be a list (even if only one item is in the list).

        The method returns a `Struct` instance with attributes matching the
        configuration fields for output dataset types. Values stored in the
        returned struct are single objects if ``scalar`` is ``True`` or
        lists of objects otherwise. If a task produces more than one object
        for some dataset type then data objects returned in ``struct`` must
        match in count and order the corresponding DataIds in
        ``outputDataIds``.

        Parameters
        ----------
        inputData : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are Python-domain data
            objects (or lists of objects) retrieved from data butler.
        inputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are DataIds (or lists
            of DataIds) that task consumes for corresponding dataset type.
            DataIds are guaranteed to match data objects in ``inputData``.
        outputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are DataIds (or lists
            of DataIds) that task is to produce for corresponding dataset
            type.

        Returns
        -------
        struct : `Struct`
            Standard convention is that this method should return `Struct`
            instance containing all output data. Struct attribute names
            should correspond to the names of the configuration fields
            describing task output dataset types. If something different
            is returned then `saveStruct` method has to be re-implemented
            accordingly.
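
        Examples
        --------
        A sketch of an override for a task that needs its output DataIds,
        assuming a scalar input field named ``input`` and a non-scalar
        output field named ``output``; ``makeOutput`` is a stand-in for the
        task's real algorithm::

            def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
                # produce one output object per output DataId, in the same order
                outputs = [self.makeOutput(inputData["input"], dataId)
                           for dataId in outputDataIds["output"]]
                return Struct(output=outputs)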
        """
        return self.run(**inputData)

    def run(self, **kwargs):
        """Run task algorithm on in-memory data.

        This method should be implemented in a subclass unless a task
        overrides `adaptArgsAndRun` to do something different from its
        default implementation. With the default implementation of
        `adaptArgsAndRun` this method will receive keyword arguments whose
        names will be the same as names of configuration fields describing
        input dataset types. Argument values will be data objects retrieved
        from data butler. If a dataset type is configured with ``scalar``
        field set to ``True`` then the argument value will be a single
        object, otherwise it will be a list of objects.

        If the task needs to know its input or output DataIds then it has to
        override `adaptArgsAndRun` method instead.

        Returns
        -------
        struct : `Struct`
            See description of `adaptArgsAndRun` method.

        Examples
        --------
        Typical implementation of this method may look like::

            def run(self, input, calib):
                # "input", "calib", and "output" are the names of the config fields

                # Assuming that input/calib datasets are `scalar` they are simple objects,
                # do something with inputs and calibs, produce output image.
                image = self.makeImage(input, calib)

                # If output dataset is `scalar` then return object, not list
                return Struct(output=image)

        """
        raise NotImplementedError("run() is not implemented")

    def runQuantum(self, quantum, butler):
        """Execute PipelineTask algorithm on single quantum of data.

        Typical implementation of this method will use inputs from quantum
        to retrieve Python-domain objects from data butler and call
        `adaptArgsAndRun` method on that data. On return from
        `adaptArgsAndRun` this method will extract data from returned
        `Struct` instance and save that data to butler.

        The `Struct` returned from `adaptArgsAndRun` is expected to contain
        data attributes with the names equal to the names of the
        configuration fields defining output dataset types. The values of
        the data attributes must be data objects corresponding to
        the DataIds of output dataset types. All data objects will be
        saved in butler using DataRefs from Quantum's output dictionary.

        This method does not return anything to the caller; on errors a
        corresponding exception is raised.

        Parameters
        ----------
        quantum : `Quantum`
            Object describing input and output corresponding to this
            invocation of PipelineTask instance.
        butler : object
            Data butler instance.

        Raises
        ------
        ScalarError
            Raised if a dataset type is configured as scalar but receives
            multiple DataIds in ``quantum``. Any exceptions raised by the
            data butler or by the `adaptArgsAndRun` method are propagated.
        """

        def makeDataRefs(descriptors, refMap):
            """Generate map of DatasetRefs and DataIds.

            Given a map of DatasetTypeDescriptor and a map of Quantum
            DatasetRefs makes maps of DataIds and DatasetRefs.
            For scalar dataset types unpacks DatasetRefs and DataIds.

            Parameters
            ----------
            descriptors : `dict`
                Map of (dataset key, DatasetTypeDescriptor).
            refMap : `dict`
                Map of (dataset type name, DatasetRefs).

            Returns
            -------
            dataIds : `dict`
                Map of (dataset key, DataIds)
            dataRefs : `dict`
                Map of (dataset key, DatasetRefs)

            Raises
            ------
            ScalarError
                Raised if dataset type is configured as scalar but more than
                one DatasetRef exists for it.
            """
            dataIds = {}
            dataRefs = {}
            for key, descriptor in descriptors.items():
                keyDataRefs = refMap[descriptor.datasetType.name]
                keyDataIds = [dataRef.dataId for dataRef in keyDataRefs]
                if descriptor.scalar:
                    # unpack single-item lists
                    if len(keyDataRefs) != 1:
                        raise ScalarError(key, len(keyDataRefs))
                    keyDataRefs = keyDataRefs[0]
                    keyDataIds = keyDataIds[0]
                dataIds[key] = keyDataIds
                dataRefs[key] = keyDataRefs
            return dataIds, dataRefs

        # lists of DataRefs/DataIds for input datasets
        descriptors = self.getInputDatasetTypes(self.config)
        inputDataIds, inputDataRefs = makeDataRefs(descriptors, quantum.predictedInputs)

        # get all data from butler
        inputs = {}
        for key, dataRefs in inputDataRefs.items():
            if isinstance(dataRefs, list):
                inputs[key] = [butler.get(dataRef) for dataRef in dataRefs]
            else:
                inputs[key] = butler.get(dataRefs)
        del inputDataRefs

        # lists of DataRefs/DataIds for output datasets
        descriptors = self.getOutputDatasetTypes(self.config)
        outputDataIds, outputDataRefs = makeDataRefs(descriptors, quantum.outputs)

        # call run method with keyword arguments
        struct = self.adaptArgsAndRun(inputs, inputDataIds, outputDataIds)

        # store produced output data
        self.saveStruct(struct, outputDataRefs, butler)

    def saveStruct(self, struct, outputDataRefs, butler):
        """Save data in butler.

        Convention is that struct returned from ``run()`` method has data
        field(s) with the same names as the config fields defining
        output DatasetTypes. Subclasses may override this method to implement
        a different convention for `Struct` content or in case any
        post-processing of data is needed.

        Parameters
        ----------
        struct : `Struct`
            Data produced by the task packed into `Struct` instance.
        outputDataRefs : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are lists of DataRefs.
            DataRefs must match corresponding data objects in ``struct`` in
            number and order.
        butler : object
            Data butler instance.
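
        Examples
        --------
        A sketch of an override that post-processes one output before
        delegating to the default implementation; the ``output`` attribute
        name and ``trimCatalog`` helper are illustrative only::

            def saveStruct(self, struct, outputDataRefs, butler):
                struct.output = trimCatalog(struct.output)
                super().saveStruct(struct, outputDataRefs, butler)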
        """
        structDict = struct.getDict()
        descriptors = self.getOutputDatasetTypes(self.config)
        for key in descriptors.keys():
            dataList = structDict[key]
            dataRefs = outputDataRefs[key]
            if not isinstance(dataRefs, list):
                # scalar outputs, make them lists again
                dataRefs = [dataRefs]
                dataList = [dataList]
            # TODO: check that data objects and data refs are aligned
            for dataRef, data in zip(dataRefs, dataList):
                butler.put(data, dataRef.datasetType.name, dataRef.dataId)

    def getResourceConfig(self):
        """Return resource configuration for this task.

        Returns
        -------
        Object of type `~config.ResourceConfig` or ``None`` if resource
        configuration is not defined for this task.
        """
        return getattr(self.config, "resources", None)