lsst.pipe.base  16.0-12-g726f8f3+3
pipelineTask.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""This module defines the PipelineTask class and related methods.
"""

__all__ = ["PipelineTask"]  # Classes in this module

from lsst.daf.butler import DatasetType, StorageClassFactory
from .config import (InputDatasetConfig, OutputDatasetConfig,
                     InitInputDatasetConfig, InitOutputDatasetConfig)
from .task import Task


class ScalarError(TypeError):
    """Exception raised when a dataset type is configured as scalar
    but there are multiple DataIds in a Quantum for that dataset.

    Parameters
    ----------
    key : `str`
        Name of the configuration field for the dataset type.
    numDataIds : `int`
        Actual number of DataIds in a Quantum for this dataset type.
    """
    def __init__(self, key, numDataIds):
        super().__init__(("Expected scalar for output dataset field {}, "
                          "received {} DataIds").format(key, numDataIds))


class PipelineTask(Task):
    """Base class for all pipeline tasks.

    This is an abstract base class for PipelineTasks, which represent an
    algorithm executed by framework(s) on data that comes from a data
    butler; the resulting data is also stored in a data butler.

    PipelineTask inherits from `pipe.base.Task` and uses the same
    configuration mechanism based on `pex.config`. A PipelineTask
    sub-class typically implements the `run()` method, which receives
    Python-domain data objects and returns a `pipe.base.Struct` object
    with the resulting data. The `run()` method is not supposed to
    perform any I/O; it operates entirely on in-memory objects.
    `runQuantum()` is the method (which can be re-implemented in a
    sub-class) where all necessary I/O is performed: it reads all input
    data from the data butler into memory, calls the `run()` method with
    that data, examines the returned `Struct` object, and saves some or
    all of that data back to the data butler. The `runQuantum()` method
    receives a `daf.butler.Quantum` instance which defines all input and
    output datasets for a single invocation of the PipelineTask.

    Subclasses must be constructable with exactly the arguments taken by
    the PipelineTask base class constructor, but may support other
    signatures as well.

    Attributes
    ----------
    canMultiprocess : bool, True by default (class attribute)
        This class attribute is checked by the execution framework;
        sub-classes can set it to ``False`` if the task does not support
        multiprocessing.

    Parameters
    ----------
    config : `pex.config.Config`, optional
        Configuration for this task (an instance of ``self.ConfigClass``,
        which is a task-specific subclass of `PipelineTaskConfig`).
        If not specified then it defaults to ``self.ConfigClass()``.
    log : `lsst.log.Log`, optional
        Logger instance whose name is used as a log name prefix, or
        ``None`` for no prefix.
    initInputs : `dict`, optional
        A dictionary of objects needed to construct this PipelineTask,
        with keys matching the keys of the dictionary returned by
        `getInitInputDatasetTypes` and values equivalent to what would be
        obtained by calling `Butler.get` with those DatasetTypes and no
        data IDs. While it is optional for the base class, subclasses are
        permitted to require this argument.
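
    Examples
    --------
    A minimal sketch of a concrete subclass, assuming a configuration
    class ``ExampleConfig`` (not part of this module) that defines a
    scalar input dataset field named ``input`` and a scalar output
    dataset field named ``output``; ``makeImage`` is a hypothetical
    helper method::

        class ExampleTask(PipelineTask):
            ConfigClass = ExampleConfig
            _DefaultName = "example"

            def run(self, input):
                # "input" and "output" are the config field names
                image = self.makeImage(input)
                return Struct(output=image)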
    """

    canMultiprocess = True

    def __init__(self, config=None, log=None, initInputs=None):
        super().__init__(config=config, log=log)

    def getInitOutputDatasets(self):
        """Return persistable outputs that are available immediately after
        the task has been constructed.

        Subclasses that operate on catalogs should override this method to
        return the schema(s) of the catalog(s) they produce.

        It is not necessary to return the PipelineTask's configuration or
        other provenance information in order for it to be persisted; that
        is the responsibility of the execution system.

        Returns
        -------
        datasets : `dict`
            Dictionary with keys that match those of the dict returned by
            `getInitOutputDatasetTypes` and values that can be written by
            calling `Butler.put` with those DatasetTypes and no data IDs.
            An empty `dict` should be returned by tasks that produce no
            initialization outputs.
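
        Examples
        --------
        A sketch of an override for a task producing a catalog, assuming
        an `InitOutputDatasetConfig` field named ``outputSchema`` and a
        ``self.schema`` object created in the constructor (both
        hypothetical)::

            def getInitOutputDatasets(self):
                return {"outputSchema": self.schema}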
        """
        return {}

    @classmethod
    def getInputDatasetTypes(cls, config):
        """Return input dataset types for this task.

        Default implementation finds all fields of type `InputDatasetConfig`
        in the configuration (non-recursively) and uses them to construct
        `DatasetType` instances. The keys of these fields are used as keys
        in the returned dictionary. Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the input dataset
        and value is the `butler.DatasetType` instance. Default
        implementation uses configuration field name as dictionary key.
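
        Examples
        --------
        As an illustration only: for a configuration with a single
        `InputDatasetConfig` field named ``input`` whose dataset type
        name is ``calexp``, the default implementation returns a
        one-element dictionary resembling::

            {"input": DatasetType("calexp", ...)}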
        """
        return cls.getDatasetTypes(config, InputDatasetConfig)

    @classmethod
    def getOutputDatasetTypes(cls, config):
        """Return output dataset types for this task.

        Default implementation finds all fields of type `OutputDatasetConfig`
        in the configuration (non-recursively) and uses them to construct
        `DatasetType` instances. The keys of these fields are used as keys
        in the returned dictionary. Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the output dataset
        and value is the `butler.DatasetType` instance. Default
        implementation uses configuration field name as dictionary key.
        """
        return cls.getDatasetTypes(config, OutputDatasetConfig)

    @classmethod
    def getInitInputDatasetTypes(cls, config):
        """Return dataset types that can be used to retrieve the
        ``initInputs`` constructor argument.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitInputDatasetConfig` in the configuration (non-recursively) and
        uses them to construct `DatasetType` instances. The keys of these
        fields are used as keys in the returned dictionary. Subclasses can
        override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the input dataset
        and value is the `butler.DatasetType` instance. Default
        implementation uses configuration field name as dictionary key.

        When the task requires no initialization inputs, an empty dict
        should be returned.
        """
        return cls.getDatasetTypes(config, InitInputDatasetConfig)

    @classmethod
    def getInitOutputDatasetTypes(cls, config):
        """Return dataset types that can be used to write the objects
        returned by `getInitOutputDatasets`.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitOutputDatasetConfig` in the configuration (non-recursively)
        and uses them to construct `DatasetType` instances. The keys of
        these fields are used as keys in the returned dictionary.
        Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the output dataset
        and value is the `butler.DatasetType` instance. Default
        implementation uses configuration field name as dictionary key.

        When the task produces no initialization outputs, an empty dict
        should be returned.
        """
        return cls.getDatasetTypes(config, InitOutputDatasetConfig)

    @classmethod
    def getDatasetTypes(cls, config, configClass):
        """Return dataset types defined in the task configuration.

        This method can be used by other methods that need to extract
        dataset types from the task configuration (e.g.
        :py:meth:`getInputDatasetTypes` or sub-class methods).

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        configClass : `type`
            Class of the configuration object which defines the dataset
            type.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the dataset and
        value is the `butler.DatasetType` instance. Default implementation
        uses configuration field name as dictionary key. An empty dict is
        returned when the configuration defines no fields of type
        ``configClass``.
        """
        dsTypes = {}
        for key, value in config.items():
            if isinstance(value, configClass):
                dsTypes[key] = cls.makeDatasetType(value)
        return dsTypes

    def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
        """Run task algorithm on in-memory data.

        This method is called by `runQuantum` to operate on input in-memory
        data and produce corresponding output in-memory data. It receives
        arguments which are dictionaries with input data and input/output
        DataIds. Many simple tasks do not need to know DataIds, so the
        default implementation of this method calls the `run` method,
        passing input data objects as keyword arguments. Most simple tasks
        will implement the `run` method; more complex tasks that need to
        know about output DataIds will override this method instead.

        All three arguments to this method are dictionaries with keys equal
        to the names of the configuration fields for the dataset types. If
        a dataset type is configured with the ``scalar`` field set to
        ``True`` then it is expected that only one dataset appears on input
        or output for that dataset type and the dictionary value will be a
        single data object or DataId. Otherwise, if ``scalar`` is ``False``
        (default), the value will be a list (even if only one item is in
        the list).

        The method returns a `Struct` instance with attributes matching
        the configuration fields for output dataset types. Values stored
        in the returned struct are single objects if ``scalar`` is ``True``
        or lists of objects otherwise. If the task produces more than one
        object for some dataset type then the data objects returned in
        ``struct`` must match in count and order the corresponding DataIds
        in ``outputDataIds``.

        Parameters
        ----------
        inputData : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are Python-domain
            data objects (or lists of objects) retrieved from the data
            butler.
        inputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are DataIds (or lists
            of DataIds) that the task consumes for the corresponding
            dataset type. DataIds are guaranteed to match the data objects
            in ``inputData``.
        outputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are DataIds (or
            lists of DataIds) that the task is to produce for the
            corresponding dataset type.

        Returns
        -------
        struct : `Struct`
            Standard convention is that this method should return a
            `Struct` instance containing all output data. Struct attribute
            names should correspond to the names of the configuration
            fields describing task output dataset types. If something
            different is returned then the `saveStruct` method has to be
            re-implemented accordingly.
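
        Examples
        --------
        A sketch of an override for a task that needs to know output
        DataIds; the scalar field names ``input`` and ``output`` and the
        helper method ``makeImage`` are hypothetical::

            def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
                # e.g. pass the output DataId to the algorithm
                image = self.makeImage(inputData["input"],
                                       dataId=outputDataIds["output"])
                return Struct(output=image)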
        """
        return self.run(**inputData)

    def run(self, **kwargs):
        """Run task algorithm on in-memory data.

        This method should be implemented in a subclass unless the task
        overrides `adaptArgsAndRun` to do something different from its
        default implementation. With the default implementation of
        `adaptArgsAndRun` this method will receive keyword arguments whose
        names will be the same as the names of configuration fields
        describing input dataset types. Argument values will be data
        objects retrieved from the data butler. If a dataset type is
        configured with the ``scalar`` field set to ``True`` then the
        argument value will be a single object, otherwise it will be a
        list of objects.

        If the task needs to know its input or output DataIds then it has
        to override the `adaptArgsAndRun` method instead.

        Returns
        -------
        struct : `Struct`
            See the description of the `adaptArgsAndRun` method.

        Examples
        --------
        Typical implementation of this method may look like::

            def run(self, input, calib):
                # "input", "calib", and "output" are the names of the
                # config fields

                # Assuming that input/calib datasets are `scalar` they are
                # simple objects; do something with inputs and calibs,
                # produce output image.
                image = self.makeImage(input, calib)

                # If output dataset is `scalar` then return object, not list
                return Struct(output=image)
        """
        raise NotImplementedError("run() is not implemented")

    def runQuantum(self, quantum, butler):
        """Execute PipelineTask algorithm on a single quantum of data.

        Typical implementation of this method will use inputs from the
        quantum to retrieve Python-domain objects from the data butler and
        call the `adaptArgsAndRun` method on that data. On return from
        `adaptArgsAndRun` this method will extract data from the returned
        `Struct` instance and save that data to the butler.

        The `Struct` returned from `adaptArgsAndRun` is expected to contain
        data attributes with names equal to the names of the configuration
        fields defining output dataset types. The values of the data
        attributes must be data objects corresponding to the DataIds of
        the output dataset types. All data objects will be saved in the
        butler using DataRefs from the Quantum's output dictionary.

        This method does not return anything to the caller; on errors a
        corresponding exception is raised.

        Parameters
        ----------
        quantum : `Quantum`
            Object describing input and output corresponding to this
            invocation of the PipelineTask instance.
        butler : object
            Data butler instance.

        Raises
        ------
        `ScalarError` if a dataset type is configured as scalar but
        receives multiple DataIds in ``quantum``. Any exceptions that
        happen in the data butler or in the `adaptArgsAndRun` method are
        propagated.
        """
        # get all data from butler
        inputDataIds = {}
        inputs = {}
        for key, value in self.config.items():
            if isinstance(value, InputDatasetConfig):
                dataRefs = quantum.predictedInputs[value.name]
                dataIds = [dataRef.dataId for dataRef in dataRefs]
                data = [butler.get(dataRef) for dataRef in dataRefs]
                if value.scalar:
                    # unpack single-item lists
                    if len(dataRefs) != 1:
                        raise ScalarError(key, len(dataRefs))
                    data = data[0]
                    dataIds = dataIds[0]
                inputDataIds[key] = dataIds
                inputs[key] = data

        # lists of DataRefs/DataIds for output datasets
        outputDataRefs = {}
        outputDataIds = {}
        for key, value in self.config.items():
            if isinstance(value, OutputDatasetConfig):
                dataRefs = quantum.outputs[value.name]
                dataIds = [dataRef.dataId for dataRef in dataRefs]
                if value.scalar:
                    # unpack single-item lists
                    if len(dataRefs) != 1:
                        raise ScalarError(key, len(dataRefs))
                    dataRefs = dataRefs[0]
                    dataIds = dataIds[0]
                outputDataRefs[key] = dataRefs
                outputDataIds[key] = dataIds

        # call run method with keyword arguments
        struct = self.adaptArgsAndRun(inputs, inputDataIds, outputDataIds)

        # store produced output data
        self.saveStruct(struct, outputDataRefs, butler)

    def saveStruct(self, struct, outputDataRefs, butler):
        """Save data in butler.

        Convention is that the struct returned from the ``run()`` method
        has data field(s) with the same names as the config fields defining
        output DatasetTypes. Subclasses may override this method to
        implement a different convention for `Struct` content or in case
        any post-processing of the data is needed.

        Parameters
        ----------
        struct : `Struct`
            Data produced by the task packed into a `Struct` instance.
        outputDataRefs : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are lists of
            DataRefs. DataRefs must match corresponding data objects in
            ``struct`` in number and order.
        butler : object
            Data butler instance.
        """
        structDict = struct.getDict()
        for key, value in self.config.items():
            if isinstance(value, OutputDatasetConfig):
                dataList = structDict[key]
                dataRefs = outputDataRefs[key]
                if not isinstance(dataRefs, list):
                    # scalar outputs, make them lists again
                    dataRefs = [dataRefs]
                    dataList = [dataList]
                # TODO: check that data objects and data refs are aligned
                for dataRef, data in zip(dataRefs, dataList):
                    butler.put(data, dataRef.datasetType.name, dataRef.dataId)

    @classmethod
    def makeDatasetType(cls, dsConfig):
        """Create a new instance of `DatasetType` from a task config.

        Parameters
        ----------
        dsConfig : `pexConfig.Config`
            Instance of `InputDatasetConfig`, `OutputDatasetConfig`,
            `InitInputDatasetConfig`, or `InitOutputDatasetConfig`.

        Returns
        -------
        `butler.DatasetType` instance.
        """
        # map storage class name to storage class
        storageClass = StorageClassFactory().getStorageClass(dsConfig.storageClass)

        return DatasetType(name=dsConfig.name,
                           dataUnits=dsConfig.units,
                           storageClass=storageClass)

    def getResourceConfig(self):
        """Return resource configuration for this task.

        Returns
        -------
        Object of type `~config.ResourceConfig` or ``None`` if resource
        configuration is not defined for this task.
        """
        return getattr(self.config, "resources", None)