lsst.pipe.base  16.0-16-ge6a35c8+5
pipelineTask.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""This module defines the PipelineTask class and related methods.
"""

__all__ = ["DatasetTypeDescriptor", "PipelineTask"]  # Classes in this module

from lsst.daf.butler import DatasetType
from .config import (InputDatasetConfig, OutputDatasetConfig,
                     InitInputDatasetConfig, InitOutputDatasetConfig)
from .task import Task


class ScalarError(TypeError):
    """Exception raised when a dataset type is configured as scalar
    but there are multiple DataIds in a Quantum for that dataset.

    Parameters
    ----------
    key : `str`
        Name of the configuration field for the dataset type.
    numDataIds : `int`
        Actual number of DataIds in a Quantum for this dataset type.
    """
    def __init__(self, key, numDataIds):
        super().__init__(("Expected scalar for output dataset field {}, "
                          "received {} DataIds").format(key, numDataIds))


class DatasetTypeDescriptor:
    """Describe DatasetType and its options for PipelineTask.

    This class contains a DatasetType and all relevant options that are used
    by PipelineTask. Typically this is derived from configuration classes,
    but sub-classes of PipelineTask can also define additional DatasetTypes
    that are not part of the task configuration.

    Parameters
    ----------
    datasetType : `DatasetType`
        The dataset type being described.
    scalar : `bool`
        `True` if this is a scalar dataset.
    """

    def __init__(self, datasetType, scalar):
        self._datasetType = datasetType
        self._scalar = scalar

    @classmethod
    def fromConfig(cls, datasetConfig):
        """Make a DatasetTypeDescriptor instance from a configuration object.

        Parameters
        ----------
        datasetConfig : `lsst.pex.config.Config`
            Instance of one of the `InputDatasetConfig`,
            `OutputDatasetConfig`, `InitInputDatasetConfig`, or
            `InitOutputDatasetConfig` types.

        Returns
        -------
        descriptor : `DatasetTypeDescriptor`
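
        Examples
        --------
        A minimal sketch of constructing a descriptor from an input dataset
        configuration; the field values shown (the dataset name, unit names,
        and storage class) are hypothetical::

            config = InputDatasetConfig()
            config.name = "src"
            config.units = ["Visit", "Detector"]
            config.storageClass = "SourceCatalog"
            descriptor = DatasetTypeDescriptor.fromConfig(config)
            datasetType = descriptor.datasetType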
        """
        datasetType = DatasetType(name=datasetConfig.name,
                                  dataUnits=datasetConfig.units,
                                  storageClass=datasetConfig.storageClass)
        # Use scalar=True for Init dataset types
        scalar = getattr(datasetConfig, 'scalar', True)
        return cls(datasetType=datasetType, scalar=scalar)

    @property
    def datasetType(self):
        """`DatasetType` instance.
        """
        return self._datasetType

    @property
    def scalar(self):
        """`True` if this is a scalar dataset.
        """
        return self._scalar


class PipelineTask(Task):
    """Base class for all pipeline tasks.

    This is an abstract base class for PipelineTasks, which represent
    algorithms executed by a framework on data that comes from the data
    butler; the resulting data is stored back in a data butler as well.

    PipelineTask inherits from `pipe.base.Task` and uses the same
    configuration mechanism based on `pex.config`. A PipelineTask sub-class
    typically implements the `run()` method, which receives Python-domain
    data objects and returns a `pipe.base.Struct` object with the resulting
    data. The `run()` method is not supposed to perform any I/O; it operates
    entirely on in-memory objects. `runQuantum()` is the method (which can be
    re-implemented in a sub-class) where all necessary I/O is performed: it
    reads all input data from the data butler into memory, calls the `run()`
    method with that data, examines the returned `Struct` object, and saves
    some or all of that data back to the data butler. The `runQuantum()`
    method receives a `daf.butler.Quantum` instance which defines all input
    and output datasets for a single invocation of the PipelineTask.

    Subclasses must be constructable with exactly the arguments taken by the
    PipelineTask base class constructor, but may support other signatures as
    well.

    Attributes
    ----------
    canMultiprocess : `bool`, True by default (class attribute)
        This class attribute is checked by the execution framework;
        sub-classes can set it to ``False`` if the task does not support
        multiprocessing.

    Parameters
    ----------
    config : `pex.config.Config`, optional
        Configuration for this task (an instance of ``self.ConfigClass``,
        which is a task-specific subclass of `PipelineTaskConfig`).
        If not specified then it defaults to `self.ConfigClass()`.
    log : `lsst.log.Log`, optional
        Logger instance whose name is used as a log name prefix, or ``None``
        for no prefix.
    initInputs : `dict`, optional
        A dictionary of objects needed to construct this PipelineTask, with
        keys matching the keys of the dictionary returned by
        `getInitInputDatasetTypes` and values equivalent to what would be
        obtained by calling `Butler.get` with those DatasetTypes and no data
        IDs. While it is optional for the base class, subclasses are
        permitted to require this argument.
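
    Examples
    --------
    A minimal sketch of a concrete sub-class, assuming its ``ConfigClass``
    defines a scalar input dataset field named ``calexp`` and a scalar
    output dataset field named ``src`` (both names, and the
    ``detectSources`` helper, are hypothetical)::

        class ExampleTask(PipelineTask):

            def run(self, calexp):
                # process the single input object
                catalog = self.detectSources(calexp)
                # Struct attribute name matches the output config field name
                return Struct(src=catalog)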
    """

    canMultiprocess = True

    def __init__(self, *, config=None, log=None, initInputs=None, **kwargs):
        super().__init__(config=config, log=log, **kwargs)
154 
156  """Return persistable outputs that are available immediately after
157  the task has been constructed.
158 
159  Subclasses that operate on catalogs should override this method to
160  return the schema(s) of the catalog(s) they produce.
161 
162  It is not necessary to return the PipelineTask's configuration or
163  other provenance information in order for it to be persisted; that is
164  the responsibility of the execution system.
165 
166  Returns
167  -------
168  datasets : `dict`
169  Dictionary with keys that match those of the dict returned by
170  `getInitOutputDatasetTypes` values that can be written by calling
171  `Butler.put` with those DatasetTypes and no data IDs. An empty
172  `dict` should be returned by tasks that produce no initialization
173  outputs.
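
        Examples
        --------
        A sketch of an override for a task that produces a catalog, assuming
        the task configuration defines an `InitOutputDatasetConfig` field
        named ``outputSchema`` and the task stores its schema object as
        ``self.schema`` (both names are hypothetical)::

            def getInitOutputDatasets(self):
                # key matches the init-output config field name
                return {"outputSchema": self.schema}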
        """
        return {}

    @classmethod
    def getInputDatasetTypes(cls, config):
        """Return input dataset type descriptors for this task.

        Default implementation finds all fields of type `InputDatasetConfig`
        in the configuration (non-recursively) and uses them for constructing
        `DatasetTypeDescriptor` instances. The names of these fields are used
        as keys in the returned dictionary. Subclasses can override this
        behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the input dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses the configuration field name as dictionary key.
        """
        return cls.getDatasetTypes(config, InputDatasetConfig)

    @classmethod
    def getOutputDatasetTypes(cls, config):
        """Return output dataset type descriptors for this task.

        Default implementation finds all fields of type `OutputDatasetConfig`
        in the configuration (non-recursively) and uses them for constructing
        `DatasetTypeDescriptor` instances. The names of these fields are used
        as keys in the returned dictionary. Subclasses can override this
        behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the output dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses the configuration field name as dictionary key.
        """
        return cls.getDatasetTypes(config, OutputDatasetConfig)

    @classmethod
    def getInitInputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to retrieve the
        ``initInputs`` constructor argument.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitInputDatasetConfig` in the configuration (non-recursively) and
        uses them for constructing `DatasetTypeDescriptor` instances. The
        names of these fields are used as keys in the returned dictionary.
        Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the input dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses the configuration field name as dictionary key.

        When the task requires no initialization inputs, this should return
        an empty dict.
        """
        return cls.getDatasetTypes(config, InitInputDatasetConfig)

    @classmethod
    def getInitOutputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to write the
        objects returned by `getInitOutputDatasets`.

        Datasets used in initialization may not be associated with any
        DataUnits (i.e. their data IDs must be empty dictionaries).

        Default implementation finds all fields of type
        `InitOutputDatasetConfig` in the configuration (non-recursively) and
        uses them for constructing `DatasetTypeDescriptor` instances. The
        names of these fields are used as keys in the returned dictionary.
        Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the output dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses the configuration field name as dictionary key.

        When the task produces no initialization outputs, this should return
        an empty dict.
        """
        return cls.getDatasetTypes(config, InitOutputDatasetConfig)

    @classmethod
    def getDatasetTypes(cls, config, configClass):
        """Return dataset type descriptors defined in task configuration.

        This method can be used by other methods that need to extract dataset
        types from task configuration (e.g. `getInputDatasetTypes` or
        sub-class methods).

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        configClass : `type`
            Class of the configuration object which defines dataset type.

        Returns
        -------
        Dictionary where key is the name (arbitrary) of the dataset
        and value is the `DatasetTypeDescriptor` instance. Default
        implementation uses the configuration field name as dictionary key.
        Returns an empty dict if the configuration has no fields with the
        specified ``configClass``.
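
        Examples
        --------
        A sketch of how a sub-class might use this helper when overriding
        `getInputDatasetTypes`; the ``"unusedInput"`` field name is
        hypothetical::

            @classmethod
            def getInputDatasetTypes(cls, config):
                dsTypes = cls.getDatasetTypes(config, InputDatasetConfig)
                # drop a configured input that this variant does not read
                dsTypes.pop("unusedInput", None)
                return dsTypes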
        """
        dsTypes = {}
        for key, value in config.items():
            if isinstance(value, configClass):
                dsTypes[key] = DatasetTypeDescriptor.fromConfig(value)
        return dsTypes

    def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
        """Run task algorithm on in-memory data.

        This method is called by `runQuantum` to operate on input in-memory
        data and produce corresponding output in-memory data. It receives
        arguments which are dictionaries with input data and input/output
        DataIds. Many simple tasks do not need to know DataIds, so the
        default implementation of this method calls the `run` method passing
        the input data objects as keyword arguments. Most simple tasks will
        implement the `run` method; more complex tasks that need to know
        about output DataIds will override this method instead.

        All three arguments to this method are dictionaries with keys equal
        to the names of the configuration fields for dataset types. If a
        dataset type is configured with the ``scalar`` field set to ``True``
        then it is expected that only one dataset appears on input or output
        for that dataset type, and the dictionary value will be a single data
        object or DataId. Otherwise, if ``scalar`` is ``False`` (the
        default), the value will be a list (even if only one item is in the
        list).

        The method returns a `Struct` instance with attributes matching the
        configuration fields for output dataset types. Values stored in the
        returned struct are single objects if ``scalar`` is ``True``, or
        lists of objects otherwise. If a task produces more than one object
        for some dataset type then the data objects returned in ``struct``
        must match the corresponding DataIds in ``outputDataIds`` in count
        and order.

        Parameters
        ----------
        inputData : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are Python-domain data
            objects (or lists of objects) retrieved from the data butler.
        inputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are DataIds (or lists
            of DataIds) that the task consumes for the corresponding dataset
            type. DataIds are guaranteed to match the data objects in
            ``inputData``.
        outputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are DataIds (or lists
            of DataIds) that the task is to produce for the corresponding
            dataset type.

        Returns
        -------
        struct : `Struct`
            Standard convention is that this method should return a `Struct`
            instance containing all output data. Struct attribute names
            should correspond to the names of the configuration fields
            describing task output dataset types. If something different
            is returned then the `saveStruct` method has to be re-implemented
            accordingly.
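
        Examples
        --------
        A sketch of an override for a task that needs its output DataIds,
        assuming configuration fields named ``calexp`` (non-scalar input)
        and ``coadd`` (non-scalar output); both names and the ``makeCoadd``
        helper are hypothetical::

            def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds):
                exposures = inputData["calexp"]
                # produce one output object per output DataId, in order
                coadds = [self.makeCoadd(exposures, dataId)
                          for dataId in outputDataIds["coadd"]]
                return Struct(coadd=coadds)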
        """
        return self.run(**inputData)

    def run(self, **kwargs):
        """Run task algorithm on in-memory data.

        This method should be implemented in a subclass unless the task
        overrides `adaptArgsAndRun` to do something different from its
        default implementation. With the default implementation of
        `adaptArgsAndRun` this method will receive keyword arguments whose
        names are the same as the names of the configuration fields
        describing input dataset types. Argument values will be data objects
        retrieved from the data butler. If a dataset type is configured with
        the ``scalar`` field set to ``True`` then the argument value will be
        a single object, otherwise it will be a list of objects.

        If the task needs to know its input or output DataIds then it has to
        override the `adaptArgsAndRun` method instead.

        Returns
        -------
        struct : `Struct`
            See description of `adaptArgsAndRun` method.

        Examples
        --------
        Typical implementation of this method may look like::

            def run(self, input, calib):
                # "input", "calib", and "output" are the names of the
                # config fields

                # Assuming that input/calib datasets are `scalar` they are
                # simple objects; do something with inputs and calibs,
                # produce an output image.
                image = self.makeImage(input, calib)

                # If the output dataset is `scalar` then return an object,
                # not a list
                return Struct(output=image)

        """
        raise NotImplementedError("run() is not implemented")

    def runQuantum(self, quantum, butler):
        """Execute PipelineTask algorithm on single quantum of data.

        Typical implementation of this method will use inputs from the
        quantum to retrieve Python-domain objects from the data butler and
        call the `adaptArgsAndRun` method on that data. On return from
        `adaptArgsAndRun` this method will extract data from the returned
        `Struct` instance and save that data to the butler.

        The `Struct` returned from `adaptArgsAndRun` is expected to contain
        data attributes with names equal to the names of the configuration
        fields defining output dataset types. The values of the data
        attributes must be data objects corresponding to the DataIds of the
        output dataset types. All data objects will be saved in the butler
        using DataRefs from the Quantum's output dictionary.

        This method does not return anything to the caller; on errors a
        corresponding exception is raised.

        Parameters
        ----------
        quantum : `Quantum`
            Object describing input and output corresponding to this
            invocation of the PipelineTask instance.
        butler : object
            Data butler instance.

        Raises
        ------
        ScalarError
            Raised if a dataset type is configured as scalar but receives
            multiple DataIds in ``quantum``. Any exceptions raised by the
            data butler or by the `adaptArgsAndRun` method are propagated.
        """

        def makeDataRefs(descriptors, refMap):
            """Generate map of DatasetRefs and DataIds.

            Given a map of DatasetTypeDescriptor and a map of Quantum
            DatasetRefs, makes maps of DataIds and DatasetRefs.
            For scalar dataset types unpacks DatasetRefs and DataIds.

            Parameters
            ----------
            descriptors : `dict`
                Map of (dataset key, DatasetTypeDescriptor).
            refMap : `dict`
                Map of (dataset type name, DatasetRefs).

            Returns
            -------
            dataIds : `dict`
                Map of (dataset key, DataIds)
            dataRefs : `dict`
                Map of (dataset key, DatasetRefs)

            Raises
            ------
            ScalarError
                Raised if a dataset type is configured as scalar but more
                than one DatasetRef exists for it.
            """
            dataIds = {}
            dataRefs = {}
            for key, descriptor in descriptors.items():
                keyDataRefs = refMap[descriptor.datasetType.name]
                keyDataIds = [dataRef.dataId for dataRef in keyDataRefs]
                if descriptor.scalar:
                    # unpack single-item lists
                    if len(keyDataRefs) != 1:
                        raise ScalarError(key, len(keyDataRefs))
                    keyDataRefs = keyDataRefs[0]
                    keyDataIds = keyDataIds[0]
                dataIds[key] = keyDataIds
                dataRefs[key] = keyDataRefs
            return dataIds, dataRefs

        # lists of DataRefs/DataIds for input datasets
        descriptors = self.getInputDatasetTypes(self.config)
        inputDataIds, inputDataRefs = makeDataRefs(descriptors, quantum.predictedInputs)

        # get all data from butler
        inputs = {}
        for key, dataRefs in inputDataRefs.items():
            if isinstance(dataRefs, list):
                inputs[key] = [butler.get(dataRef) for dataRef in dataRefs]
            else:
                inputs[key] = butler.get(dataRefs)
        del inputDataRefs

        # lists of DataRefs/DataIds for output datasets
        descriptors = self.getOutputDatasetTypes(self.config)
        outputDataIds, outputDataRefs = makeDataRefs(descriptors, quantum.outputs)

        # call run method with keyword arguments
        struct = self.adaptArgsAndRun(inputs, inputDataIds, outputDataIds)

        # store produced output data
        self.saveStruct(struct, outputDataRefs, butler)

    def saveStruct(self, struct, outputDataRefs, butler):
        """Save data in butler.

        Convention is that the struct returned from the ``run()`` method has
        data field(s) with the same names as the config fields defining
        output DatasetTypes. Subclasses may override this method to implement
        a different convention for `Struct` content, or if any
        post-processing of the data is needed.

        Parameters
        ----------
        struct : `Struct`
            Data produced by the task packed into a `Struct` instance.
        outputDataRefs : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are lists of DataRefs.
            DataRefs must match corresponding data objects in ``struct`` in
            number and order.
        butler : object
            Data butler instance.
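
        Examples
        --------
        A sketch of an override for a task whose `adaptArgsAndRun` returns a
        struct whose attribute name does not match the output config field;
        ``image`` and ``output`` are hypothetical names::

            def saveStruct(self, struct, outputDataRefs, butler):
                # rename the attribute to match the output config field,
                # then delegate to the default implementation
                struct = Struct(output=struct.image)
                super().saveStruct(struct, outputDataRefs, butler)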
        """
        structDict = struct.getDict()
        descriptors = self.getOutputDatasetTypes(self.config)
        for key in descriptors.keys():
            dataList = structDict[key]
            dataRefs = outputDataRefs[key]
            if not isinstance(dataRefs, list):
                # scalar outputs, make them lists again
                dataRefs = [dataRefs]
                dataList = [dataList]
            # TODO: check that data objects and data refs are aligned
            for dataRef, data in zip(dataRefs, dataList):
                butler.put(data, dataRef.datasetType.name, dataRef.dataId)

    def getResourceConfig(self):
        """Return resource configuration for this task.

        Returns
        -------
        Object of type `~config.ResourceConfig` or ``None`` if resource
        configuration is not defined for this task.
        """
        return getattr(self.config, "resources", None)