lsst.pipe.base  17.0.1-5-g3877d06+7
pipelineTask.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""This module defines the PipelineTask class and related methods.
"""

__all__ = ["DatasetTypeDescriptor", "PipelineTask"]  # Classes in this module

from lsst.daf.butler import DatasetType
from .config import (InputDatasetConfig, OutputDatasetConfig,
                     InitInputDatasetConfig, InitOutputDatasetConfig)
from .task import Task


class ScalarError(TypeError):
    """Exception raised when a dataset type is configured as scalar
    but there are multiple DataIds in a Quantum for that dataset.

    Parameters
    ----------
    key : `str`
        Name of the configuration field for the dataset type.
    numDataIds : `int`
        Actual number of DataIds in a Quantum for this dataset type.
    """
    def __init__(self, key, numDataIds):
        super().__init__(("Expected scalar for output dataset field {}, "
                          "received {} DataIds").format(key, numDataIds))


class DatasetTypeDescriptor:
    """Description of an unnormalized proto-DatasetType and its relationship
    to a PipelineTask.

    This class contains the information needed to construct a `DatasetType`
    (once a `DimensionUniverse` is available) and all relevant options that
    are used by PipelineTask. Typically this is derived from configuration
    classes, but sub-classes of PipelineTask can also define additional
    DatasetTypes that are not part of the task configuration.

    Parameters
    ----------
    name : `str`
        Name of the dataset type.
    dimensionNames : `~collections.abc.Set` of `str`
        Names of the dimensions used to identify datasets of this type.
    storageClassName : `str`
        Name of the `~lsst.daf.butler.StorageClass` for this dataset type.
    scalar : `bool`
        `True` if this is a scalar dataset.
    manualLoad : `bool`
        `True` if this dataset will be manually loaded by a concrete
        `PipelineTask` instead of loaded automatically by the base class.
    """

    def __init__(self, name, dimensionNames, storageClassName, scalar, manualLoad):
        self._name = name
        self._dimensionNames = dimensionNames
        self._storageClassName = storageClassName
        self._scalar = scalar
        self._manualLoad = manualLoad

    @classmethod
    def fromConfig(cls, datasetConfig):
        """Make a DatasetTypeDescriptor instance from a configuration object.

        Parameters
        ----------
        datasetConfig : `lsst.pex.config.Config`
            Instance of one of the `InputDatasetConfig`, `OutputDatasetConfig`,
            `InitInputDatasetConfig`, or `InitOutputDatasetConfig` types.

        Returns
        -------
        descriptor : `DatasetTypeDescriptor`
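
        Examples
        --------
        A minimal sketch of building a descriptor from a filled-in
        `InputDatasetConfig`; the field values below are illustrative only,
        not taken from any real pipeline::

            config = InputDatasetConfig()
            config.name = "calexp"
            config.dimensions = ["Instrument", "Visit", "Detector"]
            config.storageClass = "ExposureF"
            descriptor = DatasetTypeDescriptor.fromConfig(config)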
        """
        # Use scalar=True for Init dataset types
        scalar = getattr(datasetConfig, 'scalar', True)
        manualLoad = getattr(datasetConfig, 'manualLoad', False)
        return cls(name=datasetConfig.name, dimensionNames=datasetConfig.dimensions,
                   storageClassName=datasetConfig.storageClass, scalar=scalar,
                   manualLoad=manualLoad)

    def makeDatasetType(self, universe):
        """Construct a true `DatasetType` instance with normalized dimensions.

        Parameters
        ----------
        universe : `lsst.daf.butler.DimensionUniverse`
            Set of all known dimensions to be used to normalize the dimension
            names specified in config.

        Returns
        -------
        datasetType : `DatasetType`
            The `DatasetType` defined by this descriptor.
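
        Examples
        --------
        A typical caller obtains the dimension universe from a data butler
        registry, as `PipelineTask.runQuantum` does below (sketch)::

            datasetType = descriptor.makeDatasetType(butler.registry.dimensions)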
        """
        return DatasetType(self._name,
                           universe.extract(self._dimensionNames),
                           self._storageClassName)

    @property
    def name(self):
        """Name of the dataset type (`str`).
        """
        return self._name

    @property
    def scalar(self):
        """`True` if this is a scalar dataset.
        """
        return self._scalar

    @property
    def manualLoad(self):
        """`True` if the task will handle loading the data.
        """
        return self._manualLoad



class PipelineTask(Task):
    """Base class for all pipeline tasks.

    This is an abstract base class for PipelineTasks, which represent
    algorithms executed by framework(s) on data that comes from a data
    butler; the resulting data are also stored in a data butler.

    PipelineTask inherits from `pipe.base.Task` and uses the same
    configuration mechanism based on `pex.config`. A PipelineTask sub-class
    typically implements the `run()` method, which receives Python-domain
    data objects and returns a `pipe.base.Struct` object with the resulting
    data. The `run()` method is not supposed to perform any I/O; it operates
    entirely on in-memory objects. `runQuantum()` is the method (which can
    be re-implemented in a sub-class) where all necessary I/O is performed:
    it reads all input data from the data butler into memory, calls the
    `run()` method with that data, examines the returned `Struct` object,
    and saves some or all of that data back to the data butler. The
    `runQuantum()` method receives a `daf.butler.Quantum` instance which
    defines all input and output datasets for a single invocation of the
    PipelineTask.

    Subclasses must be constructable with exactly the arguments taken by the
    PipelineTask base class constructor, but may support other signatures as
    well.

    Attributes
    ----------
    canMultiprocess : bool, True by default (class attribute)
        This class attribute is checked by the execution framework;
        sub-classes can set it to ``False`` if the task does not support
        multiprocessing.

    Parameters
    ----------
    config : `pex.config.Config`, optional
        Configuration for this task (an instance of ``self.ConfigClass``,
        which is a task-specific subclass of `PipelineTaskConfig`).
        If not specified then it defaults to ``self.ConfigClass()``.
    log : `lsst.log.Log`, optional
        Logger instance whose name is used as a log name prefix, or ``None``
        for no prefix.
    initInputs : `dict`, optional
        A dictionary of objects needed to construct this PipelineTask, with
        keys matching the keys of the dictionary returned by
        `getInitInputDatasetTypes` and values equivalent to what would be
        obtained by calling `Butler.get` with those DatasetTypes and no data
        IDs. While it is optional for the base class, subclasses are
        permitted to require this argument.
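
    Examples
    --------
    A minimal concrete sub-class only needs a config class and a `run`
    method. The names below (``ExampleTask``, ``ExampleConfig`` and its
    ``input``/``output`` fields, ``process``) are illustrative placeholders,
    not part of this package::

        class ExampleTask(PipelineTask):
            ConfigClass = ExampleConfig  # a PipelineTaskConfig sub-class
            _DefaultName = "example"

            def run(self, input):
                # "input" and "output" match the names of the
                # InputDatasetConfig/OutputDatasetConfig fields in ExampleConfig
                output = self.process(input)
                return Struct(output=output)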
    """

    canMultiprocess = True

    def __init__(self, *, config=None, log=None, initInputs=None, **kwargs):
        super().__init__(config=config, log=log, **kwargs)

    def getInitOutputDatasets(self):
        """Return persistable outputs that are available immediately after
        the task has been constructed.

        Subclasses that operate on catalogs should override this method to
        return the schema(s) of the catalog(s) they produce.

        It is not necessary to return the PipelineTask's configuration or
        other provenance information in order for it to be persisted; that is
        the responsibility of the execution system.

        Returns
        -------
        datasets : `dict`
            Dictionary with keys that match those of the dict returned by
            `getInitOutputDatasetTypes`, and values that can be written by
            calling `Butler.put` with those DatasetTypes and no data IDs.
            An empty `dict` should be returned by tasks that produce no
            initialization outputs.
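
        Examples
        --------
        A task that writes a source catalog might expose its schema (sketch;
        ``outputSchema`` is an illustrative `InitOutputDatasetConfig` field
        name)::

            def getInitOutputDatasets(self):
                return {"outputSchema": self.schema}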
        """
        return {}

    @classmethod
    def getInputDatasetTypes(cls, config):
        """Return input dataset type descriptors for this task.

        The default implementation finds all fields of type
        `InputDatasetConfig` in the configuration (non-recursively) and uses
        them to construct `DatasetTypeDescriptor` instances. The names of
        these fields are used as keys in the returned dictionary. Subclasses
        can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary whose keys are the (arbitrary) names of the input datasets
        and whose values are `DatasetTypeDescriptor` instances. The default
        implementation uses the configuration field names as dictionary keys.
        """
        return cls.getDatasetTypes(config, InputDatasetConfig)

    @classmethod
    def getOutputDatasetTypes(cls, config):
        """Return output dataset type descriptors for this task.

        The default implementation finds all fields of type
        `OutputDatasetConfig` in the configuration (non-recursively) and uses
        them to construct `DatasetTypeDescriptor` instances. The names of
        these fields are used as keys in the returned dictionary. Subclasses
        can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary whose keys are the (arbitrary) names of the output datasets
        and whose values are `DatasetTypeDescriptor` instances. The default
        implementation uses the configuration field names as dictionary keys.
        """
        return cls.getDatasetTypes(config, OutputDatasetConfig)

    @classmethod
    def getPrerequisiteDatasetTypes(cls, config):
        """Return the local names of input dataset types that should be
        assumed to exist instead of constraining what data to process with
        this task.

        Usually, when running a `PipelineTask`, the presence of input datasets
        constrains the processing to be done (as defined by the `QuantumGraph`
        generated during "preflight"). "Prerequisites" are special input
        datasets that do not constrain that graph, but instead cause a hard
        failure when missing. Calibration products and reference catalogs
        are examples of dataset types that should usually be marked as
        prerequisites.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        prerequisite : `~collections.abc.Set` of `str`
            The keys in the dictionary returned by `getInputDatasetTypes` that
            represent dataset types that should be considered prerequisites.
            Names returned here that are not keys in that dictionary are
            ignored; that way, if a config option removes an input dataset
            type, only `getInputDatasetTypes` needs to be updated.
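
        Examples
        --------
        A task with a reference catalog input could mark it as a prerequisite
        (sketch; ``refCat`` is an illustrative field name)::

            @classmethod
            def getPrerequisiteDatasetTypes(cls, config):
                return frozenset(["refCat"])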
        """
        return frozenset()

    @classmethod
    def getInitInputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to retrieve the
        ``initInputs`` constructor argument.

        Datasets used in initialization may not be associated with any
        Dimension (i.e. their data IDs must be empty dictionaries).

        The default implementation finds all fields of type
        `InitInputDatasetConfig` in the configuration (non-recursively) and
        uses them to construct `DatasetTypeDescriptor` instances. The names
        of these fields are used as keys in the returned dictionary.
        Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary whose keys are the (arbitrary) names of the input datasets
        and whose values are `DatasetTypeDescriptor` instances. The default
        implementation uses the configuration field names as dictionary keys.

        When the task requires no initialization inputs, this should return
        an empty dict.
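
        Examples
        --------
        A constructor that consumes an init-input (sketch; ``inputSchema`` is
        an illustrative `InitInputDatasetConfig` field name)::

            def __init__(self, *, initInputs=None, **kwargs):
                super().__init__(initInputs=initInputs, **kwargs)
                self.inputSchema = initInputs["inputSchema"]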
        """
        return cls.getDatasetTypes(config, InitInputDatasetConfig)

    @classmethod
    def getInitOutputDatasetTypes(cls, config):
        """Return dataset type descriptors that can be used to write the
        objects returned by `getInitOutputDatasets`.

        Datasets used in initialization may not be associated with any
        Dimension (i.e. their data IDs must be empty dictionaries).

        The default implementation finds all fields of type
        `InitOutputDatasetConfig` in the configuration (non-recursively) and
        uses them to construct `DatasetTypeDescriptor` instances. The names
        of these fields are used as keys in the returned dictionary.
        Subclasses can override this behavior.

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.

        Returns
        -------
        Dictionary whose keys are the (arbitrary) names of the output datasets
        and whose values are `DatasetTypeDescriptor` instances. The default
        implementation uses the configuration field names as dictionary keys.

        When the task produces no initialization outputs, this should return
        an empty dict.
        """
        return cls.getDatasetTypes(config, InitOutputDatasetConfig)

    @classmethod
    def getDatasetTypes(cls, config, configClass):
        """Return dataset type descriptors defined in task configuration.

        This method can be used by other methods that need to extract dataset
        types from task configuration (e.g. `getInputDatasetTypes` or
        sub-class methods).

        Parameters
        ----------
        config : `Config`
            Configuration for this task. Typically datasets are defined in
            a task configuration.
        configClass : `type`
            Class of the configuration object which defines dataset type.

        Returns
        -------
        Dictionary whose keys are the (arbitrary) names of the datasets and
        whose values are `DatasetTypeDescriptor` instances. The default
        implementation uses the configuration field names as dictionary keys.
        Returns an empty dict if the configuration has no fields with the
        specified ``configClass``.
        """
        dsTypes = {}
        for key, value in config.items():
            if isinstance(value, configClass):
                dsTypes[key] = DatasetTypeDescriptor.fromConfig(value)
        return dsTypes

    @classmethod
    def getPerDatasetTypeDimensions(cls, config):
        """Return any Dimensions that are permitted to have different values
        for different DatasetTypes within the same quantum.

        Parameters
        ----------
        config : `Config`
            Configuration for this task.

        Returns
        -------
        dimensions : `~collections.abc.Set` of `Dimension` or `str`
            The dimensions or names thereof that should be considered
            per-DatasetType.

        Notes
        -----
        Any Dimension declared to be per-DatasetType by a PipelineTask must
        also be declared to be per-DatasetType by other PipelineTasks in the
        same Pipeline.

        The classic example of a per-DatasetType dimension is the
        ``CalibrationLabel`` dimension that maps to a validity range for
        master calibrations. When running Instrument Signature Removal, one
        does not care that different dataset types like flat, bias, and dark
        have different validity ranges, as long as those validity ranges all
        overlap the relevant observation.
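
        Examples
        --------
        A task consuming master calibrations could declare the validity-range
        dimension as per-DatasetType (sketch)::

            @classmethod
            def getPerDatasetTypeDimensions(cls, config):
                return frozenset(["CalibrationLabel"])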
        """
        return frozenset()

    def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds, butler):
        """Run task algorithm on in-memory data.

        This method is called by `runQuantum` to operate on input in-memory
        data and produce corresponding output in-memory data. It receives
        arguments which are dictionaries with input data and input/output
        DataIds. Many simple tasks do not need to know DataIds, so the
        default implementation of this method calls the `run` method, passing
        the input data objects as keyword arguments. Most simple tasks will
        implement the `run` method; more complex tasks that need to know
        about output DataIds will override this method instead.

        All three dictionary arguments to this method have keys equal to the
        names of the configuration fields for dataset types. If a dataset
        type is configured with the ``scalar`` field set to ``True`` then it
        is expected that only one dataset appears on input or output for that
        dataset type, and the dictionary value will be a single data object
        or DataId. Otherwise, if ``scalar`` is ``False`` (the default), the
        value will be a list (even if only one item is in the list).

        The method returns a `Struct` instance with attributes matching the
        configuration fields for output dataset types. Values stored in the
        returned struct are single objects if ``scalar`` is ``True`` or lists
        of objects otherwise. If the task produces more than one object for
        some dataset type then the data objects returned in ``struct`` must
        match in count and order the corresponding DataIds in
        ``outputDataIds``.

        Parameters
        ----------
        inputData : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are Python-domain data
            objects (or lists of objects) retrieved from data butler.
        inputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing input dataset types and values are DataIds (or lists
            of DataIds) that task consumes for corresponding dataset type.
            DataIds are guaranteed to match data objects in ``inputData``.
        outputDataIds : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are DataIds (or lists
            of DataIds) that task is to produce for corresponding dataset
            type.
        butler : object
            Data butler instance.

        Returns
        -------
        struct : `Struct`
            Standard convention is that this method should return a `Struct`
            instance containing all output data. Struct attribute names
            should correspond to the names of the configuration fields
            describing task output dataset types. If something different
            is returned then the `saveStruct` method has to be re-implemented
            accordingly.
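
        Examples
        --------
        A task that needs its output DataIds could override this method
        instead of `run` (sketch; the field names and the ``assemble`` helper
        are illustrative)::

            def adaptArgsAndRun(self, inputData, inputDataIds, outputDataIds, butler):
                # "coadd" is a scalar output dataset type, so a single DataId
                outputDataId = outputDataIds["coadd"]
                coadd = self.assemble(inputData["inputs"], outputDataId)
                return Struct(coadd=coadd)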
        """
        return self.run(**inputData)

    def run(self, **kwargs):
        """Run task algorithm on in-memory data.

        This method should be implemented in a subclass unless the task
        overrides `adaptArgsAndRun` to do something different from its
        default implementation. With the default implementation of
        `adaptArgsAndRun`, this method will receive keyword arguments whose
        names are the same as the names of configuration fields describing
        input dataset types. Argument values will be data objects retrieved
        from the data butler. If a dataset type is configured with the
        ``scalar`` field set to ``True`` then the argument value will be a
        single object, otherwise it will be a list of objects.

        If the task needs to know its input or output DataIds then it has to
        override the `adaptArgsAndRun` method instead.

        Returns
        -------
        struct : `Struct`
            See the description of the `adaptArgsAndRun` method.

        Examples
        --------
        A typical implementation of this method may look like::

            def run(self, input, calib):
                # "input", "calib", and "output" are the names of the
                # config fields

                # Assuming that input/calib datasets are `scalar` they are
                # simple objects; do something with inputs and calibs,
                # produce output image.
                image = self.makeImage(input, calib)

                # If output dataset is `scalar` then return object, not list
                return Struct(output=image)
        """
        raise NotImplementedError("run() is not implemented")

    def runQuantum(self, quantum, butler):
        """Execute PipelineTask algorithm on a single quantum of data.

        A typical implementation of this method will use the inputs from
        ``quantum`` to retrieve Python-domain objects from the data butler
        and call the `adaptArgsAndRun` method on that data. On return from
        `adaptArgsAndRun` this method will extract the data from the returned
        `Struct` instance and save that data to the butler.

        The `Struct` returned from `adaptArgsAndRun` is expected to contain
        data attributes with names equal to the names of the configuration
        fields defining output dataset types. The values of the data
        attributes must be data objects corresponding to the DataIds of the
        output dataset types. All data objects will be saved in the butler
        using DataRefs from the Quantum's output dictionary.

        This method does not return anything to the caller; on errors a
        corresponding exception is raised.

        Parameters
        ----------
        quantum : `Quantum`
            Object describing input and output corresponding to this
            invocation of PipelineTask instance.
        butler : object
            Data butler instance.

        Raises
        ------
        ScalarError
            Raised if a dataset type is configured as scalar but receives
            multiple DataIds in ``quantum``. Any exceptions raised by the
            data butler or by the `adaptArgsAndRun` method are propagated.
        """

        def makeDataRefs(descriptors, refMap):
            """Generate map of DatasetRefs and DataIds.

            Given a map of DatasetTypeDescriptor and a map of Quantum
            DatasetRefs, makes maps of DataIds and DatasetRefs.
            For scalar dataset types unpacks DatasetRefs and DataIds.

            Parameters
            ----------
            descriptors : `dict`
                Map of (dataset key, DatasetTypeDescriptor).
            refMap : `dict`
                Map of (dataset type name, DatasetRefs).

            Returns
            -------
            dataIds : `dict`
                Map of (dataset key, DataIds)
            dataRefs : `dict`
                Map of (dataset key, DatasetRefs)

            Raises
            ------
            ScalarError
                Raised if dataset type is configured as scalar but more than
                one DatasetRef exists for it.
            """
            dataIds = {}
            dataRefs = {}
            for key, descriptor in descriptors.items():
                datasetType = descriptor.makeDatasetType(butler.registry.dimensions)
                keyDataRefs = refMap[datasetType.name]
                keyDataIds = [dataRef.dataId for dataRef in keyDataRefs]
                if descriptor.scalar:
                    # unpack single-item lists
                    if len(keyDataRefs) != 1:
                        raise ScalarError(key, len(keyDataRefs))
                    keyDataRefs = keyDataRefs[0]
                    keyDataIds = keyDataIds[0]
                dataIds[key] = keyDataIds
                if not descriptor.manualLoad:
                    dataRefs[key] = keyDataRefs
            return dataIds, dataRefs

        # lists of DataRefs/DataIds for input datasets
        descriptors = self.getInputDatasetTypes(self.config)
        inputDataIds, inputDataRefs = makeDataRefs(descriptors, quantum.predictedInputs)

        # get all data from butler
        inputs = {}
        for key, dataRefs in inputDataRefs.items():
            if isinstance(dataRefs, list):
                inputs[key] = [butler.get(dataRef) for dataRef in dataRefs]
            else:
                inputs[key] = butler.get(dataRefs)
        del inputDataRefs

        # lists of DataRefs/DataIds for output datasets
        descriptors = self.getOutputDatasetTypes(self.config)
        outputDataIds, outputDataRefs = makeDataRefs(descriptors, quantum.outputs)

        # call run method with keyword arguments
        struct = self.adaptArgsAndRun(inputs, inputDataIds, outputDataIds, butler)

        # store produced output data
        self.saveStruct(struct, outputDataRefs, butler)

    def saveStruct(self, struct, outputDataRefs, butler):
        """Save data in butler.

        The convention is that the struct returned from ``run()`` has data
        field(s) with the same names as the config fields defining output
        DatasetTypes. Subclasses may override this method to implement a
        different convention for `Struct` content, or in case any
        post-processing of the data is needed.

        Parameters
        ----------
        struct : `Struct`
            Data produced by the task packed into a `Struct` instance.
        outputDataRefs : `dict`
            Dictionary whose keys are the names of the configuration fields
            describing output dataset types and values are lists of DataRefs.
            DataRefs must match corresponding data objects in ``struct`` in
            number and order.
        butler : object
            Data butler instance.
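
        Examples
        --------
        A sub-class that needs to post-process its outputs before writing
        could override this method and delegate to the base implementation
        (sketch; ``trimOutput`` is an illustrative helper)::

            def saveStruct(self, struct, outputDataRefs, butler):
                struct.output = self.trimOutput(struct.output)
                super().saveStruct(struct, outputDataRefs, butler)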
        """
        structDict = struct.getDict()
        descriptors = self.getOutputDatasetTypes(self.config)
        for key in descriptors.keys():
            dataList = structDict[key]
            dataRefs = outputDataRefs[key]
            if not isinstance(dataRefs, list):
                # scalar outputs, make them lists again
                dataRefs = [dataRefs]
                dataList = [dataList]
            # TODO: check that data objects and data refs are aligned
            for dataRef, data in zip(dataRefs, dataList):
                butler.put(data, dataRef.datasetType.name, dataRef.dataId)

    def getResourceConfig(self):
        """Return resource configuration for this task.

        Returns
        -------
        Object of type `~config.ResourceConfig` or ``None`` if resource
        configuration is not defined for this task.
        """
        return getattr(self.config, "resources", None)