lsst.pipe.base  18.1.0-9-gee19f03
graphBuilder.py
1 # This file is part of pipe_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 """Module defining GraphBuilder class and related methods.
24 """
25 
26 __all__ = ['GraphBuilder']
27 
28 # -------------------------------
29 # Imports of standard modules --
30 # -------------------------------
31 import copy
32 import itertools
33 from collections import ChainMap
34 from dataclasses import dataclass
35 from typing import Set, List, Dict, Optional, Iterable
36 import logging
37 
38 # -----------------------------
39 # Imports for other modules --
40 # -----------------------------
41 from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, Pipeline, TaskDef
42 from .graph import QuantumGraph, QuantumGraphTaskNodes
43 from lsst.daf.butler import (
44  DatasetRef,
45  DatasetType,
46  DimensionGraph,
47  DimensionUniverse,
48  ExpandedDataCoordinate,
49  Quantum,
50 )
51 from lsst.daf.butler.core.utils import NamedKeyDict
52 
53 # ----------------------------------
54 # Local non-exported definitions --
55 # ----------------------------------
56 
57 _LOG = logging.getLogger(__name__.partition(".")[2])
58 
59 
60 @dataclass
61 class _DatasetScaffolding:
62  """Helper class aggregating information about a `DatasetType`, used when
63  constructing a `QuantumGraph`.
64 
65  `_DatasetScaffolding` does not hold the `DatasetType` instance itself
66  because it is usually used as the value type in `_DatasetScaffoldingDict`,
67  which uses `DatasetType` instances as keys.
68 
69  See `_PipelineScaffolding` for a top-down description of the full
70  scaffolding data structure.
71 
72  Parameters
73  ----------
74  dimensions : `DimensionGraph`
75  Dimensions of the `DatasetType`.
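 
 Examples
 --------
 A minimal sketch (``datasetType`` is a hypothetical, already-constructed
 `DatasetType` here):
 
 >>> scaffolding = _DatasetScaffolding(datasetType.dimensions)
 >>> scaffolding.producer is None
 True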
76  """
77  def __init__(self, dimensions: DimensionGraph):
78  self.dimensions = dimensions
79  self.producer = None
80  self.consumers = {}
81  self.dataIds = set()
82  self.refs = []
83 
84  __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")
85 
86  dimensions: DimensionGraph
87  """The dimensions of the dataset type (`DimensionGraph`).
88 
89  Set during `_PipelineScaffolding` construction.
90  """
91 
92  producer: Optional[_TaskScaffolding]
93  """The scaffolding objects for the Task that produces this dataset.
94 
95  Set during `_PipelineScaffolding` construction.
96  """
97 
98  consumers: Dict[str, _TaskScaffolding]
99  """The scaffolding objects for the Tasks that consume this dataset,
100  keyed by their label in the `Pipeline`.
101 
102  Set during `_PipelineScaffolding` construction.
103  """
104 
105  dataIds: Set[ExpandedDataCoordinate]
106  """Data IDs for all instances of this dataset type in the graph.
107 
108  Populated after construction by `_PipelineScaffolding.fillDataIds`.
109  """
110 
111  refs: List[DatasetRef]
112  """References for all instances of this dataset type in the graph.
113 
114  Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
115  """
116 
117 
118 class _DatasetScaffoldingDict(NamedKeyDict):
119  """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.
120 
121  See `_PipelineScaffolding` for a top-down description of the full
122  scaffolding data structure.
123 
124  Parameters
125  ----------
126  args
127  Positional arguments are forwarded to the `dict` constructor.
128  universe : `DimensionUniverse`
129  Universe of all possible dimensions.
130  """
131  def __init__(self, *args, universe: DimensionUniverse):
132  super().__init__(*args)
133  self.universe = universe
134 
135  @classmethod
136  def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
137  universe: DimensionUniverse) -> _DatasetScaffoldingDict:
138  """Construct a dictionary from a flat iterable of `DatasetType` keys.
139 
140  Parameters
141  ----------
142  datasetTypes : `iterable` of `DatasetType`
143  DatasetTypes to use as keys for the dict. Values will be
144  constructed from the dimensions of the keys.
145  universe : `DimensionUniverse`
146  Universe of all possible dimensions.
147 
148  Returns
149  -------
150  dictionary : `_DatasetScaffoldingDict`
151  A new dictionary instance.
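 
 Examples
 --------
 A minimal sketch, assuming ``registry`` is a `~lsst.daf.butler.Registry`
 and ``dt1`` and ``dt2`` are hypothetical `DatasetType` instances:
 
 >>> scaffoldings = _DatasetScaffoldingDict.fromDatasetTypes(
 ...     [dt1, dt2], universe=registry.dimensions)
 >>> scaffoldings[dt1].dimensions == dt1.dimensions
 True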
152  """
153  return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions))
154  for datasetType in datasetTypes),
155  universe=universe)
156 
157  @classmethod
158  def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
159  *rest) -> _DatasetScaffoldingDict:
160  """Return a new dictionary by extracting items corresponding to the
161  given keys from one or more existing dictionaries.
162 
163  Parameters
164  ----------
165  datasetTypes : `iterable` of `DatasetType`
166  DatasetTypes to use as keys for the dict. Values will be obtained
167  by lookups against ``first`` and ``rest``.
168  first : `_DatasetScaffoldingDict`
169  Another dictionary from which to extract values.
170  rest
171  Additional dictionaries from which to extract values.
172 
173  Returns
174  -------
175  dictionary : `_DatasetScaffoldingDict`
176  A new dictionary instance.
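 
 Examples
 --------
 A minimal sketch mirroring how `_TaskScaffolding` builds its per-task
 views from the parent `_PipelineScaffolding` dictionaries (``parent``
 and ``taskInputTypes`` are hypothetical here):
 
 >>> inputs = _DatasetScaffoldingDict.fromSubset(
 ...     taskInputTypes, parent.inputs, parent.intermediates)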
177  """
178  combined = ChainMap(first, *rest)
179  return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
180  universe=first.universe)
181 
182  @property
183  def dimensions(self) -> DimensionGraph:
184  """The union of all dimensions used by all dataset types in this
185  dictionary, including implied dependencies (`DimensionGraph`).
186  """
187  base = self.universe.empty
188  if len(self) == 0:
189  return base
190  return base.union(*[scaffolding.dimensions for scaffolding in self.values()])
191 
192  def unpackRefs(self) -> NamedKeyDict:
193  """Unpack nested single-element `DatasetRef` lists into a new
194  dictionary.
195 
196  This method assumes that each `_DatasetScaffolding.refs` list contains
197  exactly one `DatasetRef`, as is the case for all "init" datasets.
198 
199  Returns
200  -------
201  dictionary : `NamedKeyDict`
202  Dictionary mapping `DatasetType` to `DatasetRef`, with both
203  `DatasetType` instances and string names usable as keys.
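 
 Examples
 --------
 A minimal sketch, assuming every value in ``scaffoldings`` holds exactly
 one `DatasetRef` (``scaffoldings`` and ``datasetType`` are hypothetical):
 
 >>> refs = scaffoldings.unpackRefs()
 >>> refs[datasetType] is scaffoldings[datasetType].refs[0]
 True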
204  """
205  return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items())
206 
207 
208 @dataclass
209 class _TaskScaffolding:
210  """Helper class aggregating information about a `PipelineTask`, used when
211  constructing a `QuantumGraph`.
212 
213  See `_PipelineScaffolding` for a top-down description of the full
214  scaffolding data structure.
215 
216  Parameters
217  ----------
218  taskDef : `TaskDef`
219  Data structure that identifies the task class and its config.
220  parent : `_PipelineScaffolding`
221  The parent data structure that will hold the instance being
222  constructed.
223  datasetTypes : `TaskDatasetTypes`
224  Data structure that categorizes the dataset types used by this task.
225 
226  Raises
227  ------
228  GraphBuilderError
229  Raised if the task's dimensions are not a subset of the union of the
230  pipeline's dataset dimensions.
231  """
232  def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
233  universe = parent.dimensions.universe
234  self.taskDef = taskDef
235  self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
236  if not self.dimensions.issubset(parent.dimensions):
237  raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
238  f"{self.dimensions} that are not a subset of "
239  f"the pipeline dimensions {parent.dimensions}.")
240  # Initialize _DatasetScaffoldingDicts as subsets of the one or two
241  # corresponding dicts in the parent _PipelineScaffolding.
242  self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
243  parent.initInputs, parent.initIntermediates)
244  self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
245  parent.initIntermediates, parent.initOutputs)
246  self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
247  parent.inputs, parent.intermediates)
248  self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
249  parent.intermediates, parent.outputs)
250  self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
251  parent.prerequisites)
252  # Add backreferences to the _DatasetScaffolding objects that point to
253  # this Task.
254  for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
255  self.prerequisites.values()):
256  dataset.consumers[self.taskDef.label] = self
257  for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
258  assert dataset.producer is None
259  dataset.producer = self
260  self.dataIds = set()
261  self.quanta = []
262 
263  taskDef: TaskDef
264  """Data structure that identifies the task class and its config
265  (`TaskDef`).
266  """
267 
268  dimensions: DimensionGraph
269  """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
270  """
271 
272  initInputs: _DatasetScaffoldingDict
273  """Dictionary containing information about datasets used to construct this
274  task (`_DatasetScaffoldingDict`).
275  """
276 
277  initOutputs: _DatasetScaffoldingDict
278  """Dictionary containing information about datasets produced as a
279  side-effect of constructing this task (`_DatasetScaffoldingDict`).
280  """
281 
282  inputs: _DatasetScaffoldingDict
283  """Dictionary containing information about datasets used as regular,
284  graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
285  """
286 
287  outputs: _DatasetScaffoldingDict
288  """Dictionary containing information about datasets produced by this task
289  (`_DatasetScaffoldingDict`).
290  """
291 
292  prerequisites: _DatasetScaffoldingDict
293  """Dictionary containing information about input datasets that must be
294  present in the repository before any Pipeline containing this task is run
295  (`_DatasetScaffoldingDict`).
296  """
297 
298  dataIds: Set[ExpandedDataCoordinate]
299  """Data IDs for all quanta for this task in the graph (`set` of
300  `ExpandedDataCoordinate`).
301 
302  Populated after construction by `_PipelineScaffolding.fillDataIds`.
303  """
304 
305  quanta: List[Quantum]
306  """All quanta for this task in the graph (`list` of `Quantum`).
307 
308  Populated after construction by `_PipelineScaffolding.fillQuanta`.
309  """
310 
311  def addQuantum(self, quantum: Quantum):
312  config = self.taskDef.config
313  connectionClass = config.connections.ConnectionsClass
314  connectionInstance = connectionClass(config=config)
315  # This will raise if one of the check conditions is not met, which is the intended
316  # behavior
317  result = connectionInstance.adjustQuantum(quantum.predictedInputs)
318  quantum._predictedInputs = NamedKeyDict(result)
319 
320  # If this function has reached this far, add the quantum.
321  self.quanta.append(quantum)
322 
323  def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
324  """Create a `QuantumGraphTaskNodes` instance from the information in
325  ``self``.
326 
327  Returns
328  -------
329  nodes : `QuantumGraphTaskNodes`
330  The `QuantumGraph` elements corresponding to this task.
331  """
332  return QuantumGraphTaskNodes(
333  taskDef=self.taskDef,
334  quanta=self.quanta,
335  initInputs=self.initInputs.unpackRefs(),
336  initOutputs=self.initOutputs.unpackRefs(),
337  )
338 
339 
340 @dataclass
341 class _PipelineScaffolding:
342  """A helper data structure that organizes the information involved in
343  constructing a `QuantumGraph` for a `Pipeline`.
344 
345  Parameters
346  ----------
347  pipeline : `Pipeline`
348  Sequence of tasks from which a graph is to be constructed. Must
349  have nested task classes already imported.
350  universe : `DimensionUniverse`
351  Universe of all possible dimensions.
352 
353  Raises
354  ------
355  GraphBuilderError
356  Raised if a task's dimensions are not a subset of the union of the
357  pipeline's dataset dimensions.
358 
359  Notes
360  -----
361  The scaffolding data structure contains nested data structures for both
362  tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
363  latter held by `_DatasetScaffoldingDict`. The dataset data structures are
364  shared between the pipeline-level structure (which aggregates all datasets
365  and categorizes them from the perspective of the complete pipeline) and the
366  individual tasks that use them as inputs and outputs.
367 
368  `QuantumGraph` construction proceeds in five steps, with each corresponding
369  to a different `_PipelineScaffolding` method:
370 
371  1. When `_PipelineScaffolding` is constructed, we extract and categorize
372  the DatasetTypes used by the pipeline (delegating to
373  `PipelineDatasetTypes.fromPipeline`), then use these to construct the
374  nested `_TaskScaffolding` and `_DatasetScaffolding` objects.
375 
376  2. In `fillDataIds`, we construct and run the "Big Join Query", which
377  returns related tuples of all dimensions used to identify any regular
378  input, output, and intermediate datasets (not prerequisites). We then
379  iterate over these tuples of related dimensions, identifying the subsets
380  that correspond to distinct data IDs for each task and dataset type.
381 
382  3. In `fillDatasetRefs`, we run follow-up queries against all of the
383  dataset data IDs previously identified, populating the
384  `_DatasetScaffolding.refs` lists - except for those for prerequisite
385  datasets, which cannot be resolved until distinct quanta are
386  identified.
387 
388  4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
389  the inputs and outputs for each `Quantum` and search for prerequisite
390  datasets, populating `_TaskScaffolding.quanta`.
391 
392  5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
393  per-task quanta identified in the previous step.
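 
 Examples
 --------
 A minimal sketch of the full five-step sequence (essentially what
 `GraphBuilder.makeGraph` does), assuming ``registry``,
 ``inputCollections``, ``outputCollection``, and ``userQuery`` are
 already defined:
 
 >>> scaffolding = _PipelineScaffolding(pipeline, registry=registry)
 >>> scaffolding.fillDataIds(registry, inputCollections, userQuery)
 >>> scaffolding.fillDatasetRefs(registry, inputCollections, outputCollection)
 >>> scaffolding.fillQuanta(registry, inputCollections)
 >>> graph = scaffolding.makeQuantumGraph()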
394  """
395  def __init__(self, pipeline, *, registry):
396  self.tasks = []
397  # Aggregate and categorize the DatasetTypes in the Pipeline.
398  datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
399  # Construct dictionaries that map those DatasetTypes to structures
400  # that will (later) hold additional information about them.
401  for attr in ("initInputs", "initIntermediates", "initOutputs",
402  "inputs", "intermediates", "outputs", "prerequisites"):
403  setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
404  universe=registry.dimensions))
405  # Aggregate all dimensions for all non-init, non-prerequisite
406  # DatasetTypes. These are the ones we'll include in the big join query.
407  self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
408  self.outputs.dimensions)
409  # Construct scaffolding nodes for each Task, and add backreferences
410  # to the Task from each DatasetScaffolding node.
411  # Note that there's only one scaffolding node for each DatasetType, shared by
412  # _PipelineScaffolding and all _TaskScaffoldings that reference it.
413  self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
414  for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())]
415 
416  tasks: List[_TaskScaffolding]
417  """Scaffolding data structures for each task in the pipeline
418  (`list` of `_TaskScaffolding`).
419  """
420 
421  initInputs: _DatasetScaffoldingDict
422  """Datasets consumed but not produced when constructing the tasks in this
423  pipeline (`_DatasetScaffoldingDict`).
424  """
425 
426  initIntermediates: _DatasetScaffoldingDict
427  """Datasets that are both consumed and produced when constructing the tasks
428  in this pipeline (`_DatasetScaffoldingDict`).
429  """
430 
431  initOutputs: _DatasetScaffoldingDict
432  """Datasets produced but not consumed when constructing the tasks in this
433  pipeline (`_DatasetScaffoldingDict`).
434  """
435 
436  inputs: _DatasetScaffoldingDict
437  """Datasets that are consumed but not produced when running this pipeline
438  (`_DatasetScaffoldingDict`).
439  """
440 
441  intermediates: _DatasetScaffoldingDict
442  """Datasets that are both produced and consumed when running this pipeline
443  (`_DatasetScaffoldingDict`).
444  """
445 
446  outputs: _DatasetScaffoldingDict
447  """Datasets produced but not consumed when running this pipeline
448  (`_DatasetScaffoldingDict`).
449  """
450 
451  prerequisites: _DatasetScaffoldingDict
452  """Datasets that are consumed when running this pipeline and looked up
453  per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
454  """
455 
456  dimensions: DimensionGraph
457  """All dimensions used by any regular input, intermediate, or output
458  (not prerequisite) dataset; the set of dimensions used in the "Big Join
459  Query" (`DimensionGraph`).
460 
461  This is required to be a superset of all task quantum dimensions.
462  """
463 
464  def fillDataIds(self, registry, inputCollections, userQuery):
465  """Query for the data IDs that connect nodes in the `QuantumGraph`.
466 
467  This method populates `_TaskScaffolding.dataIds` and
468  `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
469 
470  Parameters
471  ----------
472  registry : `lsst.daf.butler.Registry`
473  Registry for the data repository; used for all data ID queries.
474  inputCollections : `~collections.abc.Mapping`
475  Mapping from dataset type name to an ordered sequence of
476  collections to search for that dataset. A `defaultdict` is
477  recommended for the case where the same collections should be
478  used for most datasets.
479  userQuery : `str`, optional
480  User-provided expression to limit the data IDs processed.
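 
 Examples
 --------
 A minimal sketch, assuming ``registry`` and ``inputCollections`` are
 already defined; the expression string is hypothetical, and its exact
 syntax is defined by the registry's query system:
 
 >>> scaffolding.fillDataIds(registry, inputCollections,
 ...     "visit = 903334 AND detector = 22")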
481  """
482  # Initialization datasets always have empty data IDs.
483  emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
484  for scaffolding in itertools.chain(self.initInputs.values(),
485  self.initIntermediates.values(),
486  self.initOutputs.values()):
487  scaffolding.dataIds.add(emptyDataId)
488  # Run one big query for the data IDs for task dimensions and regular
489  # inputs and outputs. We limit the query to only dimensions that are
490  # associated with the input dataset types, but don't (yet) try to
491  # obtain the dataset_ids for those inputs.
492  resultIter = registry.queryDimensions(
493  self.dimensions,
494  datasets={
495  datasetType: inputCollections[datasetType.name]
496  for datasetType in self.inputs
497  },
498  where=userQuery,
499  )
500  # Iterate over query results and populate the data IDs of the nested
501  # task and dataset scaffolding objects, extracting the subsets of the common data
502  # ID from the query corresponding to the dimensions of each. By using
503  # sets, we remove duplicates caused by query rows in which the
504  # dimensions that change are not relevant for that task or dataset
505  # type. For example, if the Big Join Query involves the dimensions
506  # (instrument, visit, detector, skymap, tract, patch), we extract
507  # "calexp" data IDs from the instrument, visit, and detector values
508  # only, and rely on `set.add` to avoid duplications due to result rows
509  # in which only skymap, tract, and patch are varying. The Big Join
510  # Query is defined such that only visit+detector and tract+patch
511  # combinations that represent spatial overlaps are included in the
512  # results.
513  for commonDataId in resultIter:
514  for taskScaffolding in self.tasks:
515  taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions))
516  for datasetType, scaffolding in itertools.chain(self.inputs.items(),
517  self.intermediates.items(),
518  self.outputs.items()):
519  scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions))
520 
521  def fillDatasetRefs(self, registry, inputCollections, outputCollection, *,
522  skipExisting=True, clobberExisting=False):
523  """Perform follow-up queries for each dataset data ID produced in
524  `fillDataIds`.
525 
526  This method populates `_DatasetScaffolding.refs` (except for those in
527  `prerequisites`).
528 
529  Parameters
530  ----------
531  registry : `lsst.daf.butler.Registry`
532  Registry for the data repository; used for all data ID queries.
533  inputCollections : `~collections.abc.Mapping`
534  Mapping from dataset type name to an ordered sequence of
535  collections to search for that dataset. A `defaultdict` is
536  recommended for the case where the same collections should be
537  used for most datasets.
538  outputCollection : `str`
539  Collection for all output datasets.
540  skipExisting : `bool`, optional
541  If `True` (default), a Quantum is not created if all its outputs
542  already exist.
543  clobberExisting : `bool`, optional
544  If `True`, overwrite any outputs that already exist. Cannot be
545  `True` if ``skipExisting`` is.
546 
547  Raises
548  ------
549  ValueError
550  Raised if both `skipExisting` and `clobberExisting` are `True`.
551  OutputExistsError
552  Raised if an output dataset already exists in the output collection
553  and both ``skipExisting`` and ``clobberExisting`` are `False`. The
554  case where some but not all of a quantum's outputs are present and
555  ``skipExisting`` is `True` cannot be identified at this stage, and
556  is handled by `fillQuanta` instead.
557  """
558  if clobberExisting and skipExisting:
559  raise ValueError("clobberExisting and skipExisting cannot both be true.")
560  # Look up input and initInput datasets in the input collection(s).
561  for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
562  for dataId in scaffolding.dataIds:
563  refs = list(
564  registry.queryDatasets(
565  datasetType,
566  collections=inputCollections[datasetType.name],
567  dataId=dataId,
568  deduplicate=True,
569  expand=True,
570  )
571  )
572  assert len(refs) == 1, "BJQ guarantees exactly one input for each data ID."
573  scaffolding.refs.extend(refs)
574  # Look up [init] intermediate and output datasets in the output collection,
575  # unless clobberExisting is True (in which case we don't care if these
576  # already exist).
577  for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
578  self.initOutputs.items(),
579  self.intermediates.items(),
580  self.outputs.items()):
581  for dataId in scaffolding.dataIds:
582  # TODO: we could easily support per-DatasetType clobberExisting
583  # and skipExisting (it might make sense to put them in
584  # originInfo), and I could imagine that being useful - it's
585  # probably required in order to support writing initOutputs
586  # before QuantumGraph generation.
587  if clobberExisting:
588  ref = None
589  else:
590  ref = registry.find(collection=outputCollection, datasetType=datasetType, dataId=dataId)
591  if ref is None:
592  ref = DatasetRef(datasetType, dataId)
593  elif not skipExisting:
594  raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
595  f"output collection {outputCollection} with data ID {dataId}.")
596  scaffolding.refs.append(ref)
597  # Prerequisite dataset lookups are deferred until fillQuanta.
598 
599  def fillQuanta(self, registry, inputCollections, *, skipExisting=True):
600  """Define quanta for each task by splitting up the datasets associated
601  with each task data ID.
602 
603  This method populates `_TaskScaffolding.quanta`.
604 
605  Parameters
606  ----------
607  registry : `lsst.daf.butler.Registry`
608  Registry for the data repository; used for all data ID queries.
609  inputCollections : `~collections.abc.Mapping`
610  Mapping from dataset type name to an ordered sequence of
611  collections to search for that dataset. A `defaultdict` is
612  recommended for the case where the same collections should be
613  used for most datasets.
614  skipExisting : `bool`, optional
615  If `True` (default), a Quantum is not created if all its outputs
616  already exist.
617  """
618  for task in self.tasks:
619  for quantumDataId in task.dataIds:
620  # Identify the (regular) inputs that correspond to the Quantum
621  # with this data ID. These are those whose data IDs have the
622  # same values for all dimensions they have in common.
623  # We do this with data IDs expanded to include implied dimensions,
624  # which is why _DatasetScaffolding.dimensions is expanded
625  # even though DatasetType.dimensions is not.
626  inputs = NamedKeyDict()
627  for datasetType, scaffolding in task.inputs.items():
628  inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
629  if quantumDataId.matches(dataId)]
630  # Same for outputs.
631  outputs = NamedKeyDict()
632  allOutputsPresent = True
633  for datasetType, scaffolding in task.outputs.items():
634  outputs[datasetType] = []
635  for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
636  if quantumDataId.matches(dataId):
637  if ref.id is None:
638  allOutputsPresent = False
639  else:
640  assert skipExisting, "Existing outputs should have already been identified."
641  if not allOutputsPresent:
642  raise OutputExistsError(f"Output {datasetType.name} with data ID "
643  f"{dataId} already exists, but other outputs "
644  f"for task with label {task.taskDef.label} "
645  f"and data ID {quantumDataId} do not.")
646  outputs[datasetType].append(ref)
647  if allOutputsPresent and skipExisting:
648  continue
649 
650  # Look up prerequisite datasets in the input collection(s).
651  # These may have dimensions that extend beyond those we queried
652  # for originally, because we want to permit those data ID
653  # values to differ across quanta and dataset types.
654  # For example, the same quantum may have a flat and bias with
655  # a different calibration_label, or a refcat with a skypix
656  # value that overlaps the quantum's data ID's region, but not
657  # the user expression used for the initial query.
658  for datasetType, scaffolding in task.prerequisites.items():
659  refs = list(
660  registry.queryDatasets(
661  datasetType,
662  collections=inputCollections[datasetType.name],
663  dataId=quantumDataId,
664  deduplicate=True,
665  expand=True,
666  )
667  )
668  inputs[datasetType] = refs
669  task.addQuantum(
670  Quantum(
671  taskName=task.taskDef.taskName,
672  taskClass=task.taskDef.taskClass,
673  dataId=quantumDataId,
674  initInputs=task.initInputs.unpackRefs(),
675  predictedInputs=inputs,
676  outputs=outputs,
677  )
678  )
679 
680  def makeQuantumGraph(self):
681  """Create a `QuantumGraph` from the quanta already present in
682  the scaffolding data structure.
683  """
684  graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
685  graph.initInputs = self.initInputs.unpackRefs()
686  graph.initOutputs = self.initOutputs.unpackRefs()
687  graph.initIntermediates = self.initIntermediates.unpackRefs()
688  return graph
689 
690 
691 # ------------------------
692 # Exported definitions --
693 # ------------------------
694 
695 
696 class GraphBuilderError(Exception):
697  """Base class for exceptions generated by graph builder.
698  """
699  pass
700 
701 
702 class OutputExistsError(GraphBuilderError):
703  """Exception generated when output datasets already exist.
704  """
705  pass
706 
707 
708 class PrerequisiteMissingError(GraphBuilderError):
709  """Exception generated when a prerequisite dataset does not exist.
710  """
711  pass
712 
713 
714 class GraphBuilder(object):
715  """GraphBuilder is responsible for building a task execution graph from
716  a Pipeline.
717 
718  Parameters
719  ----------
720  taskFactory : `TaskFactory`
721  Factory object used to load/instantiate PipelineTasks.
722  registry : `~lsst.daf.butler.Registry`
723  Registry for the data repository.
724  skipExisting : `bool`, optional
725  If `True` (default), a Quantum is not created if all its outputs
726  already exist.
727  clobberExisting : `bool`, optional
728  If `True`, overwrite any outputs that already exist. Cannot be
729  `True` if ``skipExisting`` is.
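 
 Examples
 --------
 A minimal sketch, assuming ``taskFactory``, ``registry``, ``pipeline``,
 and ``userQuery`` are already defined; the collection names are
 hypothetical:
 
 >>> from collections import defaultdict
 >>> inputCollections = defaultdict(lambda: ["HSC/raw/all"])
 >>> builder = GraphBuilder(taskFactory, registry, skipExisting=True)
 >>> qgraph = builder.makeGraph(pipeline, inputCollections, "my_run",
 ...                            userQuery)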
730  """
731 
732  def __init__(self, taskFactory, registry, skipExisting=True, clobberExisting=False):
733  self.taskFactory = taskFactory
734  self.registry = registry
735  self.dimensions = registry.dimensions
736  self.skipExisting = skipExisting
737  self.clobberExisting = clobberExisting
738 
739  def _loadTaskClass(self, taskDef):
740  """Make sure task class is loaded.
741 
742  Load task class, update task name to make sure it is fully-qualified,
743  do not update original taskDef in a Pipeline though.
744 
745  Parameters
746  ----------
747  taskDef : `TaskDef`
748  Task definition; its task class may not be loaded yet.
749  Returns
750  -------
751  `TaskDef` instance; may be the same as the input ``taskDef`` if the
752  task class is already loaded.
753  """
754  if taskDef.taskClass is None:
755  tClass, tName = self.taskFactory.loadTaskClass(taskDef.taskName)
756  taskDef = copy.copy(taskDef)
757  taskDef.taskClass = tClass
758  taskDef.taskName = tName
759  return taskDef
760 
761  def makeGraph(self, pipeline, inputCollections, outputCollection, userQuery):
762  """Create execution graph for a pipeline.
763 
764  Parameters
765  ----------
766  pipeline : `Pipeline`
767  Pipeline definition, task names/classes and their configs.
768  inputCollections : `~collections.abc.Mapping`
769  Mapping from dataset type name to an ordered sequence of
770  collections to search for that dataset. A `defaultdict` is
771  recommended for the case where the same collections should be
772  used for most datasets.
773  outputCollection : `str`
774  Collection for all output datasets.
775  userQuery : `str`
776  String that defines a user-provided selection for the registry; should
777  be empty or `None` if there are no restrictions on data selection.
778 
779  Returns
780  -------
781  graph : `QuantumGraph`
782 
783  Raises
784  ------
785  UserExpressionError
786  Raised when user expression cannot be parsed.
787  OutputExistsError
788  Raised when output datasets already exist.
789  Exception
790  Other exception types may be raised by underlying registry
791  classes.
792  """
793  # Make sure all task classes are loaded, creating a new Pipeline
794  # to avoid modifying the input one.
795  # TODO: in the future, it would be preferable for `Pipeline` to
796  # guarantee that its Task classes have been imported to avoid this
797  # sort of two-stage initialization.
798  pipeline = Pipeline([self._loadTaskClass(taskDef) for taskDef in pipeline])
799 
800  scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
801 
802  scaffolding.fillDataIds(self.registry, inputCollections, userQuery)
803  scaffolding.fillDatasetRefs(self.registry, inputCollections, outputCollection,
804  skipExisting=self.skipExisting,
805  clobberExisting=self.clobberExisting)
806  scaffolding.fillQuanta(self.registry, inputCollections,
807  skipExisting=self.skipExisting)
808 
809  return scaffolding.makeQuantumGraph()