lsst.pipe.base  19.0.0-14-g91c0010+1
graphBuilder.py
1 # This file is part of pipe_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 """Module defining GraphBuilder class and related methods.
24 """
25 
26 __all__ = ['GraphBuilder']
27 
28 # -------------------------------
29 # Imports of standard modules --
30 # -------------------------------
31 import itertools
32 from collections import ChainMap
33 from dataclasses import dataclass
34 from typing import Set, List, Dict, Optional, Iterable
35 import logging
36 
37 # -----------------------------
38 # Imports for other modules --
39 # -----------------------------
40 from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
41 from .graph import QuantumGraph, QuantumGraphTaskNodes
42 from lsst.daf.butler import (
43  DatasetRef,
44  DatasetType,
45  DimensionGraph,
46  DimensionUniverse,
47  ExpandedDataCoordinate,
48  Quantum,
49 )
50 from lsst.daf.butler.core.utils import NamedKeyDict
51 
52 # ----------------------------------
53 # Local non-exported definitions --
54 # ----------------------------------
55 
56 _LOG = logging.getLogger(__name__.partition(".")[2])
57 
58 
59 @dataclass
60 class _DatasetScaffolding:
61  """Helper class aggregating information about a `DatasetType`, used when
62  constructing a `QuantumGraph`.
63 
64  `_DatasetScaffolding` does not hold the `DatasetType` instance itself
65  because it is usually used as the value type in `_DatasetScaffoldingDict`,
66  which uses `DatasetType` instances as keys.
67 
68  See `_PipelineScaffolding` for a top-down description of the full
69  scaffolding data structure.
70 
71  Parameters
72  ----------
73  dimensions : `DimensionGraph`
74  Dimensions of the `DatasetType`.
75  """
76  def __init__(self, dimensions: DimensionGraph):
77  self.dimensions = dimensions
78  self.producer = None
79  self.consumers = {}
80  self.dataIds = set()
81  self.refs = []
82 
83  __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")
84 
85  dimensions: DimensionGraph
86  """The dimensions of the dataset type (`DimensionGraph`).
87 
88  Set during `_PipelineScaffolding` construction.
89  """
90 
91  producer: Optional[_TaskScaffolding]
92  """The scaffolding objects for the Task that produces this dataset.
93 
94  Set during `_PipelineScaffolding` construction.
95  """
96 
97  consumers: Dict[str, _TaskScaffolding]
98  """The scaffolding objects for the Tasks that consume this dataset,
99  keyed by their label in the `Pipeline`.
100 
101  Set during `_PipelineScaffolding` construction.
102  """
103 
104  dataIds: Set[ExpandedDataCoordinate]
105  """Data IDs for all instances of this dataset type in the graph.
106 
107  Populated after construction by `_PipelineScaffolding.fillDataIds`.
108  """
109 
110  refs: List[DatasetRef]
111  """References for all instances of this dataset type in the graph.
112 
113  Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
114  """
115 
116 
117 class _DatasetScaffoldingDict(NamedKeyDict):
118  """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.
119 
120  See `_PipelineScaffolding` for a top-down description of the full
121  scaffolding data structure.
122 
123  Parameters
124  ----------
125  args
126  Positional arguments are forwarded to the `dict` constructor.
127  universe : `DimensionUniverse`
128  Universe of all possible dimensions.
129  """
130  def __init__(self, *args, universe: DimensionUniverse):
131  super().__init__(*args)
132  self.universe = universe
133 
134  @classmethod
135  def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
136  universe: DimensionUniverse) -> _DatasetScaffoldingDict:
137  """Construct a dictionary from a flat iterable of `DatasetType` keys.
138 
139  Parameters
140  ----------
141  datasetTypes : `iterable` of `DatasetType`
142  DatasetTypes to use as keys for the dict. Values will be
143  constructed from the dimensions of the keys.
144  universe : `DimensionUniverse`
145  Universe of all possible dimensions.
146 
147  Returns
148  -------
149  dictionary : `_DatasetScaffoldingDict`
150  A new dictionary instance.
151  """
152  return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions))
153  for datasetType in datasetTypes),
154  universe=universe)
155 
156  @classmethod
157  def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
158  *rest) -> _DatasetScaffoldingDict:
159  """Return a new dictionary by extracting items corresponding to the
160  given keys from one or more existing dictionaries.
161 
162  Parameters
163  ----------
164  datasetTypes : `iterable` of `DatasetType`
165  DatasetTypes to use as keys for the dict. Values will be obtained
166  by lookups against ``first`` and ``rest``.
167  first : `_DatasetScaffoldingDict`
168  Another dictionary from which to extract values.
169  rest
170  Additional dictionaries from which to extract values.
171 
172  Returns
173  -------
174  dictionary : `_DatasetScaffoldingDict`
175  A new dictionary instance.
176  """
177  combined = ChainMap(first, *rest)
178  return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
179  universe=first.universe)
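
# Note: `ChainMap` searches its maps left to right, so each dataset type is
# looked up first in ``first`` and then in each of ``rest``; the shared
# scaffolding object from whichever dictionary contains it is reused rather
# than copied. A minimal sketch of that lookup behavior, with toy string
# values standing in for `_DatasetScaffolding` instances:
#
#     from collections import ChainMap
#     first = {"calexp": "from-inputs"}
#     rest = {"src": "from-intermediates"}
#     combined = ChainMap(first, rest)
#     assert combined["calexp"] == "from-inputs"
#     assert combined["src"] == "from-intermediates"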
180 
181  @property
182  def dimensions(self) -> DimensionGraph:
183  """The union of all dimensions used by all dataset types in this
184  dictionary, including implied dependencies (`DimensionGraph`).
185  """
186  base = self.universe.empty
187  if len(self) == 0:
188  return base
189  return base.union(*[scaffolding.dimensions for scaffolding in self.values()])
190 
191  def unpackRefs(self) -> NamedKeyDict:
192  """Unpack nested single-element `DatasetRef` lists into a new
193  dictionary.
194 
195  This method assumes that each `_DatasetScaffolding.refs` list contains
196  exactly one `DatasetRef`, as is the case for all "init" datasets.
197 
198  Returns
199  -------
200  dictionary : `NamedKeyDict`
201  Dictionary mapping `DatasetType` to `DatasetRef`, with both
202  `DatasetType` instances and string names usable as keys.
203  """
204  return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items())
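
# Illustrative sketch of the dual-key lookup that callers of `unpackRefs` rely
# on: `NamedKeyDict` requires keys with a ``name`` attribute and (as used in
# this module) permits lookup by either the key object or its name. The key
# class and values below are toy stand-ins, not real butler objects:
#
#     class _ToyType:
#         def __init__(self, name):
#             self.name = name
#
#     calexp = _ToyType("calexp")
#     d = NamedKeyDict({calexp: "ref-0"})
#     assert d[calexp] == "ref-0"    # lookup by key object
#     assert d["calexp"] == "ref-0"  # lookup by name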
205 
206 
207 @dataclass
208 class _TaskScaffolding:
209  """Helper class aggregating information about a `PipelineTask`, used when
210  constructing a `QuantumGraph`.
211 
212  See `_PipelineScaffolding` for a top-down description of the full
213  scaffolding data structure.
214 
215  Parameters
216  ----------
217  taskDef : `TaskDef`
218  Data structure that identifies the task class and its config.
219  parent : `_PipelineScaffolding`
220  The parent data structure that will hold the instance being
221  constructed.
222  datasetTypes : `TaskDatasetTypes`
223  Data structure that categorizes the dataset types used by this task.
224 
225  Raises
226  ------
227  GraphBuilderError
228  Raised if the task's dimensions are not a subset of the union of the
229  pipeline's dataset dimensions.
230  """
231  def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
232  universe = parent.dimensions.universe
233  self.taskDef = taskDef
234  self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
235  if not self.dimensions.issubset(parent.dimensions):
236  raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
237  f"{self.dimensions} that are not a subset of "
238  f"the pipeline dimensions {parent.dimensions}.")
239 
240  # Initialize _DatasetScaffoldingDicts as subsets of the one or two
241  # corresponding dicts in the parent _PipelineScaffolding.
242  self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
243  parent.initInputs, parent.initIntermediates)
244  self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
245  parent.initIntermediates, parent.initOutputs)
246  self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
247  parent.inputs, parent.intermediates)
248  self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
249  parent.intermediates, parent.outputs)
250  self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
251  parent.prerequisites)
252  # Add backreferences to the _DatasetScaffolding objects that point to
253  # this Task.
254  for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
255  self.prerequisites.values()):
256  dataset.consumers[self.taskDef.label] = self
257  for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
258  assert dataset.producer is None
259  dataset.producer = self
260  self.dataIds = set()
261  self.quanta = []
262 
263  taskDef: TaskDef
264  """Data structure that identifies the task class and its config
265  (`TaskDef`).
266  """
267 
268  dimensions: DimensionGraph
269  """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
270  """
271 
272  initInputs: _DatasetScaffoldingDict
273  """Dictionary containing information about datasets used to construct this
274  task (`_DatasetScaffoldingDict`).
275  """
276 
277  initOutputs: _DatasetScaffoldingDict
278  """Dictionary containing information about datasets produced as a
279  side-effect of constructing this task (`_DatasetScaffoldingDict`).
280  """
281 
282  inputs: _DatasetScaffoldingDict
283  """Dictionary containing information about datasets used as regular,
284  graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
285  """
286 
287  outputs: _DatasetScaffoldingDict
288  """Dictionary containing information about datasets produced by this task
289  (`_DatasetScaffoldingDict`).
290  """
291 
292  prerequisites: _DatasetScaffoldingDict
293  """Dictionary containing information about input datasets that must be
294  present in the repository before any Pipeline containing this task is run
295  (`_DatasetScaffoldingDict`).
296  """
297 
298  dataIds: Set[ExpandedDataCoordinate]
299  """Data IDs for all quanta for this task in the graph (`set` of
300  `ExpandedDataCoordinate`).
301 
302  Populated after construction by `_PipelineScaffolding.fillDataIds`.
303  """
304 
305  quanta: List[Quantum]
306  """All quanta for this task in the graph (`list` of `Quantum`).
307 
308  Populated after construction by `_PipelineScaffolding.fillQuanta`.
309  """
310 
311  def addQuantum(self, quantum: Quantum):
312  config = self.taskDef.config
313  connectionClass = config.connections.ConnectionsClass
314  connectionInstance = connectionClass(config=config)
315  # This will raise if one of the check conditions is not met, which is the intended
316  # behavior
317  result = connectionInstance.adjustQuantum(quantum.predictedInputs)
318  quantum._predictedInputs = NamedKeyDict(result)
319 
320  # If no exception was raised above, the adjusted quantum is valid; add it.
321  self.quanta.append(quantum)
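
# For reference, a connections class can narrow or validate a quantum's inputs
# by overriding ``adjustQuantum``. The only contract assumed here is what the
# call above relies on: the override receives the mapping of predicted inputs
# and returns the (possibly modified) mapping, raising if a required input is
# unusable. Hypothetical sketch (class and attribute names are illustrative):
#
#     class MyConnections(PipelineTaskConnections, dimensions=("visit", "detector")):
#         def adjustQuantum(self, datasetRefMap):
#             datasetRefMap = super().adjustQuantum(datasetRefMap)
#             # e.g. drop refs for an optional input, or raise for a bad one
#             return datasetRefMap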
322 
323  def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
324  """Create a `QuantumGraphTaskNodes` instance from the information in
325  ``self``.
326 
327  Returns
328  -------
329  nodes : `QuantumGraphTaskNodes`
330  The `QuantumGraph` elements corresponding to this task.
331  """
332  return QuantumGraphTaskNodes(
333  taskDef=self.taskDef,
334  quanta=self.quanta,
335  initInputs=self.initInputs.unpackRefs(),
336  initOutputs=self.initOutputs.unpackRefs(),
337  )
338 
339 
340 @dataclass
341 class _PipelineScaffolding:
342  """A helper data structure that organizes the information involved in
343  constructing a `QuantumGraph` for a `Pipeline`.
344 
345  Parameters
346  ----------
347  pipeline : `Pipeline`
348  Sequence of tasks from which a graph is to be constructed. Must
349  have nested task classes already imported.
350  universe : `DimensionUniverse`
351  Universe of all possible dimensions.
352 
353  Raises
354  ------
355  GraphBuilderError
356  Raised if the task's dimensions are not a subset of the union of the
357  pipeline's dataset dimensions.
358 
359  Notes
360  -----
361  The scaffolding data structure contains nested data structures for both
362  tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
363  latter held by `_DatasetScaffoldingDict`. The dataset data structures are
364  shared between the pipeline-level structure (which aggregates all datasets
365  and categorizes them from the perspective of the complete pipeline) and the
366  individual tasks that use them as inputs and outputs.
367 
368  `QuantumGraph` construction proceeds in five steps, with each corresponding
369  to a different `_PipelineScaffolding` method:
370 
371  1. When `_PipelineScaffolding` is constructed, we extract and categorize
372  the DatasetTypes used by the pipeline (delegating to
373  `PipelineDatasetTypes.fromPipeline`), then use these to construct the
374  nested `_TaskScaffolding` and `_DatasetScaffolding` objects.
375 
376  2. In `fillDataIds`, we construct and run the "Big Join Query", which
377  returns related tuples of all dimensions used to identify any regular
378  input, output, and intermediate datasets (not prerequisites). We then
379  iterate over these tuples of related dimensions, identifying the subsets
380  that correspond to distinct data IDs for each task and dataset type.
381 
382  3. In `fillDatasetRefs`, we run follow-up queries against all of the
383  dataset data IDs previously identified, populating the
384  `_DatasetScaffolding.refs` lists - except for those for prerequisite
385  datasets, which cannot be resolved until distinct quanta are
386  identified.
387 
388  4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
389  the inputs and outputs for each `Quantum` and search for prerequisite
390  datasets, populating `_TaskScaffolding.quanta`.
391 
392  5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
393  per-task quanta identified in the previous step.
394  """
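
# Schematically, the five steps above are driven in order by
# `GraphBuilder.makeGraph` (defined at the bottom of this module):
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)   # step 1
#     scaffolding.fillDataIds(registry, collections, userQuery)         # step 2
#     scaffolding.fillDatasetRefs(registry, collections, run)           # step 3
#     scaffolding.fillQuanta(registry, collections)                     # step 4
#     graph = scaffolding.makeQuantumGraph()                            # step 5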
395  def __init__(self, pipeline, *, registry):
396  self.tasks = []
397  # Aggregate and categorize the DatasetTypes in the Pipeline.
398  datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
399  # Construct dictionaries that map those DatasetTypes to structures
400  # that will (later) hold additional information about them.
401  for attr in ("initInputs", "initIntermediates", "initOutputs",
402  "inputs", "intermediates", "outputs", "prerequisites"):
403  setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
404  universe=registry.dimensions))
405  # Aggregate all dimensions for all non-init, non-prerequisite
406  # DatasetTypes. These are the ones we'll include in the big join query.
407  self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
408  self.outputs.dimensions)
409  # Construct scaffolding nodes for each Task, and add backreferences
410  # to the Task from each DatasetScaffolding node.
411  # Note that there's only one scaffolding node for each DatasetType, shared by
412  # _PipelineScaffolding and all _TaskScaffoldings that reference it.
413  if isinstance(pipeline, Pipeline):
414  pipeline = pipeline.toExpandedPipeline()
415  self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
416  for taskDef, taskDatasetTypes in zip(pipeline,
417  datasetTypes.byTask.values())]
418 
419  tasks: List[_TaskScaffolding]
420  """Scaffolding data structures for each task in the pipeline
421  (`list` of `_TaskScaffolding`).
422  """
423 
424  initInputs: _DatasetScaffoldingDict
425  """Datasets consumed but not produced when constructing the tasks in this
426  pipeline (`_DatasetScaffoldingDict`).
427  """
428 
429  initIntermediates: _DatasetScaffoldingDict
430  """Datasets that are both consumed and produced when constructing the tasks
431  in this pipeline (`_DatasetScaffoldingDict`).
432  """
433 
434  initOutputs: _DatasetScaffoldingDict
435  """Datasets produced but not consumed when constructing the tasks in this
436  pipeline (`_DatasetScaffoldingDict`).
437  """
438 
439  inputs: _DatasetScaffoldingDict
440  """Datasets that are consumed but not produced when running this pipeline
441  (`_DatasetScaffoldingDict`).
442  """
443 
444  intermediates: _DatasetScaffoldingDict
445  """Datasets that are both produced and consumed when running this pipeline
446  (`_DatasetScaffoldingDict`).
447  """
448 
449  outputs: _DatasetScaffoldingDict
450  """Datasets produced but not consumed when running this pipeline
451  (`_DatasetScaffoldingDict`).
452  """
453 
454  prerequisites: _DatasetScaffoldingDict
455  """Datasets that are consumed when running this pipeline and looked up
456  per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
457  """
458 
459  dimensions: DimensionGraph
460  """All dimensions used by any regular input, intermediate, or output
461  (not prerequisite) dataset; the set of dimensions used in the "Big Join
462  Query" (`DimensionGraph`).
463 
464  This is required to be a superset of all task quantum dimensions.
465  """
466 
467  def fillDataIds(self, registry, collections, userQuery):
468  """Query for the data IDs that connect nodes in the `QuantumGraph`.
469 
470  This method populates `_TaskScaffolding.dataIds` and
471  `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
472 
473  Parameters
474  ----------
475  registry : `lsst.daf.butler.Registry`
476  Registry for the data repository; used for all data ID queries.
477  collections : `lsst.daf.butler.CollectionSearch`
478  Object representing the collections to search for input datasets.
479  userQuery : `str`, optional
480  User-provided expression to limit the data IDs processed.
481  """
482  # Initialization datasets always have empty data IDs.
483  emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
484  for scaffolding in itertools.chain(self.initInputs.values(),
485  self.initIntermediates.values(),
486  self.initOutputs.values()):
487  scaffolding.dataIds.add(emptyDataId)
488  # Run one big query for the data IDs for task dimensions and regular
489  # inputs and outputs. We limit the query to only dimensions that are
490  # associated with the input dataset types, but don't (yet) try to
491  # obtain the dataset_ids for those inputs.
492  resultIter = registry.queryDimensions(
493  self.dimensions,
494  datasets=list(self.inputs),
495  collections=collections,
496  where=userQuery,
497  )
498  # Iterate over query results and populate the data ID sets in the task
499  # and dataset scaffolding objects, extracting the subsets of the common
500  # data ID from the query corresponding to the dimensions of each. By using
501  # sets, we remove duplicates caused by query rows in which the
502  # dimensions that change are not relevant for that task or dataset
503  # type. For example, if the Big Join Query involves the dimensions
504  # (instrument, visit, detector, skymap, tract, patch), we extract
505  # "calexp" data IDs from the instrument, visit, and detector values
506  # only, and rely on `set.add` to avoid duplications due to result rows
507  # in which only skymap, tract, and patch are varying. The Big Join
508  # Query is defined such that only visit+detector and tract+patch
509  # combinations that represent spatial overlaps are included in the
510  # results.
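#
# As a toy illustration of that deduplication (plain dicts standing in for
# `ExpandedDataCoordinate` rows from the Big Join Query):
#
#     rows = [
#         {"visit": 1, "detector": 2, "tract": 0, "patch": 3},
#         {"visit": 1, "detector": 2, "tract": 0, "patch": 4},
#     ]
#     calexpDims = ("visit", "detector")
#     calexpIds = {tuple(row[d] for d in calexpDims) for row in rows}
#     assert len(calexpIds) == 1  # rows differ only in patch, which calexp ignores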
511  for commonDataId in resultIter:
512  for taskScaffolding in self.tasks:
513  taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions))
514  for datasetType, scaffolding in itertools.chain(self.inputs.items(),
515  self.intermediates.items(),
516  self.outputs.items()):
517  scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions))
518 
519  def fillDatasetRefs(self, registry, collections, run, *, skipExisting=True):
520  """Perform follow up queries for each dataset data ID produced in
521  `fillDataIds`.
522 
523  This method populates `_DatasetScaffolding.refs` (except for those in
524  `prerequisites`).
525 
526  Parameters
527  ----------
528  registry : `lsst.daf.butler.Registry`
529  Registry for the data repository; used for all data ID queries.
530  collections : `lsst.daf.butler.CollectionSearch`
531  Object representing the collections to search for input datasets.
532  run : `str`, optional
533  Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
534  output datasets, if it already exists.
535  skipExisting : `bool`, optional
536  If `True` (default), a Quantum is not created if all its outputs
537  already exist in ``run``. Ignored if ``run`` is `None`.
538 
539  Raises
540  ------
541  OutputExistsError
542  Raised if an output dataset already exists in the output run
543  and ``skipExisting`` is `False`. The case where some but not all
544  of a quantum's outputs are present and ``skipExisting`` is `True`
545  cannot be identified at this stage, and is handled by `fillQuanta`
546  instead.
547  """
548  # Look up input and initInput datasets in the input collection(s).
549  for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
550  for dataId in scaffolding.dataIds:
551  refs = list(
552  registry.queryDatasets(
553  datasetType,
554  collections=collections,
555  dataId=dataId,
556  deduplicate=True,
557  expand=True,
558  )
559  )
560  assert len(refs) == 1, "BJQ guarantees exactly one input for each data ID."
561  scaffolding.refs.extend(refs)
562  # Look up [init] intermediate and output datasets in the output RUN
563  # collection, unless ``run`` is None (in which case there is no existing
564  # collection to check against).
565  for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
566  self.initOutputs.items(),
567  self.intermediates.items(),
568  self.outputs.items()):
569  for dataId in scaffolding.dataIds:
570  # TODO: we could easily support per-DatasetType skipExisting
571  # (it might make sense to put them in originInfo), and I could
572  # imagine that being useful - it's probably required in order
573  # to support writing initOutputs before QuantumGraph
574  # generation.
575  if run is not None:
576  ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
577  else:
578  ref = None
579  if ref is None:
580  ref = DatasetRef(datasetType, dataId)
581  elif not skipExisting:
582  raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
583  f"output RUN collection '{run}' with data ID {dataId}.")
584  scaffolding.refs.append(ref)
585  # Prerequisite dataset lookups are deferred until fillQuanta.
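
# Summary of the per-data-ID decision made in the loop above (assuming the
# semantics documented in the docstring):
#
#     run is None                          -> new unresolved DatasetRef
#     run set, dataset not found           -> new unresolved DatasetRef
#     run set, found, skipExisting=True    -> reuse the existing (resolved) ref
#     run set, found, skipExisting=False   -> raise OutputExistsError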
586 
587  def fillQuanta(self, registry, collections, *, skipExisting=True):
588  """Define quanta for each task by splitting up the datasets associated
589  with each task data ID.
590 
591  This method populates `_TaskScaffolding.quanta`.
592 
593  Parameters
594  ----------
595  registry : `lsst.daf.butler.Registry`
596  Registry for the data repository; used for all data ID queries.
597  collections : `lsst.daf.butler.CollectionSearch`
598  Object representing the collections to search for input datasets.
599  skipExisting : `bool`, optional
600  If `True` (default), a Quantum is not created if all its outputs
601  already exist.
602  """
603  for task in self.tasks:
604  for quantumDataId in task.dataIds:
605  # Identify the (regular) inputs that correspond to the Quantum
606  # with this data ID. These are those whose data IDs have the
607  # same values for all dimensions they have in common.
608  # We do this using data IDs expanded to include implied dimensions,
609  # which is why _DatasetScaffolding.dimensions is expanded even though
610  # DatasetType.dimensions is not.
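#
# A toy sketch of the "same values for all dimensions in common" test used
# below (the hypothetical helper is not the real
# `ExpandedDataCoordinate.matches` API, just an illustration of its meaning):
#
#     def _toyMatches(a, b):
#         return all(a[k] == b[k] for k in a.keys() & b.keys())
#
#     quantumId = {"visit": 1, "detector": 2}
#     assert _toyMatches(quantumId, {"visit": 1})                    # agrees on common dims
#     assert not _toyMatches(quantumId, {"visit": 1, "detector": 3})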
611  inputs = NamedKeyDict()
612  for datasetType, scaffolding in task.inputs.items():
613  inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
614  if quantumDataId.matches(dataId)]
615 
616  _LOG.debug("%s dataId %s has inputs: %s",
617  task.taskDef.taskName, quantumDataId, list(inputs.names))
618 
619  # Same for outputs.
620  outputs = NamedKeyDict()
621  allOutputsPresent = True
622  for datasetType, scaffolding in task.outputs.items():
623  outputs[datasetType] = []
624  for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
625  if quantumDataId.matches(dataId):
626  if ref.id is None:
627  allOutputsPresent = False
628  else:
629  assert skipExisting, "Existing outputs should have already been identified."
630  if not allOutputsPresent:
631  raise OutputExistsError(f"Output {datasetType.name} with data ID "
632  f"{dataId} already exists, but other outputs "
633  f"for task with label {task.taskDef.label} "
634  f"and data ID {quantumDataId} do not.")
635  outputs[datasetType].append(ref)
636  if allOutputsPresent and skipExisting:
637  continue
638 
639  _LOG.debug("%s dataID %s has outputs: %s",
640  task.taskDef.taskName, quantumDataId, list(outputs.names))
641 
642  # Look up prerequisite datasets in the input collection(s).
643  # These may have dimensions that extend beyond those we queried
644  # for originally, because we want to permit those data ID
645  # values to differ across quanta and dataset types.
646  # For example, the same quantum may have a flat and bias with
647  # a different calibration_label, or a refcat with a skypix
648  # value that overlaps the quantum's data ID's region, but not
649  # the user expression used for the initial query.
650  connections = task.taskDef.connections
651  for con_name in connections.prerequisiteInputs:
652  con = getattr(connections, con_name)
653  for datasetType in task.prerequisites:
654  if datasetType.name == con.name:
655  break
656  if con.lookupFunction is not None:
657  refs = list(con.lookupFunction(datasetType, registry,
658  quantumDataId, collections))
659  else:
660  refs = list(
661  registry.queryDatasets(
662  datasetType,
663  collections=collections,
664  dataId=quantumDataId,
665  deduplicate=True,
666  expand=True,
667  )
668  )
669  inputs[datasetType] = refs
670 
671  _LOG.debug("%s dataID %s has inputs+prereqs: %s",
672  task.taskDef.taskName, quantumDataId, list(inputs.names))
673 
674  task.addQuantum(
675  Quantum(
676  taskName=task.taskDef.taskName,
677  taskClass=task.taskDef.taskClass,
678  dataId=quantumDataId,
679  initInputs=task.initInputs.unpackRefs(),
680  predictedInputs=inputs,
681  outputs=outputs,
682  )
683  )
684 
685  def makeQuantumGraph(self):
686  """Create a `QuantumGraph` from the quanta already present in
687  the scaffolding data structure.
688  """
689  graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
690  graph.initInputs = self.initInputs.unpackRefs()
691  graph.initOutputs = self.initOutputs.unpackRefs()
692  graph.initIntermediates = self.initIntermediates.unpackRefs()
693  return graph
694 
695 
696 # ------------------------
697 # Exported definitions --
698 # ------------------------
699 
700 
701 class GraphBuilderError(Exception):
702  """Base class for exceptions generated by graph builder.
703  """
704  pass
705 
706 
707 class OutputExistsError(GraphBuilderError):
708  """Exception generated when output datasets already exist.
709  """
710  pass
711 
712 
713 class PrerequisiteMissingError(GraphBuilderError):
714  """Exception generated when a prerequisite dataset does not exist.
715  """
716  pass
717 
718 
719 class GraphBuilder(object):
720  """GraphBuilder class is responsible for building a task execution graph
721  from a Pipeline.
722 
723  Parameters
724  ----------
725  registry : `~lsst.daf.butler.Registry`
726  Registry for the data repository; used for all data ID and dataset queries.
727  skipExisting : `bool`, optional
728  If `True` (default), a Quantum is not created if all its outputs
729  already exist.
730  """
731 
732  def __init__(self, registry, skipExisting=True):
733  self.registry = registry
734  self.dimensions = registry.dimensions
735  self.skipExisting = skipExisting
736 
737  def makeGraph(self, pipeline, collections, run, userQuery):
738  """Create execution graph for a pipeline.
739 
740  Parameters
741  ----------
742  pipeline : `Pipeline`
743  Pipeline definition, task names/classes and their configs.
744  collections : `lsst.daf.butler.CollectionSearch`
745  Object representing the collections to search for input datasets.
746  run : `str`, optional
747  Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
748  output datasets, if it already exists.
749  userQuery : `str`
750  String that defines a user selection for the registry; should be
751  empty or `None` if there are no restrictions on data selection.
752 
753  Returns
754  -------
755  graph : `QuantumGraph`
756  The constructed quantum graph.
757  Raises
758  ------
759  UserExpressionError
760  Raised when user expression cannot be parsed.
761  OutputExistsError
762  Raised when output datasets already exist.
763  Exception
764  Other exceptions types may be raised by underlying registry
765  classes.
766  """
767  scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
768  scaffolding.fillDataIds(self.registry, collections, userQuery)
769  scaffolding.fillDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
770  scaffolding.fillQuanta(self.registry, collections, skipExisting=self.skipExisting)
771  return scaffolding.makeQuantumGraph()
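
# Typical usage, assuming a butler `Registry`, a `Pipeline`, and a
# `CollectionSearch` are already in hand (collection and query strings below
# are purely illustrative):
#
#     builder = GraphBuilder(butler.registry, skipExisting=True)
#     quantumGraph = builder.makeGraph(
#         pipeline,
#         collections=collections,
#         run="u/someone/processing-run",
#         userQuery="visit = 12345",
#     )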