lsst.pipe.base  19.0.0-23-gdc29a50+2
graphBuilder.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from dataclasses import dataclass
from typing import Set, List, Dict, Optional, Iterable
import logging

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    ExpandedDataCoordinate,
    NamedKeyDict,
    Quantum,
)

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])

@dataclass
class _DatasetScaffolding:
    """Helper class aggregating information about a `DatasetType`, used when
    constructing a `QuantumGraph`.

    `_DatasetScaffolding` does not hold the `DatasetType` instance itself
    because it is usually used as the value type in `_DatasetScaffoldingDict`,
    which uses `DatasetType` instances as keys.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    dimensions : `DimensionGraph`
        Dimensions of the `DatasetType`.
    """
    def __init__(self, dimensions: DimensionGraph):
        self.dimensions = dimensions
        self.producer = None
        self.consumers = {}
        self.dataIds = set()
        self.refs = []

    __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_DatasetScaffolding(dimensions={self.dimensions}, ...)"

    dimensions: DimensionGraph
    """The dimensions of the dataset type (`DimensionGraph`).

    Set during `_PipelineScaffolding` construction.
    """

    producer: Optional[_TaskScaffolding]
    """The scaffolding object for the Task that produces this dataset.

    Set during `_PipelineScaffolding` construction.
    """

    consumers: Dict[str, _TaskScaffolding]
    """The scaffolding objects for the Tasks that consume this dataset,
    keyed by their label in the `Pipeline`.

    Set during `_PipelineScaffolding` construction.
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    refs: List[DatasetRef]
    """References for all instances of this dataset type in the graph.

    Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
    """

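# A minimal lifecycle sketch for a scaffolding node (illustrative only; the
# "calexp" dataset type and the registry/data ID objects below are
# assumptions, not part of this module):
#
#     calexpType = registry.getDatasetType("calexp")
#     node = _DatasetScaffolding(calexpType.dimensions)
#     node.dataIds.add(expandedDataId)                          # by fillDataIds
#     node.refs.append(DatasetRef(calexpType, expandedDataId))  # by fillDatasetRefs
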
class _DatasetScaffoldingDict(NamedKeyDict):
    """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetScaffoldingDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be
            constructed from the dimensions of the keys.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions))
                    for datasetType in datasetTypes),
                   universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
                   *rest) -> _DatasetScaffoldingDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetScaffoldingDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetScaffoldingDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[scaffolding.dimensions for scaffolding in self.values()])

    def unpackRefs(self) -> NamedKeyDict:
        """Unpack nested single-element `DatasetRef` lists into a new
        dictionary.

        This method assumes that each `_DatasetScaffolding.refs` list contains
        exactly one `DatasetRef`, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items())

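# Illustrative sketch of how the subset constructor shares values between the
# pipeline-level and per-task dictionaries (all names here are hypothetical):
#
#     pipelineInputs = _DatasetScaffoldingDict.fromDatasetTypes(
#         allInputTypes, universe=registry.dimensions)
#     taskInputs = _DatasetScaffoldingDict.fromSubset(taskInputTypes, pipelineInputs)
#     # Both dicts hold the *same* _DatasetScaffolding instances, so data IDs
#     # and refs filled in at the pipeline level are visible to each task.
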
@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.

    Raises
    ------
    GraphBuilderError
        Raised if the task's dimensions are not a subset of the union of the
        pipeline's dataset dimensions.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        if not self.dimensions.issubset(parent.dimensions):
            raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
                                    f"{self.dimensions} that are not a subset of "
                                    f"the pipeline dimensions {parent.dimensions}.")

        # Initialize _DatasetScaffoldingDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
                                                             parent.initInputs, parent.initIntermediates)
        self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
                                                              parent.initIntermediates, parent.initOutputs)
        self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
                                                         parent.inputs, parent.intermediates)
        self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
                                                          parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
                                                                parent.prerequisites)
        # Add backreferences to the _DatasetScaffolding objects that point to
        # this Task.
        for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
                                       self.prerequisites.values()):
            dataset.consumers[self.taskDef.label] = self
        for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
            assert dataset.producer is None
            dataset.producer = self
        self.dataIds = set()
        self.quanta = []

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetScaffoldingDict`).
    """

    dataIds: Set[ExpandedDataCoordinate]
    """Data IDs for all quanta for this task in the graph (`set` of
    `ExpandedDataCoordinate`).

    Populated after construction by `_PipelineScaffolding.fillDataIds`.
    """

    quanta: List[Quantum]
    """All quanta for this task in the graph (`list` of `Quantum`).

    Populated after construction by `_PipelineScaffolding.fillQuanta`.
    """

    def addQuantum(self, quantum: Quantum):
        config = self.taskDef.config
        connectionClass = config.connections.ConnectionsClass
        connectionInstance = connectionClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        result = connectionInstance.adjustQuantum(quantum.predictedInputs)
        quantum._predictedInputs = NamedKeyDict(result)

        # If we have reached this point, the adjusted quantum is valid; add it.
        self.quanta.append(quantum)

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=self.quanta,
            initInputs=self.initInputs.unpackRefs(),
            initOutputs=self.initOutputs.unpackRefs(),
        )

@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.

    Raises
    ------
    GraphBuilderError
        Raised if a task's dimensions are not a subset of the union of the
        pipeline's dataset dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
    latter held by `_DatasetScaffoldingDict`.  The dataset data structures are
    shared between the pipeline-level structure (which aggregates all datasets
    and categorizes them from the perspective of the complete pipeline) and the
    individual tasks that use them as inputs and outputs.

    `QuantumGraph` construction proceeds in five steps, with each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetScaffolding` objects.

    2. In `fillDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the subsets
       that correspond to distinct data IDs for each task and dataset type.

    3. In `fillDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, populating the
       `_DatasetScaffolding.refs` lists - except for those for prerequisite
       datasets, which cannot be resolved until distinct quanta are
       identified.

    4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
       the inputs and outputs for each `Quantum` and search for prerequisite
       datasets, populating `_TaskScaffolding.quanta`.

    5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task quanta identified in the previous step.
    """
    def __init__(self, pipeline, *, registry):
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                                         universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetScaffoldingDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    initIntermediates: _DatasetScaffoldingDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetScaffoldingDict`).
    """

    initOutputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetScaffoldingDict`).
    """

    inputs: _DatasetScaffoldingDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    intermediates: _DatasetScaffoldingDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    outputs: _DatasetScaffoldingDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetScaffoldingDict`).
    """

    prerequisites: _DatasetScaffoldingDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def fillDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        # Initialization datasets always have empty data IDs.
        emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={})
        for scaffolding in itertools.chain(self.initInputs.values(),
                                           self.initIntermediates.values(),
                                           self.initOutputs.values()):
            scaffolding.dataIds.add(emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results and populate the data IDs in
        # self._TaskScaffolding.refs, extracting the subsets of the common data
        # ID from the query corresponding to the dimensions of each.  By using
        # sets, we remove duplicates caused by query rows in which the
        # dimensions that change are not relevant for that task or dataset
        # type.  For example, if the Big Join Query involves the dimensions
        # (instrument, visit, detector, skymap, tract, patch), we extract
        # "calexp" data IDs from the instrument, visit, and detector values
        # only, and rely on `set.add` to avoid duplications due to result rows
        # in which only skymap, tract, and patch are varying.  The Big Join
        # Query is defined such that only visit+detector and tract+patch
        # combinations that represent spatial overlaps are included in the
        # results.
        for commonDataId in resultIter:
            for taskScaffolding in self.tasks:
                taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions))
            for datasetType, scaffolding in itertools.chain(self.inputs.items(),
                                                            self.intermediates.items(),
                                                            self.outputs.items()):
                scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions))

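    # Illustrative sketch of the subset-and-deduplicate step above (dimension
    # names follow the docstring example; the values are hypothetical):
    #
    #     commonDataId spans (instrument, visit, detector, skymap, tract,
    #     patch); a "calexp"-like dataset uses only (instrument, visit,
    #     detector), so commonDataId.subset(scaffolding.dimensions) collapses
    #     every query row that differs only in skymap/tract/patch to the same
    #     data ID, and `set.add` keeps a single copy.
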
    def fillDatasetRefs(self, registry, collections, run, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `fillDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled by `fillQuanta`
            instead.
        """
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
            for dataId in scaffolding.dataIds:
                refs = list(
                    registry.queryDatasets(
                        datasetType,
                        collections=collections,
                        dataId=dataId,
                        deduplicate=True,
                        expand=True,
                    )
                )
                if len(refs) != 1:
                    raise RuntimeError(f"Expected exactly one instance of input {datasetType} "
                                       f"for data ID {dataId}; got {refs}.")
                scaffolding.refs.extend(refs)
        # Look up [init] intermediate and output datasets in the output
        # collection; when skipExisting is True, existing outputs are recorded
        # (so their quanta can be skipped later) rather than treated as errors.
        for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
                                                        self.initOutputs.items(),
                                                        self.intermediates.items(),
                                                        self.outputs.items()):
            for dataId in scaffolding.dataIds:
                # TODO: we could easily support per-DatasetType skipExisting
                # (it might make sense to put them in originInfo), and I could
                # imagine that being useful - it's probably required in order
                # to support writing initOutputs before QuantumGraph
                # generation.
                if run is not None:
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                else:
                    ref = None
                if ref is None:
                    ref = DatasetRef(datasetType, dataId)
                elif not skipExisting:
                    raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                            f"output RUN collection '{run}' with data ID {dataId}.")
                scaffolding.refs.append(ref)
        # Prerequisite dataset lookups are deferred until fillQuanta.

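    # Sketch of the output-ref resolution above (the run name is a
    # hypothetical value):
    #
    #     ref = registry.findDataset(datasetType=datasetType, dataId=dataId,
    #                                collections="outputs/run1")
    #     # ref is None -> new output: create an unresolved DatasetRef
    #     # ref found and skipExisting -> keep the resolved ref (ref.id set),
    #     #     so fillQuanta can recognize and skip complete quanta
    #     # ref found and not skipExisting -> OutputExistsError
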
    def fillQuanta(self, registry, collections, *, skipExisting=True):
        """Define quanta for each task by splitting up the datasets associated
        with each task data ID.

        This method populates `_TaskScaffolding.quanta`.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist.
        """
        for task in self.tasks:
            for quantumDataId in task.dataIds:
                # Identify the (regular) inputs that correspond to the Quantum
                # with this data ID.  These are those whose data IDs have the
                # same values for all dimensions they have in common.
                # We do this with data IDs expanded to include implied
                # dimensions, which is why _DatasetScaffolding.dimensions is
                # expanded even though DatasetType.dimensions is not.
                inputs = NamedKeyDict()
                for datasetType, scaffolding in task.inputs.items():
                    inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
                                           if registry.relateDataIds(quantumDataId, dataId)]

                _LOG.debug("%s dataId %s has inputs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))

                # Same for outputs.
                outputs = NamedKeyDict()
                allOutputsPresent = True
                for datasetType, scaffolding in task.outputs.items():
                    outputs[datasetType] = []
                    for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
                        if registry.relateDataIds(quantumDataId, dataId):
                            if ref.id is None:
                                allOutputsPresent = False
                            else:
                                assert skipExisting, "Existing outputs should have already been identified."
                                if not allOutputsPresent:
                                    raise OutputExistsError(f"Output {datasetType.name} with data ID "
                                                            f"{dataId} already exists, but other outputs "
                                                            f"for task with label {task.taskDef.label} "
                                                            f"and data ID {quantumDataId} do not.")
                            outputs[datasetType].append(ref)
                if allOutputsPresent and skipExisting:
                    continue

                _LOG.debug("%s dataId %s has outputs: %s",
                           task.taskDef.taskName, quantumDataId, list(outputs.names))

                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data ID
                # values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                connections = task.taskDef.connections
                for con_name in connections.prerequisiteInputs:
                    con = getattr(connections, con_name)
                    for datasetType in task.prerequisites:
                        if datasetType.name == con.name:
                            break
                    if con.lookupFunction is not None:
                        refs = list(con.lookupFunction(datasetType, registry,
                                                       quantumDataId, collections))
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantumDataId,
                                deduplicate=True,
                                expand=True,
                            )
                        )
                    inputs[datasetType] = refs

                _LOG.debug("%s dataId %s has inputs+prereqs: %s",
                           task.taskDef.taskName, quantumDataId, list(inputs.names))

                task.addQuantum(
                    Quantum(
                        taskName=task.taskDef.taskName,
                        taskClass=task.taskDef.taskClass,
                        dataId=quantumDataId,
                        initInputs=task.initInputs.unpackRefs(),
                        predictedInputs=inputs,
                        outputs=outputs,
                    )
                )

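    # Sketch of the per-quantum input matching above (data ID values are
    # hypothetical):
    #
    #     quantumDataId:    {instrument: 'X', visit: 1, detector: 2}
    #     candidate input:  {instrument: 'X', visit: 1, detector: 2} -> kept
    #     candidate input:  {instrument: 'X', visit: 1, detector: 3} -> dropped
    #
    # registry.relateDataIds returns a truthy result only when the two data
    # IDs are consistent in the dimensions they share, so each quantum picks
    # up just the refs that belong to it.
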
    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackRefs()
        graph.initOutputs = self.initOutputs.unpackRefs()
        graph.initIntermediates = self.initIntermediates.unpackRefs()
        return graph


# ------------------------
#  Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder is responsible for building a task execution graph from
    a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String that defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on the
            data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        scaffolding.fillDataIds(self.registry, collections, userQuery)
        scaffolding.fillDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
        scaffolding.fillQuanta(self.registry, collections, skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()
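

# A minimal usage sketch (illustrative; the repository path, collection names,
# pipeline file, and query string below are assumptions, not part of this
# module):
#
#     from lsst.daf.butler import Butler, CollectionSearch
#     from lsst.pipe.base import GraphBuilder, Pipeline
#
#     butler = Butler("/path/to/repo")
#     pipeline = Pipeline.fromFile("myPipeline.yaml")
#     builder = GraphBuilder(butler.registry, skipExisting=True)
#     graph = builder.makeGraph(
#         pipeline,
#         collections=CollectionSearch.fromExpression("inputs"),
#         run="outputs/run1",
#         userQuery="instrument='X' AND visit=1",
#     )
#     # `graph` is a QuantumGraph; iterate its quanta or save it for execution.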