lsst.pipe.base  18.1.0-4-g6c9d669+2
graphBuilder.py
1 # This file is part of pipe_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 """Module defining GraphBuilder class and related methods.
24 """
25 
26 __all__ = ['GraphBuilder']
27 
28 # -------------------------------
29 # Imports of standard modules --
30 # -------------------------------
31 import copy
32 import itertools
33 from collections import ChainMap
34 from dataclasses import dataclass
35 from typing import Set, List, Dict, Optional, Iterable
36 import logging
37 
38 # -----------------------------
39 # Imports for other modules --
40 # -----------------------------
41 from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, Pipeline, TaskDef
42 from .graph import QuantumGraph, QuantumGraphTaskNodes
43 from lsst.daf.butler import Quantum, DatasetRef, DimensionGraph, DataId, DimensionUniverse, DatasetType
44 from lsst.daf.butler.core.utils import NamedKeyDict
45 from lsst.daf.butler.sql import DataIdQueryBuilder, SingleDatasetQueryBuilder
46 
47 # ----------------------------------
48 # Local non-exported definitions --
49 # ----------------------------------
50 
51 _LOG = logging.getLogger(__name__.partition(".")[2])
52 
53 
54 @dataclass
55 class _DatasetScaffolding:
56  """Helper class aggregating information about a `DatasetType`, used when
57  constructing a `QuantumGraph`.
58 
59  `_DatasetScaffolding` does not hold the `DatasetType` instance itself
60  because it is usually used as the value type in `_DatasetScaffoldingDict`,
61  which uses `DatasetType` instances as keys.
62 
63  See `_PipelineScaffolding` for a top-down description of the full
64  scaffolding data structure.
65 
66  Parameters
67  ----------
68  dimensions : `DimensionGraph`
69  Dimensions of the `DatasetType`, expanded to include implied
70  dependencies.
71  """
72  def __init__(self, dimensions: DimensionGraph):
73  self.dimensions = dimensions
74  self.producer = None
75  self.consumers = {}
76  self.dataIds = set()
77  self.refs = []
78 
79  __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")
80 
81  dimensions: DimensionGraph
82  """The dimensions of the dataset type, expanded to included implied
83  dependencies.
84 
85  Set during `_PipelineScaffolding` construction.
86  """
87 
88  producer: Optional[_TaskScaffolding]
89  """The scaffolding objects for the Task that produces this dataset.
90 
91  Set during `_PipelineScaffolding` construction.
92  """
93 
94  consumers: Dict[str, _TaskScaffolding]
95  """The scaffolding objects for the Tasks that consume this dataset,
96  keyed by their label in the `Pipeline`.
97 
98  Set during `_PipelineScaffolding` construction.
99  """
100 
101  dataIds: Set[DataId]
102  """Data IDs for all instances of this dataset type in the graph.
103 
104  These data IDs cover the full set of implied-expanded dimensions (i.e.
105  the `dimensions` attribute of this instance), which is a superset of the
106  dimensions used in `DatasetRef` instances (e.g. in ``refs``).
107 
108  Populated after construction by `_PipelineScaffolding.fillDataIds`.
109  """
110 
111  refs: List[DatasetRef]
112  """References for all instances of this dataset type in the graph.
113 
114  Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
115  """
116 
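# Illustrative sketch (editor addition, not part of the original module) of
# how a single _DatasetScaffolding node is wired up during
# _PipelineScaffolding construction; ``calexpType``, ``task``, ``other``,
# ``someDataId``, and ``someRef`` are hypothetical stand-ins.
#
#   node = _DatasetScaffolding(calexpType.dimensions.implied(only=False))
#   node.producer = task                  # the single task that writes it
#   node.consumers["otherTask"] = other   # tasks that read it, keyed by label
#   node.dataIds.add(someDataId)          # later filled by fillDataIds
#   node.refs.append(someRef)             # later filled by fillDatasetRefs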
117 
118 class _DatasetScaffoldingDict(NamedKeyDict):
119  """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.
120 
121  See `_PipelineScaffolding` for a top-down description of the full
122  scaffolding data structure.
123 
124  Parameters
125  ----------
126  args
127  Positional arguments are forwarded to the `dict` constructor.
128  universe : `DimensionUniverse`
129  Universe of all possible dimensions.
130  """
131  def __init__(self, *args, universe: DimensionUniverse):
132  super().__init__(*args)
133  self.universe = universe
134 
135  @classmethod
136  def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
137  universe: DimensionUniverse) -> _DatasetScaffoldingDict:
138  """Construct a a dictionary from a flat iterable of `DatasetType` keys.
139 
140  Parameters
141  ----------
142  datasetTypes : `iterable` of `DatasetType`
143  DatasetTypes to use as keys for the dict. Values will be
144  constructed from the dimensions of the keys.
145  universe : `DimensionUniverse`
146  Universe of all possible dimensions.
147 
148  Returns
149  -------
150  dictionary : `_DatasetScaffoldingDict`
151  A new dictionary instance.
152  """
153  return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions.implied(only=False)))
154  for datasetType in datasetTypes),
155  universe=universe)
156 
157  @classmethod
158  def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
159  *rest) -> _DatasetScaffoldingDict:
160  """Return a new dictionary by extracting items corresponding to the
161  given keys from one or more existing dictionaries.
162 
163  Parameters
164  ----------
165  datasetTypes : `iterable` of `DatasetType`
166  DatasetTypes to use as keys for the dict. Values will be obtained
167  by lookups against ``first`` and ``rest``.
168  first : `_DatasetScaffoldingDict`
169  Another dictionary from which to extract values.
170  rest
171  Additional dictionaries from which to extract values.
172 
173  Returns
174  -------
175  dictionary : `_DatasetScaffoldingDict`
176  A new dictionary instance.
177  """
178  combined = ChainMap(first, *rest)
179  return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
180  universe=first.universe)
181 
182  @property
183  def dimensions(self) -> DimensionGraph:
184  """The union of all dimensions used by all dataset types in this
185  dictionary, including implied dependencies (`DimensionGraph`).
186  """
187  base = self.universe.empty
188  if len(self) == 0:
189  return base
190  return base.union(*(scaffolding.dimensions for scaffolding in self.values()), implied=True)
191 
192  def unpackRefs(self) -> NamedKeyDict:
193  """Unpack nested single-element `DatasetRef` lists into a new
194  dictionary.
195 
196  This method assumes that each `_DatasetScaffolding.refs` list contains
197  exactly one `DatasetRef`, as is the case for all "init" datasets.
198 
199  Returns
200  -------
201  dictionary : `NamedKeyDict`
202  Dictionary mapping `DatasetType` to `DatasetRef`, with both
203  `DatasetType` instances and string names usable as keys.
204  """
205  return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items())
206 
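# Illustrative sketch (editor addition): values built by ``fromDatasetTypes``
# are shared, not copied, when extracted with ``fromSubset`` (it uses a
# ChainMap lookup), so updating a scaffolding node through a task-level dict
# also updates the pipeline-level dict. ``datasetTypes``, ``someSubset``,
# ``dt``, and ``universe`` are assumed inputs.
#
#   allInputs = _DatasetScaffoldingDict.fromDatasetTypes(datasetTypes, universe=universe)
#   taskInputs = _DatasetScaffoldingDict.fromSubset(someSubset, allInputs)
#   assert taskInputs[dt] is allInputs[dt]   # same _DatasetScaffolding instance
#   initRefs = taskInputs.unpackRefs()       # one DatasetRef per type (valid for "init" dicts)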
207 
208 @dataclass
209 class _TaskScaffolding:
210  """Helper class aggregating information about a `PipelineTask`, used when
211  constructing a `QuantumGraph`.
212 
213  See `_PipelineScaffolding` for a top-down description of the full
214  scaffolding data structure.
215 
216  Parameters
217  ----------
218  taskDef : `TaskDef`
219  Data structure that identifies the task class and its config.
220  parent : `_PipelineScaffolding`
221  The parent data structure that will hold the instance being
222  constructed.
223  datasetTypes : `TaskDatasetTypes`
224  Data structure that categorizes the dataset types used by this task.
225 
226  Raises
227  ------
228  GraphBuilderError
229  Raised if the task's dimensions are not a subset of the union of the
230  pipeline's dataset dimensions.
231  """
232  def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
233  universe = parent.dimensions.universe
234  self.taskDef = taskDef
235  self.dimensions = universe.extract(taskDef.connections.dimensions, implied=True)
236  if not self.dimensions.issubset(parent.dimensions):
237  raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
238  f"{self.dimensions.toSet()} that are not a subset of "
239  f"the pipeline dimensions {parent.dimensions.toSet()}.")
240  # Initialize _DatasetScaffoldingDicts as subsets of the one or two
241  # corresponding dicts in the parent _PipelineScaffolding.
242  self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
243  parent.initInputs, parent.initIntermediates)
244  self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
245  parent.initIntermediates, parent.initOutputs)
246  self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
247  parent.inputs, parent.intermediates)
248  self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
249  parent.intermediates, parent.outputs)
250  self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
251  parent.prerequisites)
252  # Add backreferences to the _DatasetScaffolding objects that point to
253  # this Task.
254  for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
255  self.prerequisites.values()):
256  dataset.consumers[self.taskDef.label] = self
257  for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
258  assert dataset.producer is None
259  dataset.producer = self
260  self.dataIds = set()
261  self.quanta = []
262 
263  taskDef: TaskDef
264  """Data structure that identifies the task class and its config
265  (`TaskDef`).
266  """
267 
268  dimensions: DimensionGraph
269  """The dimensions of a single `Quantum` of this task, expanded to include
270  implied dependencies (`DimensionGraph`).
271  """
272 
273  initInputs: _DatasetScaffoldingDict
274  """Dictionary containing information about datasets used to construct this
275  task (`_DatasetScaffoldingDict`).
276  """
277 
278  initOutputs: _DatasetScaffoldingDict
279  """Dictionary containing information about datasets produced as a
280  side-effect of constructing this task (`_DatasetScaffoldingDict`).
281  """
282 
283  inputs: _DatasetScaffoldingDict
284  """Dictionary containing information about datasets used as regular,
285  graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
286  """
287 
288  outputs: _DatasetScaffoldingDict
289  """Dictionary containing information about datasets produced by this task
290  (`_DatasetScaffoldingDict`).
291  """
292 
293  prerequisites: _DatasetScaffoldingDict
294  """Dictionary containing information about input datasets that must be
295  present in the repository before any Pipeline containing this task is run
296  (`_DatasetScaffoldingDict`).
297  """
298 
299  dataIds: Set[DataId]
300  """Data IDs for all quanta for this task in the graph (`set` of `DataId`).
301 
302  Populated after construction by `_PipelineScaffolding.fillDataIds`.
303  """
304 
305  quanta: List[Quantum]
306  """All quanta for this task in the graph (`list` of `Quantum`).
307 
308  Populated after construction by `_PipelineScaffolding.fillQuanta`.
309  """
310 
311  def addQuantum(self, quantum: Quantum):
312  config = self.taskDef.config
313  connectionClass = config.connections.ConnectionsClass
314  connectionInstance = connectionClass(config=config)
315  # This will raise if one of the check conditions is not met, which is the intended
316  # behavior
317  result = connectionInstance.adjustQuantum(quantum.predictedInputs)
318  quantum._predictedInputs = NamedKeyDict(result)
319 
320  # If this function has reached this far add the quantum
321  self.quanta.append(quantum)
322 
323  def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
324  """Create a `QuantumGraphTaskNodes` instance from the information in
325  ``self``.
326 
327  Returns
328  -------
329  nodes : `QuantumGraphTaskNodes`
330  The `QuantumGraph` elements corresponding to this task.
331  """
332  return QuantumGraphTaskNodes(
333  taskDef=self.taskDef,
334  quanta=self.quanta,
335  initInputs=self.initInputs.unpackRefs(),
336  initOutputs=self.initOutputs.unpackRefs(),
337  )
338 
339 
340 @dataclass
341 class _PipelineScaffolding:
342  """A helper data structure that organizes the information involved in
343  constructing a `QuantumGraph` for a `Pipeline`.
344 
345  Parameters
346  ----------
347  pipeline : `Pipeline`
348  Sequence of tasks from which a graph is to be constructed. Must
349  have nested task classes already imported.
350  universe : `DimensionUniverse`
351  Universe of all possible dimensions.
352 
353  Raises
354  ------
355  GraphBuilderError
356  Raised if the task's dimensions are not a subset of the union of the
357  pipeline's dataset dimensions.
358 
359  Notes
360  -----
361  The scaffolding data structure contains nested data structures for both
362  tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
363  latter held by `_DatasetScaffoldingDict`. The dataset data structures are
364  shared between the pipeline-level structure (which aggregates all datasets
365  and categorizes them from the perspective of the complete pipeline) and the
366  individual tasks that use them as inputs and outputs.
367 
368  `QuantumGraph` construction proceeds in five steps, with each corresponding
369  to a different `_PipelineScaffolding` method:
370 
371  1. When `_PipelineScaffolding` is constructed, we extract and categorize
372  the DatasetTypes used by the pipeline (delegating to
373  `PipelineDatasetTypes.fromPipeline`), then use these to construct the
374  nested `_TaskScaffolding` and `_DatasetScaffolding` objects.
375 
376  2. In `fillDataIds`, we construct and run the "Big Join Query", which
377  returns related tuples of all dimensions used to identify any regular
378  input, output, and intermediate datasets (not prerequisites). We then
379  iterate over these tuples of related dimensions, identifying the subsets
380  that correspond to distinct data IDs for each task and dataset type.
381 
382  3. In `fillDatasetRefs`, we run follow-up queries against all of the
383  dataset data IDs previously identified, populating the
384  `_DatasetScaffolding.refs` lists - except for those for prerequisite
385  datasets, which cannot be resolved until distinct quanta are
386  identified.
387 
388  4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
389  the inputs and outputs for each `Quantum` and search for prerequisite
390  datasets, populating `_TaskScaffolding.quanta`.
391 
392  5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
393  per-task quanta identified in the previous step.
394  """
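 # Editor's sketch (not part of the original class): the five steps above are
 # normally driven by GraphBuilder.makeGraph, roughly as
 #
 #   scaffolding = _PipelineScaffolding(pipeline, universe=registry.dimensions)
 #   scaffolding.fillDataIds(registry, originInfo, userQuery)
 #   scaffolding.fillDatasetRefs(registry, originInfo, skipExisting=True, clobberExisting=False)
 #   scaffolding.fillQuanta(registry, originInfo, skipExisting=True)
 #   graph = scaffolding.makeQuantumGraph()
 #
 # where ``registry``, ``originInfo``, and ``userQuery`` are as described in
 # the method docstrings below.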
395  def __init__(self, pipeline, *, universe):
396  self.tasks = []
397  # Aggregate and categorize the DatasetTypes in the Pipeline.
398  datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, universe=universe)
399  # Construct dictionaries that map those DatasetTypes to structures
400  # that will (later) hold additional information about them.
401  for attr in ("initInputs", "initIntermediates", "initOutputs",
402  "inputs", "intermediates", "outputs", "prerequisites"):
403  setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
404  universe=universe))
405  # Aggregate all dimensions for all non-init, non-prerequisite
406  # DatasetTypes. These are the ones we'll include in the big join query.
407  self.dimensions = self.inputs.dimensions.union(self.inputs.dimensions,
408  self.intermediates.dimensions,
409  self.outputs.dimensions, implied=True)
410  # Construct scaffolding nodes for each Task, and add backreferences
411  # to the Task from each DatasetScaffolding node.
412  # Note that there's only one scaffolding node for each DatasetType, shared by
413  # _PipelineScaffolding and all _TaskScaffoldings that reference it.
414  self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
415  for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())]
416 
417  tasks: List[_TaskScaffolding]
418  """Scaffolding data structures for each task in the pipeline
419  (`list` of `_TaskScaffolding`).
420  """
421 
422  initInputs: _DatasetScaffoldingDict
423  """Datasets consumed but not produced when constructing the tasks in this
424  pipeline (`_DatasetScaffoldingDict`).
425  """
426 
427  initIntermediates: _DatasetScaffoldingDict
428  """Datasets that are both consumed and produced when constructing the tasks
429  in this pipeline (`_DatasetScaffoldingDict`).
430  """
431 
432  initOutputs: _DatasetScaffoldingDict
433  """Datasets produced but not consumed when constructing the tasks in this
434  pipeline (`_DatasetScaffoldingDict`).
435  """
436 
437  inputs: _DatasetScaffoldingDict
438  """Datasets that are consumed but not produced when running this pipeline
439  (`_DatasetScaffoldingDict`).
440  """
441 
442  intermediates: _DatasetScaffoldingDict
443  """Datasets that are both produced and consumed when running this pipeline
444  (`_DatasetScaffoldingDict`).
445  """
446 
447  outputs: _DatasetScaffoldingDict
448  """Datasets produced but not consumed when when running this pipeline
449  (`_DatasetScaffoldingDict`).
450  """
451 
452  prerequisites: _DatasetScaffoldingDict
453  """Datasets that are consumed when running this pipeline and looked up
454  per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
455  """
456 
457  dimensions: DimensionGraph
458  """All dimensions used by any regular input, intermediate, or output
459  (not prerequisite) dataset; the set of dimensions used in the "Big Join
460  Query" (`DimensionGraph`).
461 
462  This is required to be a superset of all task quantum dimensions.
463  """
464 
465  def fillDataIds(self, registry, originInfo, userQuery):
466  """Query for the data IDs that connect nodes in the `QuantumGraph`.
467 
468  This method populates `_TaskScaffolding.dataIds` and
469  `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
470 
471  Parameters
472  ----------
473  registry : `lsst.daf.butler.Registry`
474  Registry for the data repository; used for all data ID queries.
475  originInfo : `lsst.daf.butler.DatasetOriginInfo`
476  Object holding the input and output collections for each
477  `DatasetType`.
478  userQuery : `str`, optional
479  User-provided expression to limit the data IDs processed.
480  """
481  # Initialization datasets always have empty data IDs.
482  emptyDataId = DataId(dimensions=registry.dimensions.empty)
483  for scaffolding in itertools.chain(self.initInputs.values(),
484  self.initIntermediates.values(),
485  self.initOutputs.values()):
486  scaffolding.dataIds.add(emptyDataId)
487  # We'll run one big query for the data IDs for task dimensions and
488  # regular input and outputs.
489  query = DataIdQueryBuilder.fromDimensions(registry, self.dimensions)
490  # Limit the query to only dimensions that are associated with the input
491  # dataset types.
492  for datasetType in self.inputs:
493  query.requireDataset(datasetType, originInfo.getInputCollections(datasetType.name))
494  # Add the user expression, if any
495  if userQuery:
496  query.whereParsedExpression(userQuery)
497  # Execute the query and populate the data ID sets in the nested
498  # _TaskScaffolding and _DatasetScaffolding objects, extracting the
499  # subsets of the common data ID corresponding to the dimensions of each. By using
500  # sets, we remove duplicates caused by query rows in which the
501  # dimensions that change are not relevant for that task or dataset
502  # type. For example, if the Big Join Query involves the dimensions
503  # (instrument, visit, detector, skymap, tract, patch), we extract
504  # "calexp" data IDs from the instrument, visit, and detector values
505  # only, and rely on `set.add` to avoid duplications due to result rows
506  # in which only skymap, tract, and patch are varying.
507  # The Big Join Query is defined such that only visit+detector and
508  # tract+patch combinations that represent spatial overlaps are included
509  # in the results.
510  for commonDataId in query.execute():
511  for taskScaffolding in self.tasks:
512  dataId = DataId(commonDataId, dimensions=taskScaffolding.dimensions)
513  taskScaffolding.dataIds.add(dataId)
514  for datasetType, scaffolding in itertools.chain(self.inputs.items(),
515  self.intermediates.items(),
516  self.outputs.items()):
517  dataId = DataId(commonDataId, dimensions=scaffolding.dimensions)
518  scaffolding.dataIds.add(dataId)
519 
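# Illustrative sketch (editor addition) of the subsetting in the loop above:
# each Big Join Query row is narrowed to the dimensions of one task or
# dataset type, and ``set.add`` collapses rows that differ only in dimensions
# that type does not use. ``commonDataId`` and ``calexpScaffolding`` are
# assumed example values.
#
#   # commonDataId spans (instrument, visit, detector, skymap, tract, patch)
#   calexpDataId = DataId(commonDataId, dimensions=calexpScaffolding.dimensions)
#   calexpScaffolding.dataIds.add(calexpDataId)   # keeps instrument+visit+detector only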
520  def fillDatasetRefs(self, registry, originInfo, *, skipExisting=True, clobberExisting=False):
521  """Perform follow up queries for each dataset data ID produced in
522  `fillDataIds`.
523 
524  This method populates `_DatasetScaffolding.refs` (except for those in
525  `prerequisites`).
526 
527  Parameters
528  ----------
529  registry : `lsst.daf.butler.Registry`
530  Registry for the data repository; used for all data ID queries.
531  originInfo : `lsst.daf.butler.DatasetOriginInfo`
532  Object holding the input and output collections for each
533  `DatasetType`.
534  skipExisting : `bool`, optional
535  If `True` (default), a Quantum is not created if all its outputs
536  already exist.
537  clobberExisting : `bool`, optional
538  If `True`, overwrite any outputs that already exist. Cannot be
539  `True` if ``skipExisting`` is.
540 
541  Raises
542  ------
543  ValueError
544  Raised if both `skipExisting` and `clobberExisting` are `True`.
545  OutputExistsError
546  Raised if an output dataset already exists in the output collection
547  and both ``skipExisting`` and ``clobberExisting`` are `False`. The
548  case where some but not all of a quantum's outputs are present and
549  ``skipExisting`` is `True` cannot be identified at this stage, and
550  is handled by `fillQuanta` instead.
551  """
552  if clobberExisting and skipExisting:
553  raise ValueError("clobberExisting and skipExisting cannot both be true.")
554  # Look up input and initInput datasets in the input collection(s).
555  for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
556  for dataId in scaffolding.dataIds:
557  # TODO: we only need to use SingleDatasetQueryBuilder here because
558  # it provides multi-collection search support. There should be a
559  # way to do that directly with Registry, and it should probably
560  # operate by just doing an unordered collection search and
561  # resolving the order in Python.
562  builder = SingleDatasetQueryBuilder.fromCollections(
563  registry, datasetType,
564  collections=originInfo.getInputCollections(datasetType.name)
565  )
566  builder.whereDataId(dataId)
567  ref = builder.executeOne(expandDataId=True)
568  if ref is None:
569  # Data IDs have been expanded to include implied
570  # dimensions, which is not what we want for the DatasetRef.
571  # Constructing a new DataID shrinks them back down.
572  ref = DatasetRef(datasetType, DataId(dataId, dimensions=datasetType.dimensions))
573  scaffolding.refs.append(ref)
574  # Look up [init] intermediate and output datasets in the output collection,
575  # unless clobberExisting is True (in which case we don't care if these
576  # already exist).
577  for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
578  self.initOutputs.items(),
579  self.intermediates.items(),
580  self.outputs.items()):
581  collection = originInfo.getOutputCollection(datasetType.name)
582  for dataId in scaffolding.dataIds:
583  # TODO: we could easily support per-DatasetType clobberExisting
584  # and skipExisting (it might make sense to put them in
585  # originInfo), and I could imagine that being useful - it's
586  # probably required in order to support writing initOutputs
587  # before QuantumGraph generation.
588  if clobberExisting:
589  ref = None
590  else:
591  ref = registry.find(collection=collection, datasetType=datasetType, dataId=dataId)
592  if ref is None:
593  # data IDs have been expanded to include implied dimensions,
594  # which is not what we want for the DatasetRef.
595  ref = DatasetRef(datasetType, DataId(dataId, dimensions=datasetType.dimensions))
596  elif not skipExisting:
597  raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
598  f"output collection {collection} with data ID {dataId}.")
599  scaffolding.refs.append(ref)
600  # Prerequisite dataset lookups are deferred until fillQuanta.
601 
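# Illustrative summary (editor addition) of the per-output logic above, for a
# single output data ID in the output collection:
#
#   ref = registry.find(collection=collection, datasetType=datasetType, dataId=dataId)
#   # clobberExisting=True             -> lookup skipped; a fresh, unresolved ref is used
#   # ref is None                      -> fresh DatasetRef (ref.id is None; quantum will produce it)
#   # ref found and skipExisting=True  -> existing ref kept; fillQuanta may skip the quantum
#   # ref found and skipExisting=False -> OutputExistsError raised immediately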
602  def fillQuanta(self, registry, originInfo, *, skipExisting=True):
603  """Define quanta for each task by splitting up the datasets associated
604  with each task data ID.
605 
606  This method populates `_TaskScaffolding.quanta`.
607 
608  Parameters
609  ----------
610  registry : `lsst.daf.butler.Registry`
611  Registry for the data repository; used for all data ID queries.
612  originInfo : `lsst.daf.butler.DatasetOriginInfo`
613  Object holding the input and output collections for each
614  `DatasetType`.
615  skipExisting : `bool`, optional
616  If `True` (default), a Quantum is not created if all its outputs
617  already exist.
618  """
619  for task in self.tasks:
620  for quantumDataId in task.dataIds:
621  # Identify the (regular) inputs that correspond to the Quantum
622  # with this data ID. These are those whose data IDs have the
623  # same values for all dimensions they have in common.
624  # We do this with data IDs expanded to include implied dimensions,
625  # which is why _DatasetScaffolding.dimensions is expanded
626  # even though DatasetType.dimensions is not.
627  inputs = NamedKeyDict()
628  for datasetType, scaffolding in task.inputs.items():
629  inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
630  if quantumDataId.matches(dataId)]
631  # Same for outputs.
632  outputs = NamedKeyDict()
633  allOutputsPresent = True
634  for datasetType, scaffolding in task.outputs.items():
635  outputs[datasetType] = []
636  for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
637  if quantumDataId.matches(dataId):
638  if ref.id is None:
639  allOutputsPresent = False
640  else:
641  assert skipExisting, "Existing outputs should have already been identified."
642  if not allOutputsPresent:
643  raise OutputExistsError(f"Output {datasetType.name} with data ID "
644  f"{dataId} already exists, but other outputs "
645  f"for task with label {task.taskDef.label} "
646  f"and data ID {quantumDataId} do not.")
647  outputs[datasetType].append(ref)
648  if allOutputsPresent and skipExisting:
649  continue
650 
651  # Look up prerequisite datasets in the input collection(s).
652  # These may have dimensions that extend beyond those we queried
653  # for originally, because we want to permit those data ID
654  # values to differ across quanta and dataset types.
655  # For example, the same quantum may have a flat and bias with
656  # a different calibration_label, or a refcat with a skypix
657  # value that overlaps the quantum's data ID's region, but not
658  # the user expression used for the initial query.
659  for datasetType, scaffolding in task.prerequisites.items():
660  builder = SingleDatasetQueryBuilder.fromCollections(
661  registry, datasetType,
662  collections=originInfo.getInputCollections(datasetType.name)
663  )
664  if not datasetType.dimensions.issubset(quantumDataId.dimensions()):
665  builder.relateDimensions(quantumDataId.dimensions(), addResultColumns=False)
666  builder.whereDataId(quantumDataId)
667  refs = list(builder.execute(expandDataId=True))
668  if len(refs) == 0:
669  raise PrerequisiteMissingError(
670  f"No instances of prerequisite dataset {datasetType.name} found for task "
671  f"with label {task.taskDef.label} and quantum data ID {quantumDataId}."
672  )
673  inputs[datasetType] = refs
674  task.addQuantum(
675  Quantum(
676  taskName=task.taskDef.taskName,
677  taskClass=task.taskDef.taskClass,
678  dataId=quantumDataId,
679  initInputs=task.initInputs.unpackRefs(),
680  predictedInputs=inputs,
681  outputs=outputs,
682  )
683  )
684 
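# Illustrative sketch (editor addition) of the matching rule used above: a
# dataset belongs to a quantum when its data ID agrees with the quantum data
# ID on every dimension they have in common. The specific values below are
# assumed examples.
#
#   # quantum data ID: {instrument: "HSC", visit: 903334}
#   # dataset data ID: {instrument: "HSC", visit: 903334, detector: 22}
#   quantumDataId.matches(dataId)   # -> True; shared dimensions agree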
685  def makeQuantumGraph(self):
686  """Create a `QuantumGraph` from the quanta already present in
687  the scaffolding data structure.
688  """
689  graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
690  graph.initInputs = self.initInputs.unpackRefs()
691  graph.initOutputs = self.initOutputs.unpackRefs()
692  graph.initIntermediates = self.initIntermediates.unpackRefs()
693  return graph
694 
695 
696 # ------------------------
697 # Exported definitions --
698 # ------------------------
699 
700 
701 class GraphBuilderError(Exception):
702  """Base class for exceptions generated by graph builder.
703  """
704  pass
705 
706 
707 class OutputExistsError(GraphBuilderError):
708  """Exception generated when output datasets already exist.
709  """
710  pass
711 
712 
713 class PrerequisiteMissingError(GraphBuilderError):
714  """Exception generated when a prerequisite dataset does not exist.
715  """
716  pass
717 
718 
719 class GraphBuilder(object):
720  """GraphBuilder class is responsible for building task execution graph from
721  a Pipeline.
722 
723  Parameters
724  ----------
725  taskFactory : `TaskFactory`
726  Factory object used to load/instantiate PipelineTasks
727  registry : `~lsst.daf.butler.Registry`
728  Registry for the data repository.
729  skipExisting : `bool`, optional
730  If `True` (default), a Quantum is not created if all its outputs
731  already exist.
732  clobberExisting : `bool`, optional
733  If `True`, overwrite any outputs that already exist. Cannot be
734  `True` if ``skipExisting`` is.
735  """
736 
737  def __init__(self, taskFactory, registry, skipExisting=True, clobberExisting=False):
738  self.taskFactory = taskFactory
739  self.registry = registry
740  self.dimensions = registry.dimensions
741  self.skipExisting = skipExisting
742  self.clobberExisting = clobberExisting
743 
744  def _loadTaskClass(self, taskDef):
745  """Make sure task class is loaded.
746 
747  Load the task class and update the task name so that it is fully
748  qualified, without modifying the original ``taskDef`` in the Pipeline.
749 
750  Parameters
751  ----------
752  taskDef : `TaskDef`
753 
754  Returns
755  -------
756  `TaskDef` instance, may be the same as parameter if task class is
757  already loaded.
758  """
759  if taskDef.taskClass is None:
760  tClass, tName = self.taskFactory.loadTaskClass(taskDef.taskName)
761  taskDef = copy.copy(taskDef)
762  taskDef.taskClass = tClass
763  taskDef.taskName = tName
764  return taskDef
765 
766  def makeGraph(self, pipeline, originInfo, userQuery):
767  """Create execution graph for a pipeline.
768 
769  Parameters
770  ----------
771  pipeline : `Pipeline`
772  Pipeline definition, task names/classes and their configs.
773  originInfo : `~lsst.daf.butler.DatasetOriginInfo`
774  Object which provides names of the input/output collections.
775  userQuery : `str`
776  String which defines a user-provided selection for the registry; should be
777  empty or `None` if there are no restrictions on data selection.
778 
779  Returns
780  -------
781  graph : `QuantumGraph`
782 
783  Raises
784  ------
785  UserExpressionError
786  Raised when user expression cannot be parsed.
787  OutputExistsError
788  Raised when output datasets already exist.
789  Exception
790  Other exceptions types may be raised by underlying registry
791  classes.
792  """
793  # Make sure all task classes are loaded, creating a new Pipeline
794  # to avoid modifying the input one.
795  # TODO: in the future, it would be preferable for `Pipeline` to
796  # guarantee that its Task classes have been imported to avoid this
797  # sort of two-stage initialization.
798  pipeline = Pipeline([self._loadTaskClass(taskDef) for taskDef in pipeline])
799 
800  scaffolding = _PipelineScaffolding(pipeline, universe=self.registry.dimensions)
801 
802  scaffolding.fillDataIds(self.registry, originInfo, userQuery)
803  scaffolding.fillDatasetRefs(self.registry, originInfo,
804  skipExisting=self.skipExisting,
805  clobberExisting=self.clobberExisting)
806  scaffolding.fillQuanta(self.registry, originInfo,
807  skipExisting=self.skipExisting)
808 
809  return scaffolding.makeQuantumGraph()
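# Editor's usage sketch (not part of the original module): a typical driver
# wires the builder up roughly as below. ``taskFactory``, ``butler``,
# ``pipeline``, and ``originInfo`` are assumed to be provided by the caller,
# and the query expression is an assumed example.
#
#   builder = GraphBuilder(taskFactory, butler.registry,
#                          skipExisting=True, clobberExisting=False)
#   quantumGraph = builder.makeGraph(pipeline, originInfo,
#                                    userQuery="visit = 903334")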