lsst.pipe.base  18.1.0-3-g9cb968e+12
graphBuilder.py
1 # This file is part of pipe_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 """Module defining GraphBuilder class and related methods.
24 """
25 
26 __all__ = ['GraphBuilder']
27 
28 # -------------------------------
29 # Imports of standard modules --
30 # -------------------------------
31 import copy
32 import itertools
33 from collections import ChainMap
34 from dataclasses import dataclass
35 from typing import Set, List, Dict, Optional, Iterable
36 import logging
37 
38 # -----------------------------
39 # Imports for other modules --
40 # -----------------------------
41 from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, Pipeline, TaskDef
42 from .graph import QuantumGraph, QuantumGraphTaskNodes
43 from lsst.daf.butler import Quantum, DatasetRef, DimensionGraph, DataId, DimensionUniverse, DatasetType
44 from lsst.daf.butler.core.utils import NamedKeyDict
45 from lsst.daf.butler.sql import DataIdQueryBuilder, SingleDatasetQueryBuilder
46 
47 # ----------------------------------
48 # Local non-exported definitions --
49 # ----------------------------------
50 
51 _LOG = logging.getLogger(__name__.partition(".")[2])
52 
53 
54 @dataclass
55 class _DatasetScaffolding:
56  """Helper class aggregating information about a `DatasetType`, used when
57  constructing a `QuantumGraph`.
58 
59  `_DatasetScaffolding` does not hold the `DatasetType` instance itself
60  because it is usually used as the value type in `_DatasetScaffoldingDict`,
61  which uses `DatasetType` instances as keys.
62 
63  See `_PipelineScaffolding` for a top-down description of the full
64  scaffolding data structure.
65 
66  Parameters
67  ----------
68  dimensions : `DimensionGraph`
69  Dimensions of the `DatasetType`, expanded to include implied
70  dependencies.
71  """
72  def __init__(self, dimensions: DimensionGraph):
73  self.dimensions = dimensions
74  self.producer = None
75  self.consumers = {}
76  self.dataIds = set()
77  self.refs = []
78 
79  __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs")
80 
81  dimensions: DimensionGraph
82  """The dimensions of the dataset type, expanded to include implied
83  dependencies.
84 
85  Set during `_PipelineScaffolding` construction.
86  """
87 
88  producer: Optional[_TaskScaffolding]
89  """The scaffolding objects for the Task that produces this dataset.
90 
91  Set during `_PipelineScaffolding` construction.
92  """
93 
94  consumers: Dict[str, _TaskScaffolding]
95  """The scaffolding objects for the Tasks that consume this dataset,
96  keyed by their label in the `Pipeline`.
97 
98  Set during `_PipelineScaffolding` construction.
99  """
100 
101  dataIds: Set[DataId]
102  """Data IDs for all instances of this dataset type in the graph.
103 
104  These data IDs cover the full set of implied-expanded dimensions (i.e.
105  the `dimensions` attribute of this instance), which is a superset of the
106  dimensions used in `DatasetRef` instances (e.g. in ``refs``).
107 
108  Populated after construction by `_PipelineScaffolding.fillDataIds`.
109  """
110 
111  refs: List[DatasetRef]
112  """References for all instances of this dataset type in the graph.
113 
114  Populated after construction by `_PipelineScaffolding.fillDatasetRefs`.
115  """
116 
117 
118 class _DatasetScaffoldingDict(NamedKeyDict):
119  """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`.
120 
121  See `_PipelineScaffolding` for a top-down description of the full
122  scaffolding data structure.
123 
124  Parameters
125  ----------
126  args
127  Positional arguments are forwarded to the `dict` constructor.
128  universe : `DimensionUniverse`
129  Universe of all possible dimensions.
130  """
131  def __init__(self, *args, universe: DimensionUniverse):
132  super().__init__(*args)
133  self.universe = universe
134 
135  @classmethod
136  def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
137  universe: DimensionUniverse) -> _DatasetScaffoldingDict:
138  """Construct a dictionary from a flat iterable of `DatasetType` keys.
139 
140  Parameters
141  ----------
142  datasetTypes : `iterable` of `DatasetType`
143  DatasetTypes to use as keys for the dict. Values will be
144  constructed from the dimensions of the keys.
145  universe : `DimensionUniverse`
146  Universe of all possible dimensions.
147 
148  Returns
149  -------
150  dictionary : `_DatasetScaffoldingDict`
151  A new dictionary instance.
152  """
153  return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions.implied(only=False)))
154  for datasetType in datasetTypes),
155  universe=universe)
156 
157  @classmethod
158  def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict,
159  *rest) -> _DatasetScaffoldingDict:
160  """Return a new dictionary by extracting items corresponding to the
161  given keys from one or more existing dictionaries.
162 
163  Parameters
164  ----------
165  datasetTypes : `iterable` of `DatasetType`
166  DatasetTypes to use as keys for the dict. Values will be obtained
167  by lookups against ``first`` and ``rest``.
168  first : `_DatasetScaffoldingDict`
169  Another dictionary from which to extract values.
170  rest
171  Additional dictionaries from which to extract values.
172 
173  Returns
174  -------
175  dictionary : `_DatasetScaffoldingDict`
176  A new dictionary instance.
177  """
178  combined = ChainMap(first, *rest)
179  return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes),
180  universe=first.universe)
181 
182  @property
183  def dimensions(self) -> DimensionGraph:
184  """The union of all dimensions used by all dataset types in this
185  dictionary, including implied dependencies (`DimensionGraph`).
186  """
187  base = self.universe.empty
188  if len(self) == 0:
189  return base
190  return base.union(*(scaffolding.dimensions for scaffolding in self.values()), implied=True)
191 
192  def unpackRefs(self) -> NamedKeyDict:
193  """Unpack nested single-element `DatasetRef` lists into a new
194  dictionary.
195 
196  This method assumes that each `_DatasetScaffolding.refs` list contains
197  exactly one `DatasetRef`, as is the case for all "init" datasets.
198 
199  Returns
200  -------
201  dictionary : `NamedKeyDict`
202  Dictionary mapping `DatasetType` to `DatasetRef`, with both
203  `DatasetType` instances and string names usable as keys.
204  """
205  return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items())
206 
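# Illustrative sketch (not part of the original module): assuming ``initRefs``
# is a _DatasetScaffoldingDict holding "init" dataset types, so that each
# nested ``refs`` list has exactly one element, ``unpackRefs`` flattens it:
#
#     unpacked = initRefs.unpackRefs()
#     # ``unpacked[someInitDatasetType]`` is a single DatasetRef rather than a
#     # one-element list; because the result is a NamedKeyDict, the dataset
#     # type's string name can also be used as the key.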
207 
208 @dataclass
209 class _TaskScaffolding:
210  """Helper class aggregating information about a `PipelineTask`, used when
211  constructing a `QuantumGraph`.
212 
213  See `_PipelineScaffolding` for a top-down description of the full
214  scaffolding data structure.
215 
216  Parameters
217  ----------
218  taskDef : `TaskDef`
219  Data structure that identifies the task class and its config.
220  parent : `_PipelineScaffolding`
221  The parent data structure that will hold the instance being
222  constructed.
223  datasetTypes : `TaskDatasetTypes`
224  Data structure that categorizes the dataset types used by this task.
225 
226  Raises
227  ------
228  GraphBuilderError
229  Raised if the task's dimensions are not a subset of the union of the
230  pipeline's dataset dimensions.
231  """
232  def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
233  universe = parent.dimensions.universe
234  self.taskDef = taskDef
235  self.dimensions = universe.extract(taskDef.config.quantum.dimensions, implied=True)
236  if not self.dimensions.issubset(parent.dimensions):
237  raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions "
238  f"{self.dimensions.toSet()} that are not a subset of "
239  f"the pipeline dimensions {parent.dimensions.toSet()}.")
240  # Initialize _DatasetScaffoldingDicts as subsets of the one or two
241  # corresponding dicts in the parent _PipelineScaffolding.
242  self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs,
243  parent.initInputs, parent.initIntermediates)
244  self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs,
245  parent.initIntermediates, parent.initOutputs)
246  self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs,
247  parent.inputs, parent.intermediates)
248  self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs,
249  parent.intermediates, parent.outputs)
250  self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites,
251  parent.prerequisites)
252  # Add backreferences to the _DatasetScaffolding objects that point to
253  # this Task.
254  for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(),
255  self.prerequisites.values()):
256  dataset.consumers[self.taskDef.label] = self
257  for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()):
258  assert dataset.producer is None
259  dataset.producer = self
260  self.dataIds = set()
261  self.quanta = []
262 
263  taskDef: TaskDef
264  """Data structure that identifies the task class and its config
265  (`TaskDef`).
266  """
267 
268  dimensions: DimensionGraph
269  """The dimensions of a single `Quantum` of this task, expanded to include
270  implied dependencies (`DimensionGraph`).
271  """
272 
273  initInputs: _DatasetScaffoldingDict
274  """Dictionary containing information about datasets used to construct this
275  task (`_DatasetScaffoldingDict`).
276  """
277 
278  initOutputs: _DatasetScaffoldingDict
279  """Dictionary containing information about datasets produced as a
280  side-effect of constructing this task (`_DatasetScaffoldingDict`).
281  """
282 
283  inputs: _DatasetScaffoldingDict
284  """Dictionary containing information about datasets used as regular,
285  graph-constraining inputs to this task (`_DatasetScaffoldingDict`).
286  """
287 
288  outputs: _DatasetScaffoldingDict
289  """Dictionary containing information about datasets produced by this task
290  (`_DatasetScaffoldingDict`).
291  """
292 
293  prerequisites: _DatasetScaffoldingDict
294  """Dictionary containing information about input datasets that must be
295  present in the repository before any Pipeline containing this task is run
296  (`_DatasetScaffoldingDict`).
297  """
298 
299  dataIds: Set[DataId]
300  """Data IDs for all quanta for this task in the graph (`set` of `DataId`).
301 
302  Populated after construction by `_PipelineScaffolding.fillDataIds`.
303  """
304 
305  quanta: List[Quantum]
306  """All quanta for this task in the graph (`list` of `Quantum`).
307 
308  Populated after construction by `_PipelineScaffolding.fillQuanta`.
309  """
310 
311  def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
312  """Create a `QuantumGraphTaskNodes` instance from the information in
313  ``self``.
314 
315  Returns
316  -------
317  nodes : `QuantumGraphTaskNodes`
318  The `QuantumGraph` elements corresponding to this task.
319  """
320  return QuantumGraphTaskNodes(
321  taskDef=self.taskDef,
322  quanta=self.quanta,
323  initInputs=self.initInputs.unpackRefs(),
324  initOutputs=self.initOutputs.unpackRefs(),
325  )
326 
327 
328 @dataclass
329 class _PipelineScaffolding:
330  """A helper data structure that organizes the information involved in
331  constructing a `QuantumGraph` for a `Pipeline`.
332 
333  Parameters
334  ----------
335  pipeline : `Pipeline`
336  Sequence of tasks from which a graph is to be constructed. Must
337  have nested task classes already imported.
338  universe : `DimensionUniverse`
339  Universe of all possible dimensions.
340 
341  Raises
342  ------
343  GraphBuilderError
344  Raised if any task's dimensions are not a subset of the union of the
345  pipeline's dataset dimensions.
346 
347  Notes
348  -----
349  The scaffolding data structure contains nested data structures for both
350  tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the
351  latter held by `_DatasetScaffoldingDict`. The dataset data structures are
352  shared between the pipeline-level structure (which aggregates all datasets
353  and categorizes them from the perspective of the complete pipeline) and the
354  individual tasks that use them as inputs and outputs.
355 
356  `QuantumGraph` construction proceeds in five steps, with each corresponding
357  to a different `_PipelineScaffolding` method:
358 
359  1. When `_PipelineScaffolding` is constructed, we extract and categorize
360  the DatasetTypes used by the pipeline (delegating to
361  `PipelineDatasetTypes.fromPipeline`), then use these to construct the
362  nested `_TaskScaffolding` and `_DatasetScaffolding` objects.
363 
364  2. In `fillDataIds`, we construct and run the "Big Join Query", which
365  returns related tuples of all dimensions used to identify any regular
366  input, output, and intermediate datasets (not prerequisites). We then
367  iterate over these tuples of related dimensions, identifying the subsets
368  that correspond to distinct data IDs for each task and dataset type.
369 
370  3. In `fillDatasetRefs`, we run follow-up queries against all of the
371  dataset data IDs previously identified, populating the
372  `_DatasetScaffolding.refs` lists - except for those for prerequisite
373  datasets, which cannot be resolved until distinct quanta are
374  identified.
375 
376  4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into
377  the inputs and outputs for each `Quantum` and search for prerequisite
378  datasets, populating `_TaskScaffolding.quanta`.
379 
380  5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
381  per-task quanta identified in the previous step.
382  """
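 # A minimal sketch (not part of the original module) of how the five steps
 # above are driven in practice; it mirrors GraphBuilder.makeGraph at the
 # bottom of this file and assumes ``pipeline``, ``registry``, ``originInfo``,
 # and ``userQuery`` as documented there:
 #
 #     scaffolding = _PipelineScaffolding(pipeline, universe=registry.dimensions)
 #     scaffolding.fillDataIds(registry, originInfo, userQuery)
 #     scaffolding.fillDatasetRefs(registry, originInfo, skipExisting=True)
 #     scaffolding.fillQuanta(registry, originInfo, skipExisting=True)
 #     quantumGraph = scaffolding.makeQuantumGraph()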
383  def __init__(self, pipeline, *, universe):
384  self.tasks = []
385  # Aggregate and categorize the DatasetTypes in the Pipeline.
386  datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, universe=universe)
387  # Construct dictionaries that map those DatasetTypes to structures
388  # that will (later) hold additional information about them.
389  for attr in ("initInputs", "initIntermediates", "initOutputs",
390  "inputs", "intermediates", "outputs", "prerequisites"):
391  setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr),
392  universe=universe))
393  # Aggregate all dimensions for all non-init, non-prerequisite
394  # DatasetTypes. These are the ones we'll include in the big join query.
395  self.dimensions = self.inputs.dimensions.union(self.inputs.dimensions,
396  self.intermediates.dimensions,
397  self.outputs.dimensions, implied=True)
398  # Construct scaffolding nodes for each Task, and add backreferences
399  # to the Task from each DatasetScaffolding node.
400  # Note that there's only one scaffolding node for each DatasetType, shared by
401  # _PipelineScaffolding and all _TaskScaffoldings that reference it.
402  self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
403  for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())]
404 
405  tasks: List[_TaskScaffolding]
406  """Scaffolding data structures for each task in the pipeline
407  (`list` of `_TaskScaffolding`).
408  """
409 
410  initInputs: _DatasetScaffoldingDict
411  """Datasets consumed but not produced when constructing the tasks in this
412  pipeline (`_DatasetScaffoldingDict`).
413  """
414 
415  initIntermediates: _DatasetScaffoldingDict
416  """Datasets that are both consumed and produced when constructing the tasks
417  in this pipeline (`_DatasetScaffoldingDict`).
418  """
419 
420  initOutputs: _DatasetScaffoldingDict
421  """Datasets produced but not consumed when constructing the tasks in this
422  pipeline (`_DatasetScaffoldingDict`).
423  """
424 
425  inputs: _DatasetScaffoldingDict
426  """Datasets that are consumed but not produced when running this pipeline
427  (`_DatasetScaffoldingDict`).
428  """
429 
430  intermediates: _DatasetScaffoldingDict
431  """Datasets that are both produced and consumed when running this pipeline
432  (`_DatasetScaffoldingDict`).
433  """
434 
435  outputs: _DatasetScaffoldingDict
436  """Datasets produced but not consumed when running this pipeline
437  (`_DatasetScaffoldingDict`).
438  """
439 
440  prerequisites: _DatasetScaffoldingDict
441  """Datasets that are consumed when running this pipeline and looked up
442  per-Quantum when generating the graph (`_DatasetScaffoldingDict`).
443  """
444 
445  dimensions: DimensionGraph
446  """All dimensions used by any regular input, intermediate, or output
447  (not prerequisite) dataset; the set of dimensions used in the "Big Join
448  Query" (`DimensionGraph`).
449 
450  This is required to be a superset of all task quantum dimensions.
451  """
452 
453  def fillDataIds(self, registry, originInfo, userQuery):
454  """Query for the data IDs that connect nodes in the `QuantumGraph`.
455 
456  This method populates `_TaskScaffolding.dataIds` and
457  `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
458 
459  Parameters
460  ----------
461  registry : `lsst.daf.butler.Registry`
462  Registry for the data repository; used for all data ID queries.
463  originInfo : `lsst.daf.butler.DatasetOriginInfo`
464  Object holding the input and output collections for each
465  `DatasetType`.
466  userQuery : `str`, optional
467  User-provided expression to limit the data IDs processed.
468  """
469  # Initialization datasets always have empty data IDs.
470  emptyDataId = DataId(dimensions=registry.dimensions.empty)
471  for scaffolding in itertools.chain(self.initInputs.values(),
472  self.initIntermediates.values(),
473  self.initOutputs.values()):
474  scaffolding.dataIds.add(emptyDataId)
475  # We'll run one big query for the data IDs for task dimensions and
476  # regular inputs and outputs.
477  query = DataIdQueryBuilder.fromDimensions(registry, self.dimensions)
478  # Limit the query to only dimensions that are associated with the input
479  # dataset types.
480  for datasetType in self.inputs:
481  query.requireDataset(datasetType, originInfo.getInputCollections(datasetType.name))
482  # Add the user expression, if any
483  if userQuery:
484  query.whereParsedExpression(userQuery)
485  # Execute the query and populate the data ID sets in each
486  # _TaskScaffolding and _DatasetScaffolding, extracting the subsets of the common data ID
487  # from the query corresponding to the dimensions of each. By using
488  # sets, we remove duplicates caused by query rows in which the
489  # dimensions that change are not relevant for that task or dataset
490  # type. For example, if the Big Join Query involves the dimensions
491  # (instrument, visit, detector, skymap, tract, patch), we extract
492  # "calexp" data IDs from the instrument, visit, and detector values
493  # only, and rely on `set.add` to avoid duplications due to result rows
494  # in which only skymap, tract, and patch are varying.
495  # The Big Join Query is defined such that only visit+detector and
496  # tract+patch combinations that represent spatial overlaps are included
497  # in the results.
498  for commonDataId in query.execute():
499  for taskScaffolding in self.tasks:
500  dataId = DataId(commonDataId, dimensions=taskScaffolding.dimensions)
501  taskScaffolding.dataIds.add(dataId)
502  for datasetType, scaffolding in itertools.chain(self.inputs.items(),
503  self.intermediates.items(),
504  self.outputs.items()):
505  dataId = DataId(commonDataId, dimensions=scaffolding.dimensions)
506  scaffolding.dataIds.add(dataId)
507 
508  def fillDatasetRefs(self, registry, originInfo, *, skipExisting=True, clobberExisting=False):
509  """Perform follow up queries for each dataset data ID produced in
510  `fillDataIds`.
511 
512  This method populates `_DatasetScaffolding.refs` (except for those in
513  `prerequisites`).
514 
515  Parameters
516  ----------
517  registry : `lsst.daf.butler.Registry`
518  Registry for the data repository; used for all data ID queries.
519  originInfo : `lsst.daf.butler.DatasetOriginInfo`
520  Object holding the input and output collections for each
521  `DatasetType`.
522  skipExisting : `bool`, optional
523  If `True` (default), a Quantum is not created if all its outputs
524  already exist.
525  clobberExisting : `bool`, optional
526  If `True`, overwrite any outputs that already exist. Cannot be
527  `True` if ``skipExisting`` is.
528 
529  Raises
530  ------
531  ValueError
532  Raised if both `skipExisting` and `clobberExisting` are `True`.
533  OutputExistsError
534  Raised if an output dataset already exists in the output collection
535  and both ``skipExisting`` and ``clobberExisting`` are `False`. The
536  case where some but not all of a quantum's outputs are present and
537  ``skipExisting`` is `True` cannot be identified at this stage, and
538  is handled by `fillQuanta` instead.
539  """
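 # Illustrative note (not part of the original module), restating the flag
 # semantics from the docstring above:
 #
 #     scaffolding.fillDatasetRefs(registry, originInfo,
 #                                 skipExisting=True, clobberExisting=True)
 #     # -> ValueError: the two flags are mutually exclusive.
 #     scaffolding.fillDatasetRefs(registry, originInfo,
 #                                 skipExisting=False, clobberExisting=False)
 #     # -> OutputExistsError if any output already exists in the output
 #     #    collection.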
540  if clobberExisting and skipExisting:
541  raise ValueError("clobberExisting and skipExisting cannot both be true.")
542  # Look up input and initInput datasets in the input collection(s).
543  for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()):
544  for dataId in scaffolding.dataIds:
545  # TODO: we only need to use SingleDatasetQueryBuilder here because
546  # it provides multi-collection search support. There should be a
547  # way to do that directly with Registry, and it should probably
548  # operate by just doing an unordered collection search and
549  # resolving the order in Python.
550  builder = SingleDatasetQueryBuilder.fromCollections(
551  registry, datasetType,
552  collections=originInfo.getInputCollections(datasetType.name)
553  )
554  builder.whereDataId(dataId)
555  ref = builder.executeOne(expandDataId=True)
556  if ref is None:
557  # Data IDs have been expanded to include implied
558  # dimensions, which is not what we want for the DatasetRef.
559  # Constructing a new DataID shrinks them back down.
560  ref = DatasetRef(datasetType, DataId(dataId, dimensions=datasetType.dimensions))
561  scaffolding.refs.append(ref)
562  # Look up [init] intermediate and output datasets in the output collection,
563  # unless clobberExisting is True (in which case we don't care if these
564  # already exist).
565  for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(),
566  self.initOutputs.items(),
567  self.intermediates.items(),
568  self.outputs.items()):
569  collection = originInfo.getOutputCollection(datasetType.name)
570  for dataId in scaffolding.dataIds:
571  # TODO: we could easily support per-DatasetType clobberExisting
572  # and skipExisting (it might make sense to put them in
573  # originInfo), and I could imagine that being useful - it's
574  # probably required in order to support writing initOutputs
575  # before QuantumGraph generation.
576  if clobberExisting:
577  ref = None
578  else:
579  ref = registry.find(collection=collection, datasetType=datasetType, dataId=dataId)
580  if ref is None:
581  # data IDs have been expanded to include implied dimensions,
582  # which is not what we want for the DatasetRef.
583  ref = DatasetRef(datasetType, DataId(dataId, dimensions=datasetType.dimensions))
584  elif not skipExisting:
585  raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
586  f"output collection {collection} with data ID {dataId}.")
587  scaffolding.refs.append(ref)
588  # Prerequisite dataset lookups are deferred until fillQuanta.
589 
590  def fillQuanta(self, registry, originInfo, *, skipExisting=True):
591  """Define quanta for each task by splitting up the datasets associated
592  with each task data ID.
593 
594  This method populates `_TaskScaffolding.quanta`.
595 
596  Parameters
597  ----------
598  registry : `lsst.daf.butler.Registry`
599  Registry for the data repository; used for all data ID queries.
600  originInfo : `lsst.daf.butler.DatasetOriginInfo`
601  Object holding the input and output collections for each
602  `DatasetType`.
603  skipExisting : `bool`, optional
604  If `True` (default), a Quantum is not created if all its outputs
605  already exist.
606  """
607  for task in self.tasks:
608  for quantumDataId in task.dataIds:
609  # Identify the (regular) inputs that correspond to the Quantum
610  # with this data ID. These are those whose data IDs have the
611  # same values for all dimensions they have in common.
612  # We do this with data IDs expanded to include implied dimensions,
613  # which is why _DatasetScaffolding.dimensions is expanded
614  # even though DatasetType.dimensions is not.
615  inputs = NamedKeyDict()
616  for datasetType, scaffolding in task.inputs.items():
617  inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
618  if quantumDataId.matches(dataId)]
619  # Same for outputs.
620  outputs = NamedKeyDict()
621  allOutputsPresent = True
622  for datasetType, scaffolding in task.outputs.items():
623  outputs[datasetType] = []
624  for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
625  if quantumDataId.matches(dataId):
626  if ref.id is None:
627  allOutputsPresent = False
628  else:
629  assert skipExisting, "Existing outputs should have already been identified."
630  if not allOutputsPresent:
631  raise OutputExistsError(f"Output {datasetType.name} with data ID "
632  f"{dataId} already exists, but other outputs "
633  f"for task with label {task.taskDef.label} "
634  f"and data ID {quantumDataId} do not.")
635  outputs[datasetType].append(ref)
636  if allOutputsPresent and skipExisting:
637  continue
638 
639  # Look up prerequisite datasets in the input collection(s).
640  # These may have dimensions that extend beyond those we queried
641  # for originally, because we want to permit those data ID
642  # values to differ across quanta and dataset types.
643  # For example, the same quantum may have a flat and bias with
644  # a different calibration_label, or a refcat with a skypix
645  # value that overlaps the quantum's data ID's region, but not
646  # the user expression used for the initial query.
647  for datasetType, scaffolding in task.prerequisites.items():
648  builder = SingleDatasetQueryBuilder.fromCollections(
649  registry, datasetType,
650  collections=originInfo.getInputCollections(datasetType.name)
651  )
652  if not datasetType.dimensions.issubset(quantumDataId.dimensions()):
653  builder.relateDimensions(quantumDataId.dimensions(), addResultColumns=False)
654  builder.whereDataId(quantumDataId)
655  refs = list(builder.execute(expandDataId=True))
656  if len(refs) == 0:
657  raise PrerequisiteMissingError(
658  f"No instances of prerequisite dataset {datasetType.name} found for task "
659  f"with label {task.taskDef.label} and quantum data ID {quantumDataId}."
660  )
661  inputs[datasetType] = refs
662  task.quanta.append(
663  Quantum(
664  taskName=task.taskDef.taskName,
665  taskClass=task.taskDef.taskClass,
666  dataId=quantumDataId,
667  initInputs=task.initInputs.unpackRefs(),
668  predictedInputs=inputs,
669  outputs=outputs,
670  )
671  )
672 
673  def makeQuantumGraph(self):
674  """Create a `QuantumGraph` from the quanta already present in
675  the scaffolding data structure.
676  """
677  graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
678  graph.initInputs = self.initInputs.unpackRefs()
679  graph.initOutputs = self.initOutputs.unpackRefs()
680  graph.initIntermediates = self.initIntermediates.unpackRefs()
681  return graph
682 
683 
684 # ------------------------
685 # Exported definitions --
686 # ------------------------
687 
688 
689 class GraphBuilderError(Exception):
690  """Base class for exceptions generated by the graph builder.
691  """
692  pass
693 
694 
695 class OutputExistsError(GraphBuilderError):
696  """Exception generated when output datasets already exist.
697  """
698  pass
699 
700 
701 class PrerequisiteMissingError(GraphBuilderError):
702  """Exception generated when a prerequisite dataset does not exist.
703  """
704  pass
705 
706 
707 class GraphBuilder(object):
708  """GraphBuilder is responsible for building a task execution graph from
709  a Pipeline.
710 
711  Parameters
712  ----------
713  taskFactory : `TaskFactory`
714  Factory object used to load/instantiate PipelineTasks.
715  registry : `~lsst.daf.butler.Registry`
716  Registry for the data repository; used for all data ID and dataset queries.
717  skipExisting : `bool`, optional
718  If `True` (default), a Quantum is not created if all its outputs
719  already exist.
720  clobberExisting : `bool`, optional
721  If `True`, overwrite any outputs that already exist. Cannot be
722  `True` if ``skipExisting`` is.
723  """
724 
725  def __init__(self, taskFactory, registry, skipExisting=True, clobberExisting=False):
726  self.taskFactory = taskFactory
727  self.registry = registry
728  self.dimensions = registry.dimensions
729  self.skipExisting = skipExisting
730  self.clobberExisting = clobberExisting
731 
732  def _loadTaskClass(self, taskDef):
733  """Make sure task class is loaded.
734 
735  Load the task class and update the task name so that it is fully
736  qualified; the original taskDef in the Pipeline is not modified.
737 
738  Parameters
739  ----------
740  taskDef : `TaskDef`
741 
742  Returns
743  -------
744  `TaskDef` instance; may be the same object as the input if the task
745  class was already loaded.
746  """
747  if taskDef.taskClass is None:
748  tClass, tName = self.taskFactory.loadTaskClass(taskDef.taskName)
749  taskDef = copy.copy(taskDef)
750  taskDef.taskClass = tClass
751  taskDef.taskName = tName
752  return taskDef
753 
754  def makeGraph(self, pipeline, originInfo, userQuery):
755  """Create execution graph for a pipeline.
756 
757  Parameters
758  ----------
759  pipeline : `Pipeline`
760  Pipeline definition, task names/classes and their configs.
761  originInfo : `~lsst.daf.butler.DatasetOriginInfo`
762  Object which provides names of the input/output collections.
763  userQuery : `str`
764  String defining a user-provided selection for the registry; should be
765  empty or `None` if there are no restrictions on data selection.
766 
767  Returns
768  -------
769  graph : `QuantumGraph`
770 
771  Raises
772  ------
773  UserExpressionError
774  Raised when user expression cannot be parsed.
775  OutputExistsError
776  Raised when output datasets already exist.
777  Exception
778  Other exceptions types may be raised by underlying registry
779  classes.
780  """
781  # Make sure all task classes are loaded, creating a new Pipeline
782  # to avoid modifying the input one.
783  # TODO: in the future, it would be preferable for `Pipeline` to
784  # guarantee that its Task classes have been imported to avoid this
785  # sort of two-stage initialization.
786  pipeline = Pipeline([self._loadTaskClass(taskDef) for taskDef in pipeline])
787 
788  scaffolding = _PipelineScaffolding(pipeline, universe=self.registry.dimensions)
789 
790  scaffolding.fillDataIds(self.registry, originInfo, userQuery)
791  scaffolding.fillDatasetRefs(self.registry, originInfo,
792  skipExisting=self.skipExisting,
793  clobberExisting=self.clobberExisting)
794  scaffolding.fillQuanta(self.registry, originInfo,
795  skipExisting=self.skipExisting)
796 
797  return scaffolding.makeQuantumGraph()
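
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module).  The names ``taskFactory``,
# ``registry``, ``originInfo``, and ``pipeline`` below are assumed to be
# supplied by the caller (e.g. a command-line activator), and the query string
# is illustrative only; the exact syntax is defined by the registry's
# expression parser.
#
#     builder = GraphBuilder(taskFactory, registry, skipExisting=True)
#     quantumGraph = builder.makeGraph(pipeline, originInfo,
#                                      "visit = 903334 AND detector = 22")
#     # quantumGraph contains one QuantumGraphTaskNodes entry per task, built
#     # by _TaskScaffolding.makeQuantumGraphTaskNodes above, plus the unpacked
#     # initInputs/initIntermediates/initOutputs attached in makeQuantumGraph.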