lsst.pipe.base  20.0.0-7-gb92c176+aa8a99367b
graphBuilder.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
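
        Examples
        --------
        A minimal, illustrative sketch (not from the original source);
        ``registry`` is assumed to be a configured
        `~lsst.daf.butler.Registry`:

        >>> universe = registry.dimensions
        >>> raw = DatasetType("raw", dimensions={"instrument", "exposure", "detector"},
        ...                   storageClass="Exposure", universe=universe)
        >>> dd = _DatasetDict.fromDatasetTypes([raw], universe=universe)
        >>> dd[raw]  # each value starts as an empty {dataId: DatasetRef} dict
        {}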
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
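
        Examples
        --------
        Illustrative only; ``inputs`` and ``intermediates`` are assumed to be
        existing `_DatasetDict` instances, at least one of which contains
        ``someType`` as a key:

        >>> subset = _DatasetDict.fromSubset([someType], inputs, intermediates)
        >>> someType in subset
        True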
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nest contains exactly one item, as is
        the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
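
        Examples
        --------
        Sketch of the "init" dataset case this method is written for (the
        dataset type here is illustrative, not from the original source):

        >>> emptyDataId = DataCoordinate.makeEmpty(universe)
        >>> configType = DatasetType("task_config", dimensions=(),
        ...                          storageClass="Config", universe=universe)
        >>> dd = _DatasetDict.fromDatasetTypes([configType], universe=universe)
        >>> dd[configType][emptyDataId] = DatasetRef(configType, emptyDataId)
        >>> dd.unpackSingleRefs()[configType].datasetType.name
        'task_config'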
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
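
        Examples
        --------
        Continuing the illustrative sketch from `unpackSingleRefs`: here the
        nested refs come back whole, as a one-element `list` per dataset
        type:

        >>> refsByType = dd.unpackMultiRefs()
        >>> len(refsByType[configType])
        1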
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
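
        Examples
        --------
        Illustrative use, assuming ``dd`` holds refs for ``someType`` under
        the data IDs ``dataId1`` and ``dataId2``:

        >>> for ref in dd.extract(someType, [dataId1, dataId2]):
        ...     print(ref.dataId)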
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : `_TaskScaffolding`
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction. Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which
        # is the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates,
                                               parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to look up dataset types
        and dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and
       dataset type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate. We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects (the full sequence is sketched
       in the Examples section below).
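
    Examples
    --------
    A minimal sketch of the four-step sequence, as driven by
    `GraphBuilder.makeGraph` (``pipeline``, ``registry``, ``collections``,
    and ``run`` are assumed to be configured already):

    >>> scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    >>> scaffolding.connectDataIds(registry, collections, userQuery=None)
    >>> scaffolding.resolveDatasetRefs(registry, collections, run)
    >>> graph = scaffolding.makeQuantumGraph()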
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the data ID keys
        of the nested `_DatasetDict` dictionaries (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
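
        Examples
        --------
        The expression syntax is that of the registry's query system; an
        illustrative (assumed) example restricting the graph to a single
        visit, continuing the sketch in the class docstring:

        >>> scaffolding.connectDataIds(
        ...     registry, collections,
        ...     userQuery="instrument = 'HSC' AND visit = 903334",
        ... )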
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and processing results.")
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other.
        n = -1  # In case the query returns no results at all.
        for n, commonDataId in enumerate(resultIter):
            # Create DatasetRefs for all DatasetTypes from this result row,
            # noting that we might have created some already.
            # We remember both those that already existed and those that we
            # create now.
            refsForRow = {}
            for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                     self.outputs.items()):
                datasetDataId = commonDataId.subset(datasetType.dimensions)
                ref = refs.get(datasetDataId)
                if ref is None:
                    ref = DatasetRef(datasetType, datasetDataId)
                    refs[datasetDataId] = ref
                refsForRow[datasetType.name] = ref
            # Create _QuantumScaffolding objects for all tasks from this
            # result row, noting that we might have created some already.
            for task in self.tasks:
                quantumDataId = commonDataId.subset(task.dimensions)
                quantum = task.quanta.get(quantumDataId)
                if quantum is None:
                    quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                    task.quanta[quantumDataId] = quantum
                # Whether this is a new quantum or an existing one, we can
                # now associate the DatasetRefs for this row with it. The
                # fact that a Quantum data ID and a dataset data ID both
                # came from the same result row is what tells us they should
                # be associated.
                # Many of these associations will be duplicates (because
                # another query row that differed from this one only in
                # irrelevant dimensions already added them), and assigning
                # into the nested dicts silently skips those.
                for datasetType in task.inputs:
                    ref = refsForRow[datasetType.name]
                    quantum.inputs[datasetType.name][ref.dataId] = ref
                for datasetType in task.outputs:
                    ref = refsForRow[datasetType.name]
                    quantum.outputs[datasetType.name][ref.dataId] = ref
        if n >= 0:
            _LOG.debug("Finished processing %d rows from data ID query.", n+1)
        else:
            _LOG.debug("Received no rows from data ID query.")

    def resolveDatasetRefs(self, registry, collections, run, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method resolves the `DatasetRef` instances held by the nested
        `_DatasetDict` dictionaries (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``. Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`. The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled per-quantum
            later in this method.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                for dataId, unresolvedRef in refs.items():
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                    if ref is not None:
                        if skipExisting:
                            refs[dataId] = ref
                        else:
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists "
                                                    f"in output RUN collection '{run}' with data ID "
                                                    f"{dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            for dataId in refs:
                refs[dataId] = registry.findDataset(datasetType, dataId=dataId, collections=collections)
            if any(ref is None for ref in refs.values()):
                raise RuntimeError(
                    f"One or more datasets of type '{datasetType.name}' were "
                    f"present in a previous query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation, "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True. Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because
                # those should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we
                # already searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those
                # data ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantum.dataId,
                                deduplicate=True,
                                expand=True,
                            )
                        )
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their "
                           "outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder is responsible for building a task execution graph from
    a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create an execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String that defines the user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
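
        Examples
        --------
        A minimal sketch of typical use; the repository path, collection
        names, and query are illustrative, and ``pipeline`` is assumed to be
        a `Pipeline` with its task classes importable:

        >>> from lsst.daf.butler import Butler, CollectionSearch
        >>> butler = Butler("repo")
        >>> builder = GraphBuilder(butler.registry, skipExisting=True)
        >>> qgraph = builder.makeGraph(
        ...     pipeline,
        ...     collections=CollectionSearch.fromExpression("HSC/defaults"),
        ...     run="u/someone/demo",
        ...     userQuery="instrument = 'HSC' AND visit = 903334",
        ... )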
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        scaffolding.connectDataIds(self.registry, collections, userQuery)
        scaffolding.resolveDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()