lsst.pipe.base  20.0.0-9-gabd0d4c+836ba05b7d
graphBuilder.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Module defining the GraphBuilder class and related methods.
"""
from __future__ import annotations

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])

class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)
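
# Example (illustrative sketch, not executed): ``fromSubset`` extracts the
# nested dictionaries themselves rather than copying them, so a subset and
# its parent share state.  Assuming ``universe`` is a `DimensionUniverse`,
# ``dt`` a `DatasetType`, and ``dataId`` a `DataCoordinate` matching
# ``dt.dimensions``:
#
#     parent = _DatasetDict.fromDatasetTypes([dt], universe=universe)
#     child = _DatasetDict.fromSubset([dt], parent)
#     parent[dt][dataId] = DatasetRef(dt, dataId)
#     assert child[dt][dataId] is parent[dt][dataId]
#
# This sharing is what lets _PipelineScaffolding and the per-task
# _TaskScaffolding objects below observe each other's updates.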

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nest contains exactly one item, as is
        the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
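
# Example (sketch, continuing the assumptions above, with ``d`` a
# `_DatasetDict` whose nested dicts each hold exactly one ref):
#
#     singles = d.unpackSingleRefs()    # NamedKeyDict {dt: ref}
#     multis = d.unpackMultiRefs()      # NamedKeyDict {dt: [ref, ...]}
#     singles[dt.name]                  # string keys work as well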

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
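
# Example (sketch): ``extract`` is lazy; wrapping it in `list` forces the
# lookups and raises `KeyError` for any data ID not already present:
#
#     refs = list(d.extract(dt, [dataId]))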


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : `_TaskScaffolding`
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )
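
# For reference, ``adjustQuantum`` is a hook defined on
# `PipelineTaskConnections` that concrete tasks may override.  A minimal
# sketch (connection declarations omitted; the "template" connection name is
# hypothetical) that vetoes quanta with no usable inputs:
#
#     class ExampleConnections(PipelineTaskConnections,
#                              dimensions=("visit", "detector")):
#         def adjustQuantum(self, datasetRefMap):
#             if not datasetRefMap["template"]:
#                 raise ValueError("No template inputs for this quantum")
#             return super().adjustQuantum(datasetRefMap)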


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to look up dataset types and
        dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method, as sketched
    after this list:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
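
    # The four steps above are normally driven by `GraphBuilder.makeGraph`;
    # a condensed sketch of that driver (``registry``, ``collections``,
    # ``run``, and ``userQuery`` as documented on that method):
    #
    #     scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    #     with scaffolding.connectDataIds(registry, collections,
    #                                     userQuery) as commonDataIds:
    #         scaffolding.resolveDatasetRefs(registry, collections, run,
    #                                        commonDataIds)
    #         return scaffolding.makeQuantumGraph()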
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task.  Note that there is
        # only one nested dataset dictionary for each DatasetType, shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested
        `DatasetRef` dictionaries of each `_DatasetDict` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets
            # and quanta and then connecting them to each other.
            count = 0
            for commonDataId in commonDataIds:
                count += 1
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                # We remember both those that already existed and those that
                # we create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(),
                                                         self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we
                    # can now associate the DatasetRefs for this row with it.
                    # The fact that a Quantum data ID and a dataset data ID
                    # both came from the same result row is what tells us
                    # they should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them); the dict
                    # assignments below simply overwrite them.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", count)
            yield commonDataIds
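
    # For intuition: each ``commonDataId`` row spans the union of all task
    # and dataset dimensions, and ``subset`` projects it down.  Schematically,
    # with hypothetical dataset and task dimensions:
    #
    #     commonDataId.subset(rawType.dimensions)  # {instrument, exposure, detector}
    #     commonDataId.subset(task.dimensions)     # {instrument, visit, detector}
    #
    # Rows that differ only in dimensions irrelevant to a given task or
    # dataset project down to the same data ID, which is why the dictionary
    # lookups above deduplicate naturally.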

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method resolves previously-created `DatasetRef` instances against
        the data repository and looks up prerequisite inputs for each quantum.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled later in this
            method instead.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' were present in a previous "
                    f"query, but could not be found now.  "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the output DatasetRefs in the quantum,
                # because they should remain unresolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we
                # already searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those
                # data ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their "
                           "outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]
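
    # A prerequisite connection can supply the ``lookupFunction`` consulted
    # above.  A minimal sketch (the connection itself is hypothetical; the
    # function signature matches the call in this method):
    #
    #     def _lookupBias(datasetType, registry, quantumDataId, collections):
    #         # Same as the default query; a real implementation could
    #         # filter or re-order the results before returning them.
    #         return registry.queryDatasets(datasetType, collections=collections,
    #                                       dataId=quantumDataId, deduplicate=True)
    #
    #     bias = connectionTypes.PrerequisiteInput(
    #         name="bias",
    #         storageClass="ExposureF",
    #         dimensions=("instrument", "detector", "calibration_label"),
    #         lookupFunction=_lookupBias,
    #     )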

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID and dataset
        queries.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String that defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on the
            data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
            return scaffolding.makeQuantumGraph()
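

# Typical use (illustrative sketch; the repository path, pipeline file,
# collection names, and query below are hypothetical):
#
#     from lsst.daf.butler import Butler, CollectionSearch
#     from lsst.pipe.base import Pipeline
#     from lsst.pipe.base.graphBuilder import GraphBuilder
#
#     butler = Butler("/path/to/repo")
#     pipeline = Pipeline.fromFile("pipeline.yaml")
#     builder = GraphBuilder(butler.registry)
#     graph = builder.makeGraph(
#         pipeline,
#         collections=CollectionSearch.fromExpression("HSC/defaults"),
#         run="u/someone/test-run",
#         userQuery="instrument='HSC' AND visit=12345",
#     )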