lsst.pipe.base  20.0.0-13-ge9dc5b3+55648be1db
graphBuilder.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging


# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport
# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])

class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nest contains exactly one item, as is
        the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


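# A minimal illustrative sketch of how _DatasetDict is used (the names
# ``registry`` and ``dataId`` are hypothetical; assumes a configured
# `~lsst.daf.butler.Registry` with a registered dataset type and an
# appropriate data ID in hand):
#
#     datasetType = registry.getDatasetType("calexp")
#     dd = _DatasetDict.fromDatasetTypes([datasetType],
#                                        universe=registry.dimensions)
#     dd[datasetType][dataId] = DatasetRef(datasetType, dataId)
#     dd.unpackMultiRefs()   # NamedKeyDict: DatasetType -> [DatasetRef]
#     dd.unpackSingleRefs()  # NamedKeyDict: DatasetType -> DatasetRef
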
class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.  This will raise if
        # one of the check conditions is not met, which is the intended
        # behavior.
        allInputs = self.task.taskDef.connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )
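
    # A hedged sketch of the ``adjustQuantum`` hook used above: a task's
    # connections class may override it to prune inputs or to raise if the
    # remaining inputs are unacceptable (the class and dimension names below
    # are purely illustrative):
    #
    #     class ExampleConnections(PipelineTaskConnections,
    #                              dimensions=("instrument", "visit")):
    #         def adjustQuantum(self, datasetRefMap):
    #             datasetRefMap = super().adjustQuantum(datasetRefMap)
    #             # ... drop or validate entries in datasetRefMap ...
    #             return datasetRefMap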


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method; a condensed
    sketch follows this docstring:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
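    # A condensed sketch of the four steps above, as driven by
    # `GraphBuilder.makeGraph` (assumes ``registry``, ``collections``,
    # ``run``, and ``userQuery`` are already in hand):
    #
    #     scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    #     with scaffolding.connectDataIds(registry, collections,
    #                                     userQuery) as commonDataIds:
    #         scaffolding.resolveDatasetRefs(registry, collections, run,
    #                                        commonDataIds)
    #         graph = scaffolding.makeQuantumGraph()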
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.  Note that there's
        # only one scaffolding node for each DatasetType, shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested
        data-ID dictionaries of each `_DatasetDict` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.  We
                # remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.  Many of these associations will
                    # be duplicates (because another query row that differed
                    # from this one only in irrelevant dimensions already
                    # added them), and the nested dictionaries make adding
                    # them again a no-op.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow up queries for each dataset data ID produced in
        `connectDataIds`.

        This method populates the nested `DatasetRef` dictionaries (except
        for those in `prerequisites`) with resolved references.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled when each
            quantum's outputs are examined later in this method.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because those
                # should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those
                # data ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their "
                           "outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

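    # A hedged sketch of a prerequisite-input ``lookupFunction`` with the
    # call signature used above (the function name is illustrative; the
    # default behavior when no lookup function is given is exactly the
    # registry.queryDatasets call shown in the loop):
    #
    #     def lookupPrerequisite(datasetType, registry, quantumDataId,
    #                            collections):
    #         return registry.queryDatasets(datasetType,
    #                                       collections=collections,
    #                                       dataId=quantumDataId,
    #                                       deduplicate=True)
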
    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


class _InstrumentFinder(TreeVisitor):
    """Implementation of TreeVisitor which looks for instrument names.

    An instrument should be specified as a boolean expression

        instrument = 'string'
        'string' = instrument

    so we only need to find a binary operator where the operator is "=",
    one side is a string literal, and the other side is an identifier.
    All visit methods return a tuple of (type, value); non-useful nodes
    return None for both type and value.
    """
    def __init__(self):
        self.instruments = []

    def visitNumericLiteral(self, value, node):
        # do not care about numbers
        return (None, None)

    def visitStringLiteral(self, value, node):
        # return type and value
        return ("str", value)

    def visitTimeLiteral(self, value, node):
        # do not care about these
        return (None, None)

    def visitRangeLiteral(self, start, stop, stride, node):
        # do not care about these
        return (None, None)

    def visitIdentifier(self, name, node):
        if name.lower() == "instrument":
            return ("id", "instrument")
        return (None, None)

    def visitUnaryOp(self, operator, operand, node):
        # do not care about these
        return (None, None)

    def visitBinaryOp(self, operator, lhs, rhs, node):
        if operator == "=":
            if lhs == ("id", "instrument") and rhs[0] == "str":
                self.instruments.append(rhs[1])
            elif rhs == ("id", "instrument") and lhs[0] == "str":
                self.instruments.append(lhs[1])
        return (None, None)

    def visitIsIn(self, lhs, values, not_in, node):
        # do not care about these
        return (None, None)

    def visitParens(self, expression, node):
        # do not care about these
        return (None, None)


def _findInstruments(queryStr):
    """Get the names of any instruments named in the query string by searching
    for "instrument = <value>" and similar patterns.

    Parameters
    ----------
    queryStr : `str` or `None`
        The query string to search, or `None` if there is no query.

    Returns
    -------
    instruments : `list` [`str`]
        The list of instrument names found in the query.

    Raises
    ------
    ValueError
        If the query expression can not be parsed.
    """
    if not queryStr:
        return []
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments

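# An illustrative use of _findInstruments on typical user expressions:
#
#     _findInstruments("instrument = 'HSC' AND visit = 12345")  # -> ['HSC']
#     _findInstruments("'LSSTCam' = instrument")                # -> ['LSSTCam']
#     _findInstruments(None)                                    # -> []
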

# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
            return scaffolding.makeQuantumGraph()

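    # A minimal, illustrative sketch of driving GraphBuilder end to end.
    # ``butler`` is a hypothetical, already-configured
    # `~lsst.daf.butler.Butler`; the pipeline file, collection, run, and
    # query strings are examples only:
    #
    #     from lsst.daf.butler import CollectionSearch
    #     pipeline = Pipeline.fromFile("my_pipeline.yaml")
    #     builder = GraphBuilder(butler.registry, skipExisting=True)
    #     graph = builder.makeGraph(
    #         pipeline,
    #         collections=CollectionSearch.fromExpression("HSC/defaults"),
    #         run="u/someone/test-run",
    #         userQuery="instrument = 'HSC' AND visit = 12345",
    #     )
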
    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have one,
        and verify that, if given an instrument name, there are no conflicting
        instrument restrictions in the query.

        Parameters
        ----------
        instrumentName : `str`
            The name of the instrument that should appear in the query.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            If the pipeline names an instrument and the query contains more
            than one instrument, or the name of the instrument in the query
            does not match the instrument named by the pipeline.
        """
        if not instrumentName:
            return query
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query "
                               "must have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query, add it:
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug(f"Adding restriction \"{restriction}\" to query.")
            query = f"{restriction} AND ({query})" if query else restriction  # (there may not be a query)
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument in the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\")")
        return query