lsst.pipe.base  20.0.0-14-g1ce627f+450400e286
graphBuilder.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging


# -----------------------------
#  Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        The nested per-data-ID dictionaries are shared with the input
        dictionaries rather than copied (see the sketch after this class).

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nest contains exactly one item, as is
        the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)

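
# The sketch below is illustrative only and is not used by the module.  It
# shows, with plain `dict` objects standing in for `DatasetType` keys and
# `DatasetRef` values, why `_DatasetDict.fromSubset` makes the nested
# per-data-ID dictionaries *shared* between the pipeline-level dictionaries
# and the per-task views: lookups through the `ChainMap` return the same
# nested dict objects, so populating one view is visible through the other.
# The dataset type names ("raw", "calexp") and data ID strings are
# hypothetical.
def _exampleSharedNestedDicts():  # pragma: no cover - illustrative sketch
    pipelineInputs = {"raw": {}}
    pipelineIntermediates = {"calexp": {}}
    # Mimic fromSubset: look keys up through a ChainMap and reuse the nested
    # dict objects rather than copying them.
    combined = ChainMap(pipelineInputs, pipelineIntermediates)
    taskView = {name: combined[name] for name in ("raw", "calexp")}
    taskView["raw"]["dataId-0"] = "ref-0"
    # The pipeline-level dictionary sees the same nested dict instance.
    assert pipelineInputs["raw"] == {"dataId-0": "ref-0"}
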

class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.  This will raise if
        # one of the check conditions is not met, which is the intended
        # behavior (a sketch of such an override follows this class).
        allInputs = self.task.taskDef.connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )

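
# The sketch below is illustrative only and is not used by the module.  It
# shows the shape of an ``adjustQuantum`` override on a connections class, as
# invoked by `_QuantumScaffolding.makeQuantum` above.  The class is defined
# inside a never-called function so nothing runs at import time, and the
# connection class name, its dimensions, and the trimming rule are all
# hypothetical.
def _exampleAdjustQuantumOverride():  # pragma: no cover - illustrative sketch
    from .connections import PipelineTaskConnections

    class ExampleConnections(PipelineTaskConnections, dimensions=("instrument",)):
        def adjustQuantum(self, datasetRefMap):
            # Start from the default checks provided by the base class.
            datasetRefMap = super().adjustQuantum(datasetRefMap)
            # A real override could drop optional inputs here, or raise an
            # exception to reject the quantum if its inputs are unacceptable.
            return datasetRefMap

    return ExampleConnections
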

@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize the dataset
        types and dimensions used by the pipeline.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.

    (A sketch of this four-step flow appears after the class definition
    below.)
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.  Note that there's
        # only one scaffolding node for each DatasetType, shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested
        `_DatasetDict` dictionaries with unresolved `DatasetRef` instances
        (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds, start=1):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                # We remember both those that already existed and those that
                # we create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(),
                                                         self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.  Many of these associations will
                    # be duplicates (because another query row that differed
                    # from this one only in irrelevant dimensions already
                    # added them); assigning into the nested dicts just
                    # overwrites them harmlessly.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method resolves the `DatasetRef` entries in the nested
        `_DatasetDict` dictionaries (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage; it is handled later in this
            method as each quantum's outputs are checked individually.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because those
                # should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those
                # data ID values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup.  This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                            and datasetType.dimensions <= quantum.dataId.graph
                            and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have
                        # to handle specially because the query system can't
                        # do a temporal join on a non-dimension-based timespan
                        # yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the
                            # registry, which just means there are no datasets
                            # here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their "
                           "outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph

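
# The sketch below is illustrative only and is not used by the module.  It
# spells out the four-step construction flow described in the
# `_PipelineScaffolding` docstring; `GraphBuilder.makeGraph` at the bottom of
# this file is the real driver.  The ``pipeline``, ``registry``,
# ``collections``, and ``run`` arguments are assumed to be set up by the
# caller as in `GraphBuilder.makeGraph`, and the user expression string is
# hypothetical.
def _exampleScaffoldingFlow(pipeline, registry, collections, run):  # pragma: no cover
    # Step 1: categorize dataset types and build the nested scaffolding.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    # Step 2: run the Big Join Query; the temporary table of data IDs lives
    # only for the duration of the ``with`` block.
    with scaffolding.connectDataIds(registry, collections,
                                    "instrument = 'HSC'") as commonDataIds:
        # Step 3: resolve existing datasets and look up prerequisites.
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds,
                                       skipExisting=True)
    # Step 4: assemble the QuantumGraph from the per-task quanta.
    return scaffolding.makeQuantumGraph()
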

class _InstrumentFinder(TreeVisitor):
    """Implementation of `TreeVisitor` which looks for instrument names.

    An instrument should be specified as a boolean expression of the form::

        instrument = 'string'
        'string' = instrument

    so we only need to find a binary operator where the operator is "=",
    one side is a string literal, and the other side is an identifier.
    All visit methods return a tuple of (type, value); non-useful nodes
    return `None` for both type and value.
    """
    def __init__(self):
        self.instruments = []

    def visitNumericLiteral(self, value, node):
        # do not care about numbers
        return (None, None)

    def visitStringLiteral(self, value, node):
        # return type and value
        return ("str", value)

    def visitTimeLiteral(self, value, node):
        # do not care about these
        return (None, None)

    def visitRangeLiteral(self, start, stop, stride, node):
        # do not care about these
        return (None, None)

    def visitIdentifier(self, name, node):
        if name.lower() == "instrument":
            return ("id", "instrument")
        return (None, None)

    def visitUnaryOp(self, operator, operand, node):
        # do not care about these
        return (None, None)

    def visitBinaryOp(self, operator, lhs, rhs, node):
        if operator == "=":
            if lhs == ("id", "instrument") and rhs[0] == "str":
                self.instruments.append(rhs[1])
            elif rhs == ("id", "instrument") and lhs[0] == "str":
                self.instruments.append(lhs[1])
        return (None, None)

    def visitIsIn(self, lhs, values, not_in, node):
        # do not care about these
        return (None, None)

    def visitParens(self, expression, node):
        # do not care about these
        return (None, None)


def _findInstruments(queryStr):
    """Get the names of any instruments named in the query string by searching
    for "instrument = <value>" and similar patterns.

    Parameters
    ----------
    queryStr : `str` or `None`
        The query string to search, or `None` if there is no query.

    Returns
    -------
    instruments : `list` [`str`]
        The list of instrument names found in the query.

    Raises
    ------
    ValueError
        If the query expression cannot be parsed.
    """
    if not queryStr:
        return []
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments

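
# A small illustrative check (not used by the module): given a user
# expression in the daf_butler query grammar, `_findInstruments` extracts the
# instrument names compared for equality, in either operand order, and
# ignores everything else.  The expression strings and instrument names here
# are hypothetical.
def _exampleFindInstruments():  # pragma: no cover - illustrative sketch
    assert _findInstruments("instrument = 'HSC' AND visit > 100") == ["HSC"]
    assert _findInstruments("'LSSTCam' = instrument") == ["LSSTCam"]
    assert _findInstruments(None) == []
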

# ------------------------
#  Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID and dataset
        queries.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String expression that defines a user selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()

    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have one,
        and verify that if given an instrument name there are no other
        instrument restrictions in the query.

        Parameters
        ----------
        instrumentName : `str` or `None`
            The name of the instrument that should appear in the query, or
            `None` (or an empty string) if there is no instrument.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            If the pipeline names an instrument and the query contains more
            than one instrument, or the name of the instrument in the query
            does not match the instrument named by the pipeline.
        """
        if not instrumentName:
            return query
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query "
                               "must have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query, add it:
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug(f"Adding restriction \"{restriction}\" to query.")
            query = f"{restriction} AND ({query})" if query else restriction  # (there may not be a query)
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument in the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\").")
        return query
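

# The sketch below is illustrative only and is not used by the module.  It
# shows typical driving code for `GraphBuilder`.  The repository path,
# collection names, run name, and pipeline file are hypothetical, and the
# `Butler`, `CollectionSearch.fromExpression`, and `Pipeline.fromFile` APIs
# are assumed to behave as in the daf_butler / pipe_base versions this file
# is built against.
def _exampleMakeGraph():  # pragma: no cover - illustrative sketch
    from lsst.daf.butler import Butler
    from lsst.daf.butler.registry import CollectionSearch

    butler = Butler("/path/to/repo")  # hypothetical repository
    builder = GraphBuilder(butler.registry, skipExisting=True)
    quantumGraph = builder.makeGraph(
        pipeline=Pipeline.fromFile("pipeline.yaml"),  # hypothetical file
        collections=CollectionSearch.fromExpression(["HSC/raw/all", "HSC/calib"]),
        run="u/someone/test-run",
        userQuery="visit = 12345",
    )
    return quantumGraph

# `_verifyInstrumentRestriction` itself is static and side-effect free, so
# its behavior can be shown directly:
#
#     GraphBuilder._verifyInstrumentRestriction("HSC", "visit > 100")
#     # -> "instrument = 'HSC' AND (visit > 100)"
#     GraphBuilder._verifyInstrumentRestriction("HSC", "")
#     # -> "instrument = 'HSC'"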