lsst.pipe.base  20.0.0-10-gd6f3b0e+e5a7ca23b1
graphBuilder.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Module defining GraphBuilder class and related methods.
"""

from __future__ import annotations

__all__ = ['GraphBuilder']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging


# -----------------------------
#  Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nest contains exactly one item, as is
        the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
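
        Examples
        --------
        Illustrative sketch, assuming ``d`` is a `_DatasetDict` whose nested
        dicts each hold exactly one ref (as for "init" datasets)::

            refs = d.unpackSingleRefs()
            # refs[datasetType] is a single DatasetRef, not a nested dict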
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to look up dataset types
        and dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
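
    Examples
    --------
    The four steps above are normally driven by `GraphBuilder.makeGraph`; a
    minimal sketch of that sequence (``registry``, ``collections``, ``run``,
    and ``userQuery`` are assumed to be supplied by the caller) looks like::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(registry, collections,
                                        userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run,
                                           commonDataIds)
        graph = scaffolding.makeQuantumGraph()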
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the data IDs of
        the nested `_DatasetDict` dictionaries (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds, start=1):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                # We remember both those that already existed and those that
                # we create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(),
                                                         self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.  Many of these associations will
                    # be duplicates (because another query row that differed
                    # from this one only in irrelevant dimensions already
                    # added them), and the nested dictionaries deduplicate
                    # them for us.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method resolves the `DatasetRef` entries populated by
        `connectDataIds` (except for those in `prerequisites`) and looks up
        prerequisite datasets for each quantum.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled later in this
            method instead.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because those
                # should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


class _InstrumentFinder(TreeVisitor):
    """Implementation of `TreeVisitor` which looks for an instrument name.

    An instrument should be specified as a boolean expression of the form

        instrument = 'string'
        'string' = instrument

    so we only need to find a binary operator where the operator is "=",
    one side is a string literal, and the other side is an identifier.
    All visit methods return a tuple of (type, value); non-useful nodes
    return None for both type and value.
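
    Examples
    --------
    A short illustration of the intended use (the query string is an assumed
    example)::

        finder = _InstrumentFinder()
        tree = ParserYacc().parse("instrument = 'HSC' AND visit > 100")
        tree.visit(finder)
        assert finder.instruments == ["HSC"]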
    """
    def __init__(self):
        self.instruments = []

    def visitNumericLiteral(self, value, node):
        # do not care about numbers
        return (None, None)

    def visitStringLiteral(self, value, node):
        # return type and value
        return ("str", value)

    def visitTimeLiteral(self, value, node):
        # do not care about these
        return (None, None)

    def visitRangeLiteral(self, start, stop, stride, node):
        # do not care about these
        return (None, None)

    def visitIdentifier(self, name, node):
        if name.lower() == "instrument":
            return ("id", "instrument")
        return (None, None)

    def visitUnaryOp(self, operator, operand, node):
        # do not care about these
        return (None, None)

    def visitBinaryOp(self, operator, lhs, rhs, node):
        if operator == "=":
            if lhs == ("id", "instrument") and rhs[0] == "str":
                self.instruments.append(rhs[1])
            elif rhs == ("id", "instrument") and lhs[0] == "str":
                self.instruments.append(lhs[1])
        return (None, None)

    def visitIsIn(self, lhs, values, not_in, node):
        # do not care about these
        return (None, None)

    def visitParens(self, expression, node):
        # do not care about these
        return (None, None)


def _findInstruments(queryStr):
    """Return the instrument names found in a user query expression.

    Parameters
    ----------
    queryStr : `str`
        The query expression to search.

    Returns
    -------
    instruments : `list` [ `str` ]
        Instrument names that appear in ``instrument = '...'`` comparisons.

    Raises
    ------
    ValueError
        Raised if the query expression cannot be parsed.
    """
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments


# ------------------------
#  Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID and dataset
        queries.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
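
    Examples
    --------
    A minimal sketch of typical use (``butler``, ``pipeline``,
    ``collections``, ``run``, and ``query`` are assumed to be supplied by
    the caller)::

        builder = GraphBuilder(butler.registry)
        graph = builder.makeGraph(pipeline, collections, run, query)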
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()

    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have one,
        and verify that, if given an instrument name, there are no other
        instrument restrictions in the query.

        Parameters
        ----------
        instrumentName : `str`
            The name of the instrument that should appear in the query.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            If the pipeline names an instrument and the query contains more
            than one instrument or the name of the instrument in the query
            does not match the instrument named by the pipeline.
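
        Examples
        --------
        Illustrative behavior (assumed inputs)::

            GraphBuilder._verifyInstrumentRestriction("HSC", "visit = 12")
            # -> "instrument = 'HSC' AND (visit = 12)"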
        """
        if not instrumentName:
            return query
        if not query:
            # There is no query to check or extend; the restriction is the
            # whole query.
            return f"instrument = '{instrumentName}'"
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query "
                               "must have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query, add it:
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug("Adding restriction \"%s\" to query.", restriction)
            query = f"{restriction} AND ({query})"
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument in the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\").")
        return query