lsst.pipe.base  20.0.0-19-gcdd82e7+6f5ab6e0f6
graphBuilder.py
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List, Set
import logging


# -----------------------------
#  Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
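
    # For example (an illustrative sketch; ``dt1``, ``dt2``, and ``registry``
    # are assumptions, not names from this module):
    #     _DatasetDict.fromDatasetTypes([dt1, dt2], universe=registry.dimensions)
    # maps each DatasetType to an empty dict and attaches ``universe``.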

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
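        # ChainMap searches ``first`` and then each element of ``rest`` in
        # order, so when the same DatasetType appears in more than one
        # dictionary, the value from the earliest one wins.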
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
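        # getOne uses single-target tuple unpacking, which raises ValueError
        # unless the nested dict holds exactly one ref, enforcing the
        # single-element assumption documented above.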
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction. Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable. This will raise if
        # one of the check conditions is not met, which is the intended
        # behavior.
        allInputs = self.task.taskDef.connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            inputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumSet(self) -> Set[Quantum]:
        """Create a `set` of `Quantum` from the information in ``self``.

        Returns
        -------
        nodes : `set` of `Quantum`
            The `Quantum` elements corresponding to this task.
        """
        return set(q.makeQuantum() for q in self.quanta.values())


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method (a driver
    sketch follows this docstring):

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, or intermediate dataset (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate. We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
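    # An illustrative sketch of how these four steps are driven (this mirrors
    # `GraphBuilder.makeGraph` below; it is not an additional API):
    #
    #     scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    #     with scaffolding.connectDataIds(registry, collections,
    #                                     userQuery) as commonDataIds:
    #         scaffolding.resolveDatasetRefs(registry, collections, run,
    #                                        commonDataIds)
    #     graph = scaffolding.makeQuantumGraph()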
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node. Note that there's
        # only one scaffolding node for each DatasetType, shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested
        `_DatasetDict` dictionaries (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`. Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
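
        Examples
        --------
        Intended to be used only as a context manager (an illustrative
        sketch; the names here are as used elsewhere in this module)::

            with scaffolding.connectDataIds(registry, collections,
                                            userQuery) as commonDataIds:
                scaffolding.resolveDatasetRefs(registry, collections, run,
                                               commonDataIds)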
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other. ``n`` counts the
            # rows processed; it stays 0 if the query returned no results.
            n = 0
            for n, commonDataId in enumerate(commonDataIds, start=1):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already. We
                # remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(),
                                                         self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it. The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated. Many of these associations will
                    # be duplicates (because another query row that differed
                    # from this one only in irrelevant dimensions already
                    # added them); the nested dict assignments below make
                    # re-adding them harmless.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method populates the nested `DatasetRef` dictionaries with
        resolved refs (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``. Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`. The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled later in this
            method instead.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True. Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because those
                # should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) "
                                f"and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup. This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                            and datasetType.dimensions <= quantum.dataId.graph
                            and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have
                        # to handle specially because the query system can't
                        # do a temporal join on a non-dimension-based timespan
                        # yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the
                            # registry, which just means there are no datasets
                            # here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their "
                           "outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph({task.taskDef: task.makeQuantumSet() for task in self.tasks})
        return graph


class _InstrumentFinder(TreeVisitor):
    """Implementation of `TreeVisitor` which looks for instrument names.

    An instrument should be specified as a boolean expression of the form

        instrument = 'string'
        'string' = instrument

    so we only need to find a binary operator where the operator is "=",
    one side is a string literal, and the other side is an identifier.
    All visit methods return a tuple of (type, value); non-useful nodes
    return `None` for both type and value.
    """
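    # Illustrative wiring (see `_findInstruments` below): build a parse tree
    # with ParserYacc, call ``tree.visit(finder)`` on an instance of this
    # class, and read the accumulated names from ``finder.instruments``.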
    def __init__(self):
        self.instruments = []

    def visitNumericLiteral(self, value, node):
        # do not care about numbers
        return (None, None)

    def visitStringLiteral(self, value, node):
        # return type and value
        return ("str", value)

    def visitTimeLiteral(self, value, node):
        # do not care about these
        return (None, None)

    def visitRangeLiteral(self, start, stop, stride, node):
        # do not care about these
        return (None, None)

    def visitIdentifier(self, name, node):
        if name.lower() == "instrument":
            return ("id", "instrument")
        return (None, None)

    def visitUnaryOp(self, operator, operand, node):
        # do not care about these
        return (None, None)

    def visitBinaryOp(self, operator, lhs, rhs, node):
        if operator == "=":
            if lhs == ("id", "instrument") and rhs[0] == "str":
                self.instruments.append(rhs[1])
            elif rhs == ("id", "instrument") and lhs[0] == "str":
                self.instruments.append(lhs[1])
        return (None, None)

    def visitIsIn(self, lhs, values, not_in, node):
        # do not care about these
        return (None, None)

    def visitParens(self, expression, node):
        # do not care about these
        return (None, None)


def _findInstruments(queryStr):
    """Get the names of any instrument named in the query string by searching
    for "instrument = <value>" and similar patterns.

    Parameters
    ----------
    queryStr : `str` or `None`
        The query string to search, or `None` if there is no query.

    Returns
    -------
    instruments : `list` [`str`]
        The list of instrument names found in the query.

    Raises
    ------
    ValueError
        If the query expression cannot be parsed.
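
    Examples
    --------
    An illustrative example (the instrument and visit values here are
    hypothetical):

    >>> _findInstruments("instrument = 'HSC' AND visit = 12345")
    ['HSC']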
    """
    if not queryStr:
        return []
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments


# ------------------------
#  Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID and dataset
        queries.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
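
    Examples
    --------
    Typical use (an illustrative sketch; ``registry``, ``pipeline``,
    ``collections``, and the run name here are assumptions, not part of this
    module)::

        builder = GraphBuilder(registry, skipExisting=True)
        graph = builder.makeGraph(pipeline, collections, run="u/user/run",
                                  userQuery="instrument = 'HSC'")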
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()

    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have one,
        and, if an instrument name is given, verify that the query contains no
        conflicting instrument restrictions.

        Parameters
        ----------
        instrumentName : `str`
            The name of the instrument that should appear in the query.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            If the pipeline names an instrument and the query contains more
            than one instrument or the name of the instrument in the query
            does not match the instrument named by the pipeline.
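
        Examples
        --------
        Illustrative doctests (the instrument and visit values are
        hypothetical):

        >>> GraphBuilder._verifyInstrumentRestriction("HSC", "visit = 12345")
        "instrument = 'HSC' AND (visit = 12345)"
        >>> GraphBuilder._verifyInstrumentRestriction("HSC", "")
        "instrument = 'HSC'"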
        """
        if not instrumentName:
            return query
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query "
                               "must have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query, add it:
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug(f"Adding restriction \"{restriction}\" to query.")
            query = f"{restriction} AND ({query})" if query else restriction  # (there may not be a query)
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument in the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\")")
        return query