# Coverage report header: python/lsst/pipe/base/graphBuilder.py, 22% line coverage.
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging


# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
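

# Illustrative sketch of how the graph builder populates and unpacks a
# _DatasetDict.  ``datasetType``, ``dataId``, and ``ref`` are assumed to be
# supplied by the caller (e.g. from `PipelineDatasetTypes` and a registry
# query); only _DatasetDict's own API is exercised here.
def _exampleDatasetDictUsage(datasetType: DatasetType, dataId: DataCoordinate,
                             ref: DatasetRef) -> NamedKeyDict:
    holder = _DatasetDict.fromDatasetTypes([datasetType], universe=dataId.universe)
    # The nested dicts are keyed by data ID, so repeated inserts of the same
    # data ID are harmless.
    holder[datasetType][dataId] = ref
    # unpackSingleRefs assumes exactly one ref per dataset type (as for init
    # datasets); unpackMultiRefs returns every ref of each type as a list.
    assert holder.unpackSingleRefs()[datasetType] is ref
    return holder.unpackMultiRefs()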


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.  This will raise if
        # one of the check conditions is not met, which is the intended
        # behavior.
        allInputs = self.task.taskDef.connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )
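

# Minimal sketch of what makeQuantum produces for a fully populated
# scaffolding object: regular and prerequisite inputs are merged into the
# Quantum's predicted inputs, while init-inputs come from the owning task.
# ``quantumScaffolding`` is assumed to have been populated by
# _PipelineScaffolding (below); ``Quantum.predictedInputs`` is assumed here to
# expose the refs passed as ``predictedInputs`` above.
def _exampleInspectQuantum(quantumScaffolding: _QuantumScaffolding) -> Quantum:
    quantum = quantumScaffolding.makeQuantum()
    for datasetType, refs in quantum.predictedInputs.items():
        _LOG.debug("Quantum %s expects %d input(s) of type %s.",
                   quantum.dataId, len(refs), datasetType.name)
    return quantum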


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.

    A usage sketch of this four-step sequence appears after this class
    definition.
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.  Note that there's
        # only one scaffolding node for each DatasetType, shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested,
        data-ID-keyed dictionaries of the `_DatasetDict` attributes (except
        for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.  We
                # remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.  Many of these associations will
                    # be duplicates (because another query row that differed
                    # from this one only in irrelevant dimensions already
                    # added them), but since the nested dictionaries are keyed
                    # by data ID, repeating them is harmless.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method replaces the unresolved `DatasetRef` instances recorded by
        `connectDataIds` with resolved ones where appropriate, and looks up
        prerequisite datasets for each quantum.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage; it is handled later in this
            method, when the outputs of each quantum are checked.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the output DatasetRefs in the quantum
                # because those should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup.  This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                            and datasetType.dimensions <= quantum.dataId.graph
                            and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have
                        # to handle specially because the query system can't
                        # do a temporal join on a non-dimension-based timespan
                        # yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the
                            # registry, which just means there are no datasets
                            # here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph
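

# Usage sketch of the four-step sequence described in the _PipelineScaffolding
# docstring; ``registry``, ``collections``, ``run``, and ``userQuery`` are
# assumed to be supplied by the caller, exactly as GraphBuilder.makeGraph does
# below.
def _exampleScaffoldingSequence(pipeline, registry, collections, run, userQuery):
    # Step 1: categorize dataset types and build the nested scaffolding.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    # Step 2: run the "Big Join Query" and associate data IDs with quanta.
    with scaffolding.connectDataIds(registry, collections, userQuery) as commonDataIds:
        # Step 3: resolve already-existing datasets and look up prerequisites.
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        # Step 4: assemble the QuantumGraph while the temporary table exists.
        return scaffolding.makeQuantumGraph()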


class _InstrumentFinder(TreeVisitor):
    """Implementation of `TreeVisitor` that looks for instrument names.

    An instrument should be specified as a boolean expression of the form

        instrument = 'string'
        'string' = instrument

    so we only need to find a binary operator where the operator is "=",
    one side is a string literal, and the other side is an identifier.
    All visit methods return a tuple of (type, value); nodes that are not
    useful return `None` for both type and value.
    """
    def __init__(self):
        self.instruments = []

    def visitNumericLiteral(self, value, node):
        # do not care about numbers
        return (None, None)

    def visitStringLiteral(self, value, node):
        # return type and value
        return ("str", value)

    def visitTimeLiteral(self, value, node):
        # do not care about these
        return (None, None)

    def visitRangeLiteral(self, start, stop, stride, node):
        # do not care about these
        return (None, None)

    def visitIdentifier(self, name, node):
        if name.lower() == "instrument":
            return ("id", "instrument")
        return (None, None)

    def visitUnaryOp(self, operator, operand, node):
        # do not care about these
        return (None, None)

    def visitBinaryOp(self, operator, lhs, rhs, node):
        if operator == "=":
            if lhs == ("id", "instrument") and rhs[0] == "str":
                self.instruments.append(rhs[1])
            elif rhs == ("id", "instrument") and lhs[0] == "str":
                self.instruments.append(lhs[1])
        return (None, None)

    def visitIsIn(self, lhs, values, not_in, node):
        # do not care about these
        return (None, None)

    def visitParens(self, expression, node):
        # do not care about these
        return (None, None)


def _findInstruments(queryStr):
    """Get the names of any instrument named in the query string by searching
    for "instrument = <value>" and similar patterns.

    Parameters
    ----------
    queryStr : `str` or `None`
        The query string to search, or `None` if there is no query.

    Returns
    -------
    instruments : `list` [`str`]
        The list of instrument names found in the query.

    Raises
    ------
    ValueError
        Raised if the query expression cannot be parsed.
    """
    if not queryStr:
        return []
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments
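

# Illustrative behavior of _findInstruments; the query strings are
# hypothetical user expressions and the instrument names are placeholders.
def _exampleFindInstruments():
    assert _findInstruments("instrument = 'HSC' AND visit > 100") == ["HSC"]
    assert _findInstruments("'LSSTCam' = instrument") == ["LSSTCam"]
    assert _findInstruments(None) == []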


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Data butler instance.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create an execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines the user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
            return scaffolding.makeQuantumGraph()

    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have one,
        and, if an instrument name is given, verify that the query contains no
        other instrument restrictions.

        Parameters
        ----------
        instrumentName : `str`
            The name of the instrument that should appear in the query.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            Raised if the pipeline names an instrument and the query contains
            more than one instrument, or if the name of the instrument in the
            query does not match the instrument named by the pipeline.
        """
        if not instrumentName:
            return query
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query must "
                               "have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query, add it:
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug(f"Adding restriction \"{restriction}\" to query.")
            query = f"{restriction} AND ({query})" if query else restriction  # (there may not be a query)
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument in the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\")")
        return query
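

# Usage sketch for GraphBuilder.  ``registry`` is an `lsst.daf.butler.Registry`
# and ``pipeline``, ``collections``, and ``run`` are assumed to be in the
# forms documented by makeGraph above; the ``userQuery`` value is a
# hypothetical example.  When the pipeline declares an instrument, the query
# is augmented via _verifyInstrumentRestriction, e.g. "visit = 12345" may
# become "instrument = 'HSC' AND (visit = 12345)".
def _exampleGraphBuilderUsage(registry, pipeline, collections, run):
    builder = GraphBuilder(registry, skipExisting=True)
    return builder.makeGraph(pipeline, collections, run, userQuery="visit = 12345")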