Coverage for python/lsst/pipe/base/graphBuilder.py : 23%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)
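
    # Note that ``combined[datasetType]`` above returns the nested dict object
    # itself (not a copy), so _DatasetDicts built with ``fromSubset`` share
    # their per-DatasetType ref dictionaries with the source dictionaries.
    # A quick illustration (``datasetType``, ``universe``, ``dataId``, and
    # ``ref`` are hypothetical placeholders, not defined in this module):
    #
    #     parent = _DatasetDict.fromDatasetTypes([datasetType], universe=universe)
    #     child = _DatasetDict.fromSubset([datasetType], parent)
    #     parent[datasetType][dataId] = ref
    #     assert child[datasetType][dataId] is ref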

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
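
# The sketch below is an illustration only (not part of the original module) of
# how a _DatasetDict is typically built and then unpacked; ``registry`` is
# assumed to be a populated `lsst.daf.butler.Registry` and ``calexpType`` an
# already-registered `DatasetType` with visit+detector dimensions:
#
#     refsByType = _DatasetDict.fromDatasetTypes([calexpType],
#                                                universe=registry.dimensions)
#     dataId = registry.expandDataId(instrument="HSC", visit=903334, detector=16)
#     refsByType[calexpType][dataId] = DatasetRef(calexpType, dataId)
#     # Flatten to {DatasetType: [DatasetRef, ...]} for Quantum construction.
#     flat = refsByType.unpackMultiRefs()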


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this quantum
        represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction. Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )
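
# ``adjustQuantum`` is the hook a task author can override (on the task's
# PipelineTaskConnections subclass) to trim or validate a quantum's inputs
# before the graph is built. The following is an illustrative sketch only;
# the class name and connection layout are hypothetical, not part of this
# module:
#
#     class MyTaskConnections(PipelineTaskConnections,
#                             dimensions=("visit", "detector")):
#         ...
#
#         def adjustQuantum(self, datasetRefMap):
#             # Run the base-class checks first, then filter the per-type
#             # DatasetRef lists as needed before returning the mapping.
#             datasetRefMap = super().adjustQuantum(datasetRefMap)
#             return datasetRefMap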


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to look up dataset types and
        the dimension universe.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the subsets
       that correspond to distinct data IDs for each task and dataset type,
       and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate. We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task. Note that the nested
        # dictionary of refs for each DatasetType is shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates the nested dictionaries of the regular input,
        intermediate, and output `_DatasetDict` attributes with unresolved
        `DatasetRef` instances, and creates `_QuantumScaffolding` objects in
        `_TaskScaffolding.quanta` (prerequisites are handled later, in
        `resolveDatasetRefs`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`. Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds, start=1):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                # We remember both those that already existed and those that
                # we create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it. The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them); assigning
                    # into the nested dicts just overwrites them harmlessly.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            # With ``start=1`` above, ``n`` counts rows processed, or stays 0
            # if the query returned nothing.
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds
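
    # The query pattern used above is ordinary registry usage; outside of
    # graph generation the same idiom looks like this (illustrative sketch,
    # assuming ``registry`` and ``collections`` exist and "raw" is a
    # registered dataset type):
    #
    #     with registry.queryDataIds(["visit", "detector"],
    #                                datasets=["raw"],
    #                                collections=collections,
    #                                where="instrument = 'HSC' AND visit = 903334",
    #                                ).materialize() as dataIds:
    #         for dataId in dataIds.expanded():
    #             print(dataId["visit"], dataId["detector"])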

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method replaces the unresolved `DatasetRef` instances stored in
        the nested `_DatasetDict` dictionaries with resolved ones where
        appropriate, and looks up prerequisite inputs for each quantum.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``. Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`. The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage; it is detected later in this
            method, when individual quanta are examined.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting, and I could imagine that being useful;
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True. Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the output DatasetRefs stored in the
                # quantum, because those should remain unresolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{task.taskDef.label}' has some outputs that exist ({resolvedRefs}) "
                                f"and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]
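
    # A prerequisite connection may carry a custom ``lookupFunction`` with the
    # call signature used above. Illustrative sketch only (the function name
    # is hypothetical):
    #
    #     def lookupBrightObjectMask(datasetType, registry, quantumDataId, collections):
    #         # Must return an iterable of DatasetRef; here we simply defer to
    #         # a registry query constrained by the quantum data ID.
    #         return registry.queryDatasets(datasetType,
    #                                       collections=collections,
    #                                       dataId=quantumDataId,
    #                                       deduplicate=True)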

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


class _InstrumentFinder(TreeVisitor):
    """Implementation of `TreeVisitor` which looks for an instrument name.

    The instrument should be specified as a boolean expression of the form

        instrument = 'string'
        'string' = instrument

    so we only need to find a binary operator where the operator is "=",
    one side is a string literal, and the other side is an identifier.
    All visit methods return a tuple of (type, value); non-useful nodes
    return `None` for both type and value.
    """
    def __init__(self):
        self.instruments = []

    def visitNumericLiteral(self, value, node):
        # do not care about numbers
        return (None, None)

    def visitStringLiteral(self, value, node):
        # return type and value
        return ("str", value)

    def visitTimeLiteral(self, value, node):
        # do not care about these
        return (None, None)

    def visitRangeLiteral(self, start, stop, stride, node):
        # do not care about these
        return (None, None)

    def visitIdentifier(self, name, node):
        if name.lower() == "instrument":
            return ("id", "instrument")
        return (None, None)

    def visitUnaryOp(self, operator, operand, node):
        # do not care about these
        return (None, None)

    def visitBinaryOp(self, operator, lhs, rhs, node):
        if operator == "=":
            if lhs == ("id", "instrument") and rhs[0] == "str":
                self.instruments.append(rhs[1])
            elif rhs == ("id", "instrument") and lhs[0] == "str":
                self.instruments.append(lhs[1])
        return (None, None)

    def visitIsIn(self, lhs, values, not_in, node):
        # do not care about these
        return (None, None)

    def visitParens(self, expression, node):
        # do not care about these
        return (None, None)


def _findInstruments(queryStr):
    """Return the list of instrument names referenced in a user query
    expression, raising `ValueError` if the expression cannot be parsed.
    """
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments
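
# Illustrative examples for the helper above (not executed anywhere in this
# module); the expressions use the standard butler query-expression syntax:
#
#     _findInstruments("instrument = 'HSC' AND visit = 903334")   # -> ['HSC']
#     _findInstruments("visit = 903334")                          # -> []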


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()
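
    # Illustrative end-to-end sketch of GraphBuilder usage; the repository
    # path, pipeline file, collection name, and run name below are
    # assumptions, not part of this module:
    #
    #     from lsst.daf.butler import Butler, CollectionSearch
    #
    #     butler = Butler("repo", writeable=False)
    #     pipeline = Pipeline.fromFile("pipeline.yaml")
    #     builder = GraphBuilder(butler.registry, skipExisting=True)
    #     graph = builder.makeGraph(
    #         pipeline,
    #         collections=CollectionSearch.fromExpression("HSC/defaults"),
    #         run="u/someone/test-run",
    #         userQuery="visit = 903334",
    #     )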

    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have one,
        and, if an instrument name is given, verify that the query contains no
        conflicting instrument restrictions.

        Parameters
        ----------
        instrumentName : `str`
            The name of the instrument that should appear in the query, or
            `None`/empty if the pipeline does not name an instrument.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            If the pipeline names an instrument and the query contains more
            than one instrument, or the instrument named in the query does
            not match the instrument named by the pipeline.
        """
        if not instrumentName:
            return query
        if not query:
            # There is no user query at all; the instrument restriction is
            # the entire query.
            return f"instrument = '{instrumentName}'"
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query must "
                               "have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query, add it:
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug(f"Adding restriction \"{restriction}\" to query.")
            query = f"{restriction} AND ({query})"
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument in the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\").")
        return query
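
# Illustrative behavior of GraphBuilder._verifyInstrumentRestriction (these
# lines are examples only and are not executed anywhere in this module):
#
#     GraphBuilder._verifyInstrumentRestriction("HSC", "visit = 903334")
#     # -> "instrument = 'HSC' AND (visit = 903334)"
#     GraphBuilder._verifyInstrumentRestriction("HSC", "instrument = 'HSC' AND visit = 903334")
#     # -> unchanged: "instrument = 'HSC' AND visit = 903334"
#     GraphBuilder._verifyInstrumentRestriction("HSC", "instrument = 'LSSTCam'")
#     # -> raises RuntimeError (query instrument does not match the pipeline's)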