# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections, AdjustQuantumHelper
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.utils import doImport

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
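
    Examples
    --------
    A minimal, illustrative sketch only (not an executable doctest); it
    assumes an `lsst.daf.butler.Registry` named ``registry``, a
    `DataCoordinate` named ``dataId``, and a registered dataset type called
    ``"calexp"``:

    >>> calexp = registry.getDatasetType("calexp")
    >>> d = _DatasetDict.fromDatasetTypes([calexp], universe=registry.dimensions)
    >>> d[calexp][dataId] = DatasetRef(calexp, dataId)
    >>> d.unpackMultiRefs()["calexp"]
    [DatasetRef(...)]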
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
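
        Examples
        --------
        A sketch only; ``calexp``, ``parent_inputs``, and
        ``parent_intermediates`` are assumed to exist, with ``calexp`` already
        a key of ``parent_inputs``. The nested value dictionaries are shared
        with the source mappings rather than copied, which is how the
        pipeline-level and per-task structures in this module stay in sync:

        >>> subset = _DatasetDict.fromSubset([calexp], parent_inputs,
        ...                                  parent_intermediates)
        >>> subset[calexp] is parent_inputs[calexp]
        True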
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
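
        Examples
        --------
        Illustrative only, continuing the class-level sketch above (``calexp``
        and ``dataId`` are assumed):

        >>> list(d.extract(calexp, [dataId]))
        [DatasetRef(...)]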
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this quantum
        represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction. Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        # If it raises NoWorkFound, there is a bug in the QG algorithm
        # or adjustQuantum is incorrectly trying to make a prerequisite
        # input behave like a regular input; adjustQuantum should only raise
        # NoWorkFound if a regular input is missing, and it shouldn't be
        # possible for us to have generated ``self`` if that's true.
        helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
        helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            inputs=helper.inputs,
            outputs=helper.outputs,
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumSet(self) -> Set[Quantum]:
        """Create a `set` of `Quantum` from the information in ``self``.

        Returns
        -------
        nodes : `set` of `Quantum`
            The `Quantum` elements corresponding to this task.
        """
        return set(q.makeQuantum() for q in self.quanta.values())


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize the pipeline's
        dataset types and to obtain the dimension universe.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the subsets
       that correspond to distinct data IDs for each task and dataset type,
       and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate. We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
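
    Examples
    --------
    A condensed sketch of the driving sequence, with ``pipeline``,
    ``registry``, ``collections``, ``run``, and ``userQuery`` assumed to be
    provided by the caller; this is essentially what `GraphBuilder.makeGraph`
    does:

    >>> scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    >>> externalDataId = DataCoordinate.makeEmpty(registry.dimensions)
    >>> with scaffolding.connectDataIds(registry, collections, userQuery,
    ...                                 externalDataId) as commonDataIds:
    ...     scaffolding.resolveDatasetRefs(registry, collections, run,
    ...                                    commonDataIds)
    >>> graph = scaffolding.makeQuantumGraph()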
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery, externalDataId):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets. May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        userQuery : `str` or `None`
            User-provided expression to limit the data IDs processed.
        externalDataId : `DataCoordinate`
            Externally-provided data ID that should be used to restrict the
            results, just as if these constraints had been included via ``AND``
            in ``userQuery``. This includes (at least) any instrument named
            in the pipeline definition.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`. Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   dataId=externalDataId,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result row,
                # noting that we might have created some already.
                # We remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it. The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them), and we use
                    # sets to skip them.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True,
                           clobberOutputs=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets. May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``. Ignored if ``run`` is `None`.
        clobberOutputs : `bool`, optional
            If `True` (default), allow quanta to be created even if outputs
            exist; this requires the same behavior to be enabled when
            executing. If ``skipExisting`` is also `True`, completed quanta
            (those with metadata, or all outputs if there is no metadata
            dataset configured) will be skipped rather than clobbered.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`, or if only some outputs are
            present and ``clobberOutputs`` is `False`.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    findFirst=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful - it's
                    # probably required in order to support writing initOutputs
                    # before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit or clobberOutputs:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                findFirst=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsFailed = []
            dataIdsSucceeded = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting and/or clobberOutputs is True.
                # Note that if skipExisting is False, any output datasets that
                # already exist would have already caused an exception to be
                # raised. We never update the DatasetRefs in the quantum
                # because those should never be resolved.
                if run is not None and (skipExisting or clobberOutputs):
                    resolvedRefs = []
                    unresolvedRefs = []
                    haveMetadata = False
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                                if datasetType.name == task.taskDef.metadataDatasetName:
                                    haveMetadata = True
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if haveMetadata or not unresolvedRefs:
                            dataIdsSucceeded.append(quantum.dataId)
                            if skipExisting:
                                continue
                        else:
                            dataIdsFailed.append(quantum.dataId)
                            if not clobberOutputs:
                                raise OutputExistsError(
                                    f"Quantum {quantum.dataId} of task with label "
                                    f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                    f"({resolvedRefs}) "
                                    f"and others that don't ({unresolvedRefs}), with no metadata output, "
                                    "and clobbering outputs was not enabled."
                                )
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data ID
                # values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup. This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                          and datasetType.dimensions <= quantum.dataId.graph
                          and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have to
                        # handle specially because the query system can't do a
                        # temporal join on a non-dimension-based timespan yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the registry,
                            # which just means there are no datasets here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           findFirst=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsSucceeded:
                if skipExisting:
768 _LOG.debug("Pruning successful %d quanta for task with label '%s' because all of their "
769 "outputs exist or metadata was written successfully.",
770 len(dataIdsSucceeded), task.taskDef.label)
                    for dataId in dataIdsSucceeded:
                        del task.quanta[dataId]
                elif clobberOutputs:
                    _LOG.info("Found %d successful quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsSucceeded),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")
            if dataIdsFailed:
                if clobberOutputs:
                    _LOG.info("Found %d failed/incomplete quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsFailed),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")

    def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Parameters
        ----------
        metadata : Optional Mapping of `str` to primitives
            This is an optional parameter of extra data to carry with the
            graph. Entries in this mapping should be able to be serialized in
            JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
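
        Examples
        --------
        Sketch only; the metadata key shown is hypothetical:

        >>> graph = scaffolding.makeQuantumGraph(
        ...     metadata={"comment": "nightly test build"})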
        """
        graph = QuantumGraph({task.taskDef: task.makeQuantumSet() for task in self.tasks}, metadata=metadata)
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Data butler registry instance.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    clobberOutputs : `bool`, optional
        If `True` (default), allow quanta to be created even if partial outputs
        exist; this requires the same behavior to be enabled when
        executing.
    """

    def __init__(self, registry, skipExisting=True, clobberOutputs=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting
        self.clobberOutputs = clobberOutputs

    def makeGraph(self, pipeline, collections, run, userQuery,
                  metadata: Optional[Mapping[str, Any]] = None):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections
            Expressions representing the collections to search for input
            datasets. May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry; should be
            empty or `None` if there are no restrictions on data selection.
        metadata : Optional Mapping of `str` to primitives
            This is an optional parameter of extra data to carry with the
            graph. Entries in this mapping should be able to be serialized in
            JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
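
        Examples
        --------
        A sketch of typical use; the collection name, run name, and query
        string are hypothetical, ``butler`` is an assumed
        `~lsst.daf.butler.Butler`, and ``pipeline`` is a `Pipeline` with its
        task classes already imported:

        >>> builder = GraphBuilder(butler.registry, skipExisting=True)
        >>> qgraph = builder.makeGraph(
        ...     pipeline,
        ...     collections=["HSC/defaults"],
        ...     run="u/someone/test-run",
        ...     userQuery="visit = 903334 AND detector = 22",
        ... )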
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
            raise ValueError("Pipeline requires input datasets but no input collections provided.")
        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        if instrument is not None:
            dataId = DataCoordinate.standardize(instrument=instrument.getName(),
                                                universe=self.registry.dimensions)
        else:
            dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
        with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting,
                                           clobberOutputs=self.clobberOutputs)
        return scaffolding.makeQuantumGraph(metadata=metadata)