Coverage for python/lsst/pipe/base/graphBuilder.py: 17%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ['GraphBuilder']
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32from collections import ChainMap
33from contextlib import contextmanager
34from dataclasses import dataclass
35from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping
36import logging
39# -----------------------------
40# Imports for other modules --
41# -----------------------------
42from .connections import iterConnections, AdjustQuantumHelper
43from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
44from .graph import QuantumGraph
45from lsst.daf.butler import (
46 CollectionSearch,
47 CollectionType,
48 DataCoordinate,
49 DatasetRef,
50 DatasetType,
51 DimensionGraph,
52 DimensionUniverse,
53 NamedKeyDict,
54 Quantum,
55)
56from lsst.utils import doImport
57from ._status import NoWorkFound
58from ._datasetQueryConstraints import DatasetQueryConstraintVariant
60# ----------------------------------
61# Local non-exported definitions --
62# ----------------------------------
64_LOG = logging.getLogger(__name__.partition(".")[2])
67class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
68 """A custom dictionary that maps `DatasetType` to a nested dictionary of
69 the known `DatasetRef` instances of that type.
71 Parameters
72 ----------
73 args
74 Positional arguments are forwarded to the `dict` constructor.
75 universe : `DimensionUniverse`
76 Universe of all possible dimensions.
77 """
78 def __init__(self, *args, universe: DimensionUniverse):
79 super().__init__(*args)
80 self.universe = universe
82 @classmethod
83 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
84 universe: DimensionUniverse) -> _DatasetDict:
85 """Construct a dictionary from a flat iterable of `DatasetType` keys.
87 Parameters
88 ----------
89 datasetTypes : `iterable` of `DatasetType`
90 DatasetTypes to use as keys for the dict. Values will be empty
91 dictionaries.
92 universe : `DimensionUniverse`
93 Universe of all possible dimensions.
95 Returns
96 -------
97 dictionary : `_DatasetDict`
98 A new `_DatasetDict` instance.
99 """
100 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
102 @classmethod
103 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
104 ) -> _DatasetDict:
105 """Return a new dictionary by extracting items corresponding to the
106 given keys from one or more existing dictionaries.
108 Parameters
109 ----------
110 datasetTypes : `iterable` of `DatasetType`
111 DatasetTypes to use as keys for the dict. Values will be obtained
112 by lookups against ``first`` and ``rest``.
113 first : `_DatasetDict`
114 Another dictionary from which to extract values.
115 rest
116 Additional dictionaries from which to extract values.
118 Returns
119 -------
120 dictionary : `_DatasetDict`
121 A new dictionary instance.
122 """
123 combined = ChainMap(first, *rest)
124 return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
125 universe=first.universe)
127 @property
128 def dimensions(self) -> DimensionGraph:
129 """The union of all dimensions used by all dataset types in this
130 dictionary, including implied dependencies (`DimensionGraph`).
131 """
132 base = self.universe.empty
133 if len(self) == 0:
134 return base
135 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
137 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
138 """Unpack nested single-element `DatasetRef` dicts into a new
139 mapping with `DatasetType` keys and `DatasetRef` values.
141 This method assumes that each nested dictionary contains exactly one
142 item, as is the case for all "init" datasets.
144 Returns
145 -------
146 dictionary : `NamedKeyDict`
147 Dictionary mapping `DatasetType` to `DatasetRef`, with both
148 `DatasetType` instances and string names usable as keys.
149 """
150 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
151 ref, = refs.values()
152 return ref
153 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
155 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
156 """Unpack nested multi-element `DatasetRef` dicts into a new
157 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
159 Returns
160 -------
161 dictionary : `NamedKeyDict`
162 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
163 both `DatasetType` instances and string names usable as keys.
164 """
165 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
167 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
168 ) -> Iterator[DatasetRef]:
169 """Iterate over the contained `DatasetRef` instances that match the
170 given `DatasetType` and data IDs.
172 Parameters
173 ----------
174 datasetType : `DatasetType`
175 Dataset type to match.
176 dataIds : `Iterable` [ `DataCoordinate` ]
177 Data IDs to match.
179 Returns
180 -------
181 refs : `Iterator` [ `DatasetRef` ]
182 DatasetRef instances for which ``ref.datasetType == datasetType``
183 and ``ref.dataId`` is in ``dataIds``.
184 """
185 refs = self[datasetType]
186 return (refs[dataId] for dataId in dataIds)
189class _QuantumScaffolding:
190 """Helper class aggregating information about a `Quantum`, used when
191 constructing a `QuantumGraph`.
193 See `_PipelineScaffolding` for a top-down description of the full
194 scaffolding data structure.
196 Parameters
197 ----------
198 task : _TaskScaffolding
199 Back-reference to the helper object for the `PipelineTask` this quantum
200 represents an execution of.
201 dataId : `DataCoordinate`
202 Data ID for this quantum.
203 """
204 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
205 self.task = task
206 self.dataId = dataId
207 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
208 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
209 self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
210 universe=dataId.universe)
212 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
214 def __repr__(self):
215 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
217 task: _TaskScaffolding
218 """Back-reference to the helper object for the `PipelineTask` this quantum
219 represents an execution of.
220 """
222 dataId: DataCoordinate
223 """Data ID for this quantum.
224 """
226 inputs: _DatasetDict
227 """Nested dictionary containing `DatasetRef` inputs to this quantum.
229 This is initialized to map each `DatasetType` to an empty dictionary at
230 construction. Those nested dictionaries are populated (with data IDs as
231 keys) with unresolved `DatasetRef` instances in
232 `_PipelineScaffolding.connectDataIds`.
233 """
235 outputs: _DatasetDict
236 """Nested dictionary containing `DatasetRef` outputs this quantum.
237 """
239 prerequisites: _DatasetDict
240 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
241 quantum.
242 """
244 def makeQuantum(self) -> Quantum:
245 """Transform the scaffolding object into a true `Quantum` instance.
247 Returns
248 -------
249 quantum : `Quantum`
250 An actual `Quantum` instance.
251 """
252 allInputs = self.inputs.unpackMultiRefs()
253 allInputs.update(self.prerequisites.unpackMultiRefs())
254 # Give the task's Connections class an opportunity to remove some
255 # inputs, or complain if they are unacceptable.
256 # This will raise if one of the check conditions is not met, which is
257 # the intended behavior.
258 # If it raises NoWorkFound, there is a bug in the QG algorithm
259 # or adjustQuantum is incorrectly trying to make a prerequisite
260 # input behave like a regular input; adjustQuantum should only raise
261 # NoWorkFound if a regular input is missing, and it shouldn't be
262 # possible for us to have generated ``self`` if that's true.
263 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
264 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
265 return Quantum(
266 taskName=self.task.taskDef.taskName,
267 taskClass=self.task.taskDef.taskClass,
268 dataId=self.dataId,
269 initInputs=self.task.initInputs.unpackSingleRefs(),
270 inputs=helper.inputs,
271 outputs=helper.outputs,
272 )
275@dataclass
276class _TaskScaffolding:
277 """Helper class aggregating information about a `PipelineTask`, used when
278 constructing a `QuantumGraph`.
280 See `_PipelineScaffolding` for a top-down description of the full
281 scaffolding data structure.
283 Parameters
284 ----------
285 taskDef : `TaskDef`
286 Data structure that identifies the task class and its config.
287 parent : `_PipelineScaffolding`
288 The parent data structure that will hold the instance being
289 constructed.
290 datasetTypes : `TaskDatasetTypes`
291 Data structure that categorizes the dataset types used by this task.
292 """
293 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
294 universe = parent.dimensions.universe
295 self.taskDef = taskDef
296 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
297 assert self.dimensions.issubset(parent.dimensions)
298 # Initialize _DatasetDicts as subsets of the one or two
299 # corresponding dicts in the parent _PipelineScaffolding.
300 self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
301 parent.initIntermediates)
302 self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
303 parent.initOutputs)
304 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
305 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
306 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
307 self.dataIds = set()
308 self.quanta = {}
310 def __repr__(self):
311 # Default dataclass-injected __repr__ gets caught in an infinite loop
312 # because of back-references.
313 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
315 taskDef: TaskDef
316 """Data structure that identifies the task class and its config
317 (`TaskDef`).
318 """
320 dimensions: DimensionGraph
321 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
322 """
324 initInputs: _DatasetDict
325 """Dictionary containing information about datasets used to construct this
326 task (`_DatasetDict`).
327 """
329 initOutputs: _DatasetDict
330 """Dictionary containing information about datasets produced as a
331 side-effect of constructing this task (`_DatasetDict`).
332 """
334 inputs: _DatasetDict
335 """Dictionary containing information about datasets used as regular,
336 graph-constraining inputs to this task (`_DatasetDict`).
337 """
339 outputs: _DatasetDict
340 """Dictionary containing information about datasets produced by this task
341 (`_DatasetDict`).
342 """
344 prerequisites: _DatasetDict
345 """Dictionary containing information about input datasets that must be
346 present in the repository before any Pipeline containing this task is run
347 (`_DatasetDict`).
348 """
350 quanta: Dict[DataCoordinate, _QuantumScaffolding]
351 """Dictionary mapping data ID to a scaffolding object for the Quantum of
352 this task with that data ID.
353 """
355 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]:
356 """Create a `set` of `Quantum` from the information in ``self``.
358 Returns
359 -------
360 nodes : `set` of `Quantum`
361 The `Quantum` elements corresponding to this task.
362 """
363 if unresolvedRefs is None:
364 unresolvedRefs = set()
365 outputs = set()
366 for q in self.quanta.values():
367 try:
368 tmpQuanta = q.makeQuantum()
369 outputs.add(tmpQuanta)
370 except (NoWorkFound, FileNotFoundError) as exc:
371 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
372 if unresolvedRefs.intersection(refs):
373 # This means this node is known to be pruned later and
374 # should be left in even though some follow-up queries
375 # failed. This allows the pruning to start from this
376 # quantum with known issues, and prune other nodes it
377 # touches.
378 inputs = q.inputs.unpackMultiRefs()
379 inputs.update(q.prerequisites.unpackMultiRefs())
380 tmpQuantum = Quantum(taskName=q.task.taskDef.taskName,
381 taskClass=q.task.taskDef.taskClass,
382 dataId=q.dataId,
383 initInputs=q.task.initInputs.unpackSingleRefs(),
384 inputs=inputs,
385 outputs=q.outputs.unpackMultiRefs(),)
386 outputs.add(tmpQuantum)
387 else:
388 raise exc
389 return outputs
392@dataclass
393class _PipelineScaffolding:
394 """A helper data structure that organizes the information involved in
395 constructing a `QuantumGraph` for a `Pipeline`.
397 Parameters
398 ----------
399 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
400 Sequence of tasks from which a graph is to be constructed. Must
401 have nested task classes already imported.
402 universe : `DimensionUniverse`
403 Universe of all possible dimensions.
405 Notes
406 -----
407 The scaffolding data structure contains nested data structures for both
408 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
409 data structures are shared between the pipeline-level structure (which
410 aggregates all datasets and categorizes them from the perspective of the
411 complete pipeline) and the individual tasks that use them as inputs and
412 outputs.
414 `QuantumGraph` construction proceeds in four steps, with each corresponding
415 to a different `_PipelineScaffolding` method:
417 1. When `_PipelineScaffolding` is constructed, we extract and categorize
418 the DatasetTypes used by the pipeline (delegating to
419 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
420 nested `_TaskScaffolding` and `_DatasetDict` objects.
422 2. In `connectDataIds`, we construct and run the "Big Join Query", which
423 returns related tuples of all dimensions used to identify any regular
424 input, output, and intermediate datasets (not prerequisites). We then
425 iterate over these tuples of related dimensions, identifying the subsets
426 that correspond to distinct data IDs for each task and dataset type,
427 and then create `_QuantumScaffolding` objects.
429 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
430 dataset data IDs previously identified, transforming unresolved
431 DatasetRefs into resolved DatasetRefs where appropriate. We then look
432 up prerequisite datasets for all quanta.
434 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
435 per-task `_QuantumScaffolding` objects.
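
 The overall flow, as driven by `GraphBuilder.makeGraph`, is roughly the
 following (an illustrative sketch only; argument handling and error
 checking are simplified)::

     scaffolding = _PipelineScaffolding(pipeline, registry=registry)
     with scaffolding.connectDataIds(registry, collections, userQuery,
                                     externalDataId) as commonDataIds:
         scaffolding.resolveDatasetRefs(registry, collections, run,
                                        commonDataIds)
         graph = scaffolding.makeQuantumGraph()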
436 """
437 def __init__(self, pipeline, *, registry):
438 _LOG.debug("Initializing data structures for QuantumGraph generation.")
439 self.tasks = []
440 # Aggregate and categorize the DatasetTypes in the Pipeline.
441 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
442 # Construct dictionaries that map those DatasetTypes to structures
443 # that will (later) hold additional information about them.
444 for attr in ("initInputs", "initIntermediates", "initOutputs",
445 "inputs", "intermediates", "outputs", "prerequisites"):
446 setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
447 universe=registry.dimensions))
448 # Aggregate all dimensions for all non-init, non-prerequisite
449 # DatasetTypes. These are the ones we'll include in the big join
450 # query.
451 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
452 self.outputs.dimensions)
453 # Construct scaffolding nodes for each Task, and add backreferences
454 # to the Task from each DatasetScaffolding node.
455 # Note that there's only one scaffolding node for each DatasetType,
456 # shared by _PipelineScaffolding and all _TaskScaffoldings that
457 # reference it.
458 if isinstance(pipeline, Pipeline):
459 pipeline = pipeline.toExpandedPipeline()
460 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
461 for taskDef, taskDatasetTypes in zip(pipeline,
462 datasetTypes.byTask.values())]
464 def __repr__(self):
465 # Default dataclass-injected __repr__ gets caught in an infinite loop
466 # because of back-references.
467 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
469 tasks: List[_TaskScaffolding]
470 """Scaffolding data structures for each task in the pipeline
471 (`list` of `_TaskScaffolding`).
472 """
474 initInputs: _DatasetDict
475 """Datasets consumed but not produced when constructing the tasks in this
476 pipeline (`_DatasetDict`).
477 """
479 initIntermediates: _DatasetDict
480 """Datasets that are both consumed and produced when constructing the tasks
481 in this pipeline (`_DatasetDict`).
482 """
484 initOutputs: _DatasetDict
485 """Datasets produced but not consumed when constructing the tasks in this
486 pipeline (`_DatasetDict`).
487 """
489 inputs: _DatasetDict
490 """Datasets that are consumed but not produced when running this pipeline
491 (`_DatasetDict`).
492 """
494 intermediates: _DatasetDict
495 """Datasets that are both produced and consumed when running this pipeline
496 (`_DatasetDict`).
497 """
499 outputs: _DatasetDict
500 """Datasets produced but not consumed when when running this pipeline
501 (`_DatasetDict`).
502 """
504 prerequisites: _DatasetDict
505 """Datasets that are consumed when running this pipeline and looked up
506 per-Quantum when generating the graph (`_DatasetDict`).
507 """
509 dimensions: DimensionGraph
510 """All dimensions used by any regular input, intermediate, or output
511 (not prerequisite) dataset; the set of dimensions used in the "Big Join
512 Query" (`DimensionGraph`).
514 This is required to be a superset of all task quantum dimensions.
515 """
517 @contextmanager
518 def connectDataIds(self, registry, collections, userQuery, externalDataId,
519 datasetQueryConstraint: DatasetQueryConstraintVariant =
520 DatasetQueryConstraintVariant.ALL):
521 """Query for the data IDs that connect nodes in the `QuantumGraph`.
523 This method populates `_TaskScaffolding.quanta` and the nested data ID
524 keys of each `_DatasetDict` (except for those in `prerequisites`).
526 Parameters
527 ----------
528 registry : `lsst.daf.butler.Registry`
529 Registry for the data repository; used for all data ID queries.
530 collections
531 Expressions representing the collections to search for input
532 datasets. May be any of the types accepted by
533 `lsst.daf.butler.CollectionSearch.fromExpression`.
534 userQuery : `str` or `None`
535 User-provided expression to limit the data IDs processed.
536 externalDataId : `DataCoordinate`
537 Externally-provided data ID that should be used to restrict the
538 results, just as if these constraints had been included via ``AND``
539 in ``userQuery``. This includes (at least) any instrument named
540 in the pipeline definition.
541 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
542 The query constraint variant that should be used to constrain the
543 query based on dataset existence, defaults to
544 `DatasetQueryConstraintVariant.ALL`.
546 Returns
547 -------
548 commonDataIds : \
549 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
550 An interface to a database temporary table containing all data IDs
551 that will appear in this `QuantumGraph`. Returned inside a
552 context manager, which will drop the temporary table at the end of
553 the `with` block in which this method is called.
554 """
555 _LOG.debug("Building query for data IDs.")
556 # Initialization datasets always have empty data IDs.
557 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
558 for datasetType, refs in itertools.chain(self.initInputs.items(),
559 self.initIntermediates.items(),
560 self.initOutputs.items()):
561 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
562 # Run one big query for the data IDs for task dimensions and regular
563 # inputs and outputs. We limit the query to only dimensions that are
564 # associated with the input dataset types, but don't (yet) try to
565 # obtain the dataset_ids for those inputs.
566 _LOG.debug("Submitting data ID query and materializing results.")
567 queryArgs = {'dimensions': self.dimensions, 'where': userQuery, 'dataId': externalDataId}
568 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
569 _LOG.debug("Constraining graph query using all datasets in pipeline.")
570 queryArgs['datasets'] = list(self.inputs)
571 queryArgs['collections'] = collections
572 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
573 _LOG.debug("Not using dataset existence to constrain query.")
574 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
575 constraint = set(datasetQueryConstraint)
576 inputs = {k.name: k for k in self.inputs.keys()}
577 if (remainder := constraint.difference(inputs.keys())):
578 raise ValueError(f"{remainder} dataset type(s) specified as a graph constraint, but"
579 f" do not appear as an input to the specified pipeline: {inputs.keys()}")
580 _LOG.debug(f"Constraining graph query using {constraint}")
581 queryArgs['datasets'] = [typ for name, typ in inputs.items() if name in constraint]
582 queryArgs['collections'] = collections
583 else:
584 raise ValueError(f"Unable to handle type {datasetQueryConstraint} given as "
585 "datasetQueryConstraint.")
587 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
588 _LOG.debug("Expanding data IDs.")
589 commonDataIds = commonDataIds.expanded()
590 _LOG.debug("Iterating over query results to associate quanta with datasets.")
591 # Iterate over query results, populating data IDs for datasets and
592 # quanta and then connecting them to each other.
593 n = 0
594 for n, commonDataId in enumerate(commonDataIds):
595 # Create DatasetRefs for all DatasetTypes from this result row,
596 # noting that we might have created some already.
597 # We remember both those that already existed and those that we
598 # create now.
599 refsForRow = {}
600 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
601 for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
602 self.outputs.items()):
603 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)):
604 datasetDataId = commonDataId.subset(datasetType.dimensions)
605 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
606 ref = refs.get(datasetDataId)
607 if ref is None:
608 ref = DatasetRef(datasetType, datasetDataId)
609 refs[datasetDataId] = ref
610 refsForRow[datasetType.name] = ref
611 # Create _QuantumScaffolding objects for all tasks from this
612 # result row, noting that we might have created some already.
613 for task in self.tasks:
614 quantumDataId = commonDataId.subset(task.dimensions)
615 quantum = task.quanta.get(quantumDataId)
616 if quantum is None:
617 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
618 task.quanta[quantumDataId] = quantum
619 # Whether this is a new quantum or an existing one, we can
620 # now associate the DatasetRefs for this row with it. The
621 # fact that a Quantum data ID and a dataset data ID both
622 # came from the same result row is what tells us they
623 # should be associated.
624 # Many of these associations will be duplicates (because
625 # another query row that differed from this one only in
626 # irrelevant dimensions already added them), and we use
627 # sets to skip.
628 for datasetType in task.inputs:
629 ref = refsForRow[datasetType.name]
630 quantum.inputs[datasetType.name][ref.dataId] = ref
631 for datasetType in task.outputs:
632 ref = refsForRow[datasetType.name]
633 quantum.outputs[datasetType.name][ref.dataId] = ref
634 if n == 0:
635 for message in commonDataIds.explain_no_results():
636 _LOG.warning(message)
637 _LOG.debug("Finished processing %d rows from data ID query.", n)
638 yield commonDataIds
640 def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None,
641 clobberOutputs=True, constrainedByAllDatasets: bool = True):
642 """Perform follow up queries for each dataset data ID produced in
643 `fillDataIds`.
645 This method resolves the `DatasetRef` values in each `_DatasetDict`
646 (except for those in `prerequisites`).
648 Parameters
649 ----------
650 registry : `lsst.daf.butler.Registry`
651 Registry for the data repository; used for all data ID queries.
652 collections
653 Expressions representing the collections to search for input
654 datasets. May be any of the types accepted by
655 `lsst.daf.butler.CollectionSearch.fromExpression`.
656 run : `str`, optional
657 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
658 output datasets, if it already exists.
659 commonDataIds : \
660 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
661 Result of a previous call to `connectDataIds`.
662 skipExistingIn
663 Expressions representing the collections to search for existing
664 output datasets that should be skipped. May be any of the types
665 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
666 `None` or empty string/sequence disables skipping.
667 clobberOutputs : `bool`, optional
668 If `True` (default), allow quanta to be created even if outputs exist;
669 this requires the same behavior to be enabled when
670 executing. If ``skipExistingIn`` is not `None`, completed quanta
671 (those with metadata, or all outputs if there is no metadata
672 dataset configured) will be skipped rather than clobbered.
673 constrainedByAllDatasets : `bool`, optional
674 Indicates if the commonDataIds were generated with a constraint on
675 all dataset types.
677 Raises
678 ------
679 OutputExistsError
680 Raised if an output dataset already exists in the output run
681 and ``skipExistingIn`` does not include output run, or if only
682 some outputs are present and ``clobberOutputs`` is `False`.
683 """
684 skipCollections: Optional[CollectionSearch] = None
685 skipExistingInRun = False
686 if skipExistingIn:
687 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
688 if run:
689 # As an optimization, check the explicit list of names first.
690 skipExistingInRun = run in skipCollections.explicitNames()
691 if not skipExistingInRun:
692 # need to flatten it and check again
693 skipExistingInRun = run in registry.queryCollections(
694 skipExistingIn,
695 collectionTypes=CollectionType.RUN,
696 )
698 # Look up [init] intermediate and output datasets in the output
699 # collection, if there is an output collection.
700 if run is not None or skipCollections is not None:
701 for datasetType, refs in itertools.chain(self.initIntermediates.items(),
702 self.initOutputs.items(),
703 self.intermediates.items(),
704 self.outputs.items()):
705 _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
706 len(refs), datasetType.name)
707 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
708 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
710 # look at RUN collection first
711 if run is not None:
712 resolvedRefQueryResults = subset.findDatasets(
713 datasetType,
714 collections=run,
715 findFirst=True
716 )
717 for resolvedRef in resolvedRefQueryResults:
718 # TODO: we could easily support per-DatasetType
719 # skipExisting and I could imagine that being useful -
720 # it's probably required in order to support writing
721 # initOutputs before QuantumGraph generation.
722 assert resolvedRef.dataId in refs
723 if not (skipExistingInRun or isInit or clobberOutputs):
724 raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
725 f"output RUN collection '{run}' with data ID"
726 f" {resolvedRef.dataId}.")
728 # And check skipExistingIn too; if the RUN collection is in
729 # it, that case was already handled above.
730 if skipCollections is not None:
731 resolvedRefQueryResults = subset.findDatasets(
732 datasetType,
733 collections=skipCollections,
734 findFirst=True
735 )
736 for resolvedRef in resolvedRefQueryResults:
737 assert resolvedRef.dataId in refs
738 refs[resolvedRef.dataId] = resolvedRef
740 # Look up input and initInput datasets in the input collection(s).
741 # Container to accumulate unfound refs, if the common data IDs were not
742 # constrained on dataset type existence.
743 self.unfoundRefs = set()
744 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
745 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
746 resolvedRefQueryResults = commonDataIds.subset(
747 datasetType.dimensions,
748 unique=True
749 ).findDatasets(
750 datasetType,
751 collections=collections,
752 findFirst=True
753 )
754 dataIdsNotFoundYet = set(refs.keys())
755 for resolvedRef in resolvedRefQueryResults:
756 dataIdsNotFoundYet.discard(resolvedRef.dataId)
757 refs[resolvedRef.dataId] = resolvedRef
758 if dataIdsNotFoundYet:
759 if constrainedByAllDatasets:
760 raise RuntimeError(
761 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
762 f"'{datasetType.name}' was/were present in a previous "
763 f"query, but could not be found now."
764 f"This is either a logic bug in QuantumGraph generation "
765 f"or the input collections have been modified since "
766 f"QuantumGraph generation began."
767 )
768 else:
769 # If the common data IDs were not constrained using all the
770 # input dataset types, it is possible that some data IDs
771 # found do not correspond to existing datasets, and they
772 # will remain unresolved. Mark these for later pruning from
773 # the quantum graph.
774 for k in dataIdsNotFoundYet:
775 self.unfoundRefs.add(refs[k])
777 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
778 # replacing the unresolved refs there, and then look up prerequisites.
779 for task in self.tasks:
780 _LOG.debug(
781 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
782 len(task.quanta),
783 task.taskDef.label
784 )
785 lookupFunctions = {
786 c.name: c.lookupFunction
787 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
788 if c.lookupFunction is not None
789 }
790 dataIdsFailed = []
791 dataIdsSucceeded = []
792 for quantum in task.quanta.values():
793 # Process output datasets only if skipExistingIn is not None
794 # or there is a run to look for outputs in and clobberOutputs
795 # is True. Note that if skipExistingIn is None, any output
796 # datasets that already exist would have already caused an
797 # exception to be raised. We never update the DatasetRefs in
798 # the quantum because those should never be resolved.
799 if skipCollections is not None or (run is not None and clobberOutputs):
800 resolvedRefs = []
801 unresolvedRefs = []
802 haveMetadata = False
803 for datasetType, originalRefs in quantum.outputs.items():
804 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
805 if ref.id is not None:
806 resolvedRefs.append(ref)
807 if datasetType.name == task.taskDef.metadataDatasetName:
808 haveMetadata = True
809 else:
810 unresolvedRefs.append(ref)
811 if resolvedRefs:
812 if haveMetadata or not unresolvedRefs:
813 dataIdsSucceeded.append(quantum.dataId)
814 if skipCollections is not None:
815 continue
816 else:
817 dataIdsFailed.append(quantum.dataId)
818 if not clobberOutputs:
819 raise OutputExistsError(
820 f"Quantum {quantum.dataId} of task with label "
821 f"'{quantum.task.taskDef.label}' has some outputs that exist "
822 f"({resolvedRefs}) "
823 f"and others that don't ({unresolvedRefs}), with no metadata output, "
824 "and clobbering outputs was not enabled."
825 )
826 # Update the input DatasetRefs to the resolved ones we already
827 # searched for.
828 for datasetType, refs in quantum.inputs.items():
829 for ref in task.inputs.extract(datasetType, refs.keys()):
830 refs[ref.dataId] = ref
831 # Look up prerequisite datasets in the input collection(s).
832 # These may have dimensions that extend beyond those we queried
833 # for originally, because we want to permit those data ID
834 # values to differ across quanta and dataset types.
835 for datasetType in task.prerequisites:
836 lookupFunction = lookupFunctions.get(datasetType.name)
837 if lookupFunction is not None:
838 # PipelineTask has provided its own function to do the
839 # lookup. This always takes precedence.
840 refs = list(
841 lookupFunction(datasetType, registry, quantum.dataId, collections)
842 )
843 elif (datasetType.isCalibration()
844 and datasetType.dimensions <= quantum.dataId.graph
845 and quantum.dataId.graph.temporal):
846 # This is a master calibration lookup, which we have to
847 # handle specially because the query system can't do a
848 # temporal join on a non-dimension-based timespan yet.
849 timespan = quantum.dataId.timespan
850 try:
851 refs = [registry.findDataset(datasetType, quantum.dataId,
852 collections=collections,
853 timespan=timespan)]
854 except KeyError:
855 # This dataset type is not present in the registry,
856 # which just means there are no datasets here.
857 refs = []
858 else:
859 # Most general case.
860 refs = list(registry.queryDatasets(datasetType,
861 collections=collections,
862 dataId=quantum.dataId,
863 findFirst=True).expanded())
864 quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
865 if ref is not None})
866 # Actually remove any quanta that we decided to skip above.
867 if dataIdsSucceeded:
868 if skipCollections is not None:
869 _LOG.debug("Pruning successful %d quanta for task with label '%s' because all of their "
870 "outputs exist or metadata was written successfully.",
871 len(dataIdsSucceeded), task.taskDef.label)
872 for dataId in dataIdsSucceeded:
873 del task.quanta[dataId]
874 elif clobberOutputs:
875 _LOG.info("Found %d successful quanta for task with label '%s' "
876 "that will need to be clobbered during execution.",
877 len(dataIdsSucceeded),
878 task.taskDef.label)
879 else:
880 raise AssertionError("OutputExistsError should have already been raised.")
881 if dataIdsFailed:
882 if clobberOutputs:
883 _LOG.info("Found %d failed/incomplete quanta for task with label '%s' "
884 "that will need to be clobbered during execution.",
885 len(dataIdsFailed),
886 task.taskDef.label)
887 else:
888 raise AssertionError("OutputExistsError should have already been raised.")
890 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
891 """Create a `QuantumGraph` from the quanta already present in
892 the scaffolding data structure.
894 Parameters
895 ----------
896 metadata : Optional Mapping of `str` to primitives
897 This is an optional parameter of extra data to carry with the
898 graph. Entries in this mapping should be able to be serialized in
899 JSON.
901 Returns
902 -------
903 graph : `QuantumGraph`
904 The full `QuantumGraph`.
905 """
906 graphInput: Dict[TaskDef, Set[Quantum]] = {}
907 for task in self.tasks:
908 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs)
909 graphInput[task.taskDef] = qset
911 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs)
912 return graph
915# ------------------------
916# Exported definitions --
917# ------------------------
920class GraphBuilderError(Exception):
921 """Base class for exceptions generated by graph builder.
922 """
923 pass
926class OutputExistsError(GraphBuilderError):
927 """Exception generated when output datasets already exist.
928 """
929 pass
932class PrerequisiteMissingError(GraphBuilderError):
933 """Exception generated when a prerequisite dataset does not exist.
934 """
935 pass
938class GraphBuilder(object):
939 """GraphBuilder class is responsible for building task execution graph from
940 a Pipeline.
942 Parameters
943 ----------
944 registry : `~lsst.daf.butler.Registry`
945 Registry for the data repository; used for all data ID queries.
946 skipExistingIn
947 Expressions representing the collections to search for existing
948 output datasets that should be skipped. May be any of the types
949 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
950 clobberOutputs : `bool`, optional
951 If `True` (default), allow quanta to be created even if partial outputs
952 exist; this requires the same behavior to be enabled when
953 executing.
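
 Notes
 -----
 A typical invocation looks roughly like the following (an illustrative
 sketch; the repository path, collection names, run name, and query
 string are placeholders)::

     from lsst.daf.butler import Butler

     butler = Butler("/path/to/repo")
     builder = GraphBuilder(butler.registry)
     qgraph = builder.makeGraph(
         pipeline,
         collections=["HSC/defaults"],
         run="u/someone/example",
         userQuery="instrument = 'HSC' AND visit = 12345",
     )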
954 """
956 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
957 self.registry = registry
958 self.dimensions = registry.dimensions
959 self.skipExistingIn = skipExistingIn
960 self.clobberOutputs = clobberOutputs
962 def makeGraph(self, pipeline, collections, run, userQuery,
963 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
964 metadata: Optional[Mapping[str, Any]] = None):
965 """Create execution graph for a pipeline.
967 Parameters
968 ----------
969 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
970 Pipeline definition, task names/classes and their configs.
971 collections
972 Expressions representing the collections to search for input
973 datasets. May be any of the types accepted by
974 `lsst.daf.butler.CollectionSearch.fromExpression`.
975 run : `str`, optional
976 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
977 output datasets, if it already exists.
978 userQuery : `str`
979 String which defines a user-provided selection for the registry; should
980 be empty or `None` if there are no restrictions on data selection.
981 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
982 The query constraint variant that should be used to constrain the
983 query based on dataset existence, defaults to
984 `DatasetQueryConstraintVariant.ALL`.
985 metadata : Optional Mapping of `str` to primitives
986 This is an optional parameter of extra data to carry with the
987 graph. Entries in this mapping should be able to be serialized in
988 JSON.
990 Returns
991 -------
992 graph : `QuantumGraph`
994 Raises
995 ------
996 UserExpressionError
997 Raised when user expression cannot be parsed.
998 OutputExistsError
999 Raised when output datasets already exist.
1000 Exception
1001 Other exceptions types may be raised by underlying registry
1002 classes.
1003 """
1004 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1005 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1006 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1007 instrument = None
1008 if isinstance(pipeline, Pipeline):
1009 instrument = pipeline.getInstrument()
1010 if isinstance(instrument, str):
1011 instrument = doImport(instrument)
1012 pipeline = list(pipeline.toExpandedPipeline())
1013 if instrument is not None:
1014 dataId = DataCoordinate.standardize(instrument=instrument.getName(),
1015 universe=self.registry.dimensions)
1016 else:
1017 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1018 with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId,
1019 datasetQueryConstraint) as commonDataIds:
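# Missing inputs are only treated as a hard error when the initial
# data ID query was constrained by every input dataset type;
# otherwise the affected quanta are marked for later pruning.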
1020 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1021 scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
1022 skipExistingIn=self.skipExistingIn,
1023 clobberOutputs=self.clobberOutputs,
1024 constrainedByAllDatasets=condition)
1025 return scaffolding.makeQuantumGraph(metadata=metadata)