Coverage for python/lsst/pipe/base/graphBuilder.py: 16%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ['GraphBuilder']
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32from collections import ChainMap
33from contextlib import contextmanager
34from dataclasses import dataclass
35from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping
36import logging
39# -----------------------------
40# Imports for other modules --
41# -----------------------------
42from .connections import iterConnections, AdjustQuantumHelper
43from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
44from .graph import QuantumGraph
45from lsst.daf.butler import (
46 CollectionSearch,
47 CollectionType,
48 DataCoordinate,
49 DatasetRef,
50 DatasetType,
51 DimensionGraph,
52 DimensionUniverse,
53 NamedKeyDict,
54 Quantum,
55)
56from lsst.utils import doImport
57from ._status import NoWorkFound
58from ._datasetQueryConstraints import DatasetQueryConstraintVariant
60# ----------------------------------
61# Local non-exported definitions --
62# ----------------------------------
64_LOG = logging.getLogger(__name__)
67class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
68 """A custom dictionary that maps `DatasetType` to a nested dictionary of
69 the known `DatasetRef` instances of that type.
71 Parameters
72 ----------
73 args
74 Positional arguments are forwarded to the `dict` constructor.
75 universe : `DimensionUniverse`
76 Universe of all possible dimensions.
77 """
78 def __init__(self, *args, universe: DimensionUniverse):
79 super().__init__(*args)
80 self.universe = universe
82 @classmethod
83 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
84 universe: DimensionUniverse) -> _DatasetDict:
85 """Construct a dictionary from a flat iterable of `DatasetType` keys.
87 Parameters
88 ----------
89 datasetTypes : `iterable` of `DatasetType`
90 DatasetTypes to use as keys for the dict. Values will be empty
91 dictionaries.
92 universe : `DimensionUniverse`
93 Universe of all possible dimensions.
95 Returns
96 -------
97 dictionary : `_DatasetDict`
98 A new `_DatasetDict` instance.
99 """
100 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
102 @classmethod
103 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
104 ) -> _DatasetDict:
105 """Return a new dictionary by extracting items corresponding to the
106 given keys from one or more existing dictionaries.
108 Parameters
109 ----------
110 datasetTypes : `iterable` of `DatasetType`
111 DatasetTypes to use as keys for the dict. Values will be obtained
112 by lookups against ``first`` and ``rest``.
113 first : `_DatasetDict`
114 Another dictionary from which to extract values.
115 rest
116 Additional dictionaries from which to extract values.
118 Returns
119 -------
120 dictionary : `_DatasetDict`
121 A new dictionary instance.
122 """
123 combined = ChainMap(first, *rest)
124 return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
125 universe=first.universe)
127 @property
128 def dimensions(self) -> DimensionGraph:
129 """The union of all dimensions used by all dataset types in this
130 dictionary, including implied dependencies (`DimensionGraph`).
131 """
132 base = self.universe.empty
133 if len(self) == 0:
134 return base
135 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
137 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
138 """Unpack nested single-element `DatasetRef` dicts into a new
139 mapping with `DatasetType` keys and `DatasetRef` values.
141 This method assumes that each nested dictionary contains exactly one
142 item, as is the case for all "init" datasets.
144 Returns
145 -------
146 dictionary : `NamedKeyDict`
147 Dictionary mapping `DatasetType` to `DatasetRef`, with both
148 `DatasetType` instances and string names usable as keys.
149 """
150 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
151 ref, = refs.values()
152 return ref
153 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
155 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
156 """Unpack nested multi-element `DatasetRef` dicts into a new
157 mapping with `DatasetType` keys and `list` of `DatasetRef` values.
159 Returns
160 -------
161 dictionary : `NamedKeyDict`
162 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
163 both `DatasetType` instances and string names usable as keys.
164 """
165 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
167 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
168 ) -> Iterator[DatasetRef]:
169 """Iterate over the contained `DatasetRef` instances that match the
170 given `DatasetType` and data IDs.
172 Parameters
173 ----------
174 datasetType : `DatasetType`
175 Dataset type to match.
176 dataIds : `Iterable` [ `DataCoordinate` ]
177 Data IDs to match.
179 Returns
180 -------
181 refs : `Iterator` [ `DatasetRef` ]
182 DatasetRef instances for which ``ref.datasetType == datasetType``
183 and ``ref.dataId`` is in ``dataIds``.
184 """
185 refs = self[datasetType]
186 return (refs[dataId] for dataId in dataIds)
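# Illustrative sketch, not part of the module: how a _DatasetDict is typically
# populated and read back during graph generation.  ``calexpType``, ``dataId``
# and ``registry`` are hypothetical stand-ins for objects obtained from a real
# data repository.
#
#     refs = _DatasetDict.fromDatasetTypes([calexpType],
#                                          universe=registry.dimensions)
#     refs[calexpType][dataId] = DatasetRef(calexpType, dataId)
#     perType = refs.unpackMultiRefs()         # DatasetType -> [DatasetRef]
#     matching = list(refs.extract(calexpType, [dataId]))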
189class _QuantumScaffolding:
190 """Helper class aggregating information about a `Quantum`, used when
191 constructing a `QuantumGraph`.
193 See `_PipelineScaffolding` for a top-down description of the full
194 scaffolding data structure.
196 Parameters
197 ----------
198 task : _TaskScaffolding
199 Back-reference to the helper object for the `PipelineTask` this quantum
200 represents an execution of.
201 dataId : `DataCoordinate`
202 Data ID for this quantum.
203 """
204 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
205 self.task = task
206 self.dataId = dataId
207 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
208 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
209 self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
210 universe=dataId.universe)
212 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
214 def __repr__(self):
215 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
217 task: _TaskScaffolding
218 """Back-reference to the helper object for the `PipelineTask` this quantum
219 represents an execution of.
220 """
222 dataId: DataCoordinate
223 """Data ID for this quantum.
224 """
226 inputs: _DatasetDict
227 """Nested dictionary containing `DatasetRef` inputs to this quantum.
229 This is initialized to map each `DatasetType` to an empty dictionary at
230 construction. Those nested dictionaries are populated (with data IDs as
231 keys) with unresolved `DatasetRef` instances in
232 `_PipelineScaffolding.connectDataIds`.
233 """
235 outputs: _DatasetDict
236 """Nested dictionary containing `DatasetRef` outputs of this quantum.
237 """
239 prerequisites: _DatasetDict
240 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
241 quantum.
242 """
244 def makeQuantum(self) -> Quantum:
245 """Transform the scaffolding object into a true `Quantum` instance.
247 Returns
248 -------
249 quantum : `Quantum`
250 An actual `Quantum` instance.
251 """
252 allInputs = self.inputs.unpackMultiRefs()
253 allInputs.update(self.prerequisites.unpackMultiRefs())
254 # Give the task's Connections class an opportunity to remove some
255 # inputs, or complain if they are unacceptable.
256 # This will raise if one of the check conditions is not met, which is
257 # the intended behavior.
258 # If it raises NoWorkFound, there is a bug in the QG algorithm
259 # or adjustQuantum is incorrectly trying to make a prerequisite
260 # input behave like a regular input; adjustQuantum should only raise
261 # NoWorkFound if a regular input is missing, and it shouldn't be
262 # possible for us to have generated ``self`` if that's true.
263 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
264 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
265 return Quantum(
266 taskName=self.task.taskDef.taskName,
267 taskClass=self.task.taskDef.taskClass,
268 dataId=self.dataId,
269 initInputs=self.task.initInputs.unpackSingleRefs(),
270 inputs=helper.inputs,
271 outputs=helper.outputs,
272 )
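# Illustrative sketch, not part of the module: the nested shape a populated
# _QuantumScaffolding presents to makeQuantum, assuming ``q`` is an instance
# filled in by _PipelineScaffolding.connectDataIds and resolveDatasetRefs.
#
#     q.inputs          # {DatasetType: {DataCoordinate: DatasetRef}}
#     q.outputs         # same nesting, holding this quantum's outputs
#     q.prerequisites   # populated by resolveDatasetRefs
#     quantum = q.makeQuantum()   # may raise NoWorkFound via the task's
#                                 # adjustQuantum hook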
275@dataclass
276class _TaskScaffolding:
277 """Helper class aggregating information about a `PipelineTask`, used when
278 constructing a `QuantumGraph`.
280 See `_PipelineScaffolding` for a top-down description of the full
281 scaffolding data structure.
283 Parameters
284 ----------
285 taskDef : `TaskDef`
286 Data structure that identifies the task class and its config.
287 parent : `_PipelineScaffolding`
288 The parent data structure that will hold the instance being
289 constructed.
290 datasetTypes : `TaskDatasetTypes`
291 Data structure that categorizes the dataset types used by this task.
292 """
293 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
294 universe = parent.dimensions.universe
295 self.taskDef = taskDef
296 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
297 assert self.dimensions.issubset(parent.dimensions)
298 # Initialize _DatasetDicts as subsets of the one or two
299 # corresponding dicts in the parent _PipelineScaffolding.
300 self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
301 parent.initIntermediates)
302 self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
303 parent.initOutputs)
304 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
305 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
306 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
307 self.dataIds = set()
308 self.quanta = {}
310 def __repr__(self):
311 # Default dataclass-injected __repr__ gets caught in an infinite loop
312 # because of back-references.
313 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
315 taskDef: TaskDef
316 """Data structure that identifies the task class and its config
317 (`TaskDef`).
318 """
320 dimensions: DimensionGraph
321 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
322 """
324 initInputs: _DatasetDict
325 """Dictionary containing information about datasets used to construct this
326 task (`_DatasetDict`).
327 """
329 initOutputs: _DatasetDict
330 """Dictionary containing information about datasets produced as a
331 side-effect of constructing this task (`_DatasetDict`).
332 """
334 inputs: _DatasetDict
335 """Dictionary containing information about datasets used as regular,
336 graph-constraining inputs to this task (`_DatasetDict`).
337 """
339 outputs: _DatasetDict
340 """Dictionary containing information about datasets produced by this task
341 (`_DatasetDict`).
342 """
344 prerequisites: _DatasetDict
345 """Dictionary containing information about input datasets that must be
346 present in the repository before any Pipeline containing this task is run
347 (`_DatasetDict`).
348 """
350 quanta: Dict[DataCoordinate, _QuantumScaffolding]
351 """Dictionary mapping data ID to a scaffolding object for the Quantum of
352 this task with that data ID.
353 """
355 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]:
356 """Create a `set` of `Quantum` from the information in ``self``.
358 Returns
359 -------
360 nodes : `set` of `Quantum`
361 The `Quantum` elements corresponding to this task.
362 """
363 if unresolvedRefs is None:
364 unresolvedRefs = set()
365 outputs = set()
366 for q in self.quanta.values():
367 try:
368 tmpQuanta = q.makeQuantum()
369 outputs.add(tmpQuanta)
370 except (NoWorkFound, FileNotFoundError) as exc:
371 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
372 if unresolvedRefs.intersection(refs):
373 # This means it is a node that is known to be pruned
374 # later and should be left in even though some follow-up
375 # queries fail. This allows the pruning to start from this
376 # quantum with known issues, and to prune other nodes it
377 # touches.
378 inputs = q.inputs.unpackMultiRefs()
379 inputs.update(q.prerequisites.unpackMultiRefs())
380 tmpQuantum = Quantum(taskName=q.task.taskDef.taskName,
381 taskClass=q.task.taskDef.taskClass,
382 dataId=q.dataId,
383 initInputs=q.task.initInputs.unpackSingleRefs(),
384 inputs=inputs,
385 outputs=q.outputs.unpackMultiRefs(),)
386 outputs.add(tmpQuantum)
387 else:
388 raise exc
389 return outputs
392@dataclass
393class _PipelineScaffolding:
394 """A helper data structure that organizes the information involved in
395 constructing a `QuantumGraph` for a `Pipeline`.
397 Parameters
398 ----------
399 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
400 Sequence of tasks from which a graph is to be constructed. Must
401 have nested task classes already imported.
402 registry : `lsst.daf.butler.Registry`
403 Registry for the data repository, used to categorize dataset types.
405 Notes
406 -----
407 The scaffolding data structure contains nested data structures for both
408 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
409 data structures are shared between the pipeline-level structure (which
410 aggregates all datasets and categorizes them from the perspective of the
411 complete pipeline) and the individual tasks that use them as inputs and
412 outputs.
414 `QuantumGraph` construction proceeds in four steps, with each corresponding
415 to a different `_PipelineScaffolding` method:
417 1. When `_PipelineScaffolding` is constructed, we extract and categorize
418 the DatasetTypes used by the pipeline (delegating to
419 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
420 nested `_TaskScaffolding` and `_DatasetDict` objects.
422 2. In `connectDataIds`, we construct and run the "Big Join Query", which
423 returns related tuples of all dimensions used to identify any regular
424 input, output, and intermediate datasets (not prerequisites). We then
425 iterate over these tuples of related dimensions, identifying the subsets
426 that correspond to distinct data IDs for each task and dataset type,
427 and then create `_QuantumScaffolding` objects.
429 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
430 dataset data IDs previously identified, transforming unresolved
431 DatasetRefs into resolved DatasetRefs where appropriate. We then look
432 up prerequisite datasets for all quanta.
434 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
435 per-task `_QuantumScaffolding` objects.
436 """
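# Illustrative sketch, not part of the module: the four steps above as a
# driver sequence, mirroring GraphBuilder.makeGraph at the bottom of this
# file.  ``pipeline``, ``registry``, ``collections``, ``run``, ``userQuery``
# and ``dataId`` are assumed to be supplied by the caller.
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)      # 1
#     with scaffolding.connectDataIds(registry, collections, userQuery,
#                                     dataId) as commonDataIds:            # 2
#         scaffolding.resolveDatasetRefs(registry, collections, run,
#                                        commonDataIds)                    # 3
#     graph = scaffolding.makeQuantumGraph()                               # 4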
437 def __init__(self, pipeline, *, registry):
438 _LOG.debug("Initializing data structures for QuantumGraph generation.")
439 self.tasks = []
440 # Aggregate and categorize the DatasetTypes in the Pipeline.
441 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
442 # Construct dictionaries that map those DatasetTypes to structures
443 # that will (later) hold additional information about them.
444 for attr in ("initInputs", "initIntermediates", "initOutputs",
445 "inputs", "intermediates", "outputs", "prerequisites"):
446 setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
447 universe=registry.dimensions))
448 # Aggregate all dimensions for all non-init, non-prerequisite
449 # DatasetTypes. These are the ones we'll include in the big join
450 # query.
451 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
452 self.outputs.dimensions)
453 # Construct scaffolding nodes for each Task, and add backreferences
454 # to the Task from each DatasetScaffolding node.
455 # Note that there's only one scaffolding node for each DatasetType,
456 # shared by _PipelineScaffolding and all _TaskScaffoldings that
457 # reference it.
458 if isinstance(pipeline, Pipeline):
459 pipeline = pipeline.toExpandedPipeline()
460 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
461 for taskDef, taskDatasetTypes in zip(pipeline,
462 datasetTypes.byTask.values())]
464 def __repr__(self):
465 # Default dataclass-injected __repr__ gets caught in an infinite loop
466 # because of back-references.
467 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
469 tasks: List[_TaskScaffolding]
470 """Scaffolding data structures for each task in the pipeline
471 (`list` of `_TaskScaffolding`).
472 """
474 initInputs: _DatasetDict
475 """Datasets consumed but not produced when constructing the tasks in this
476 pipeline (`_DatasetDict`).
477 """
479 initIntermediates: _DatasetDict
480 """Datasets that are both consumed and produced when constructing the tasks
481 in this pipeline (`_DatasetDict`).
482 """
484 initOutputs: _DatasetDict
485 """Datasets produced but not consumed when constructing the tasks in this
486 pipeline (`_DatasetDict`).
487 """
489 inputs: _DatasetDict
490 """Datasets that are consumed but not produced when running this pipeline
491 (`_DatasetDict`).
492 """
494 intermediates: _DatasetDict
495 """Datasets that are both produced and consumed when running this pipeline
496 (`_DatasetDict`).
497 """
499 outputs: _DatasetDict
500 """Datasets produced but not consumed when running this pipeline
501 (`_DatasetDict`).
502 """
504 prerequisites: _DatasetDict
505 """Datasets that are consumed when running this pipeline and looked up
506 per-Quantum when generating the graph (`_DatasetDict`).
507 """
509 dimensions: DimensionGraph
510 """All dimensions used by any regular input, intermediate, or output
511 (not prerequisite) dataset; the set of dimensions used in the "Big Join
512 Query" (`DimensionGraph`).
514 This is required to be a superset of all task quantum dimensions.
515 """
517 @contextmanager
518 def connectDataIds(self, registry, collections, userQuery, externalDataId,
519 datasetQueryConstraint: DatasetQueryConstraintVariant =
520 DatasetQueryConstraintVariant.ALL):
521 """Query for the data IDs that connect nodes in the `QuantumGraph`.
523 This method populates `_TaskScaffolding.dataIds` and
524 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
526 Parameters
527 ----------
528 registry : `lsst.daf.butler.Registry`
529 Registry for the data repository; used for all data ID queries.
530 collections
531 Expressions representing the collections to search for input
532 datasets. May be any of the types accepted by
533 `lsst.daf.butler.CollectionSearch.fromExpression`.
534 userQuery : `str` or `None`
535 User-provided expression to limit the data IDs processed.
536 externalDataId : `DataCoordinate`
537 Externally-provided data ID that should be used to restrict the
538 results, just as if these constraints had been included via ``AND``
539 in ``userQuery``. This includes (at least) any instrument named
540 in the pipeline definition.
541 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
542 The query constraint variant that should be used to constrain the
543 query based on dataset existence, defaults to
544 `DatasetQueryConstraintVariant.ALL`.
546 Returns
547 -------
548 commonDataIds : \
549 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
550 An interface to a database temporary table containing all data IDs
551 that will appear in this `QuantumGraph`. Returned inside a
552 context manager, which will drop the temporary table at the end of
553 the `with` block in which this method is called.
554 """
555 _LOG.debug("Building query for data IDs.")
556 # Initialization datasets always have empty data IDs.
557 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
558 for datasetType, refs in itertools.chain(self.initInputs.items(),
559 self.initIntermediates.items(),
560 self.initOutputs.items()):
561 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
562 # Run one big query for the data IDs for task dimensions and regular
563 # inputs and outputs. We limit the query to only dimensions that are
564 # associated with the input dataset types, but don't (yet) try to
565 # obtain the dataset_ids for those inputs.
566 _LOG.debug("Submitting data ID query and materializing results.")
567 queryArgs = {'dimensions': self.dimensions, 'where': userQuery, 'dataId': externalDataId}
568 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
569 _LOG.debug("Constraining graph query using all datasets in pipeline.")
570 queryArgs['datasets'] = list(self.inputs)
571 queryArgs['collections'] = collections
572 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
573 _LOG.debug("Not using dataset existence to constrain query.")
574 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
575 constraint = set(datasetQueryConstraint)
576 inputs = {k.name: k for k in self.inputs.keys()}
577 if (remainder := constraint.difference(inputs.keys())):
578 raise ValueError(f"{remainder} dataset type(s) specified as a graph constraint, but"
579 f" do not appear as an input to the specified pipeline: {inputs.keys()}")
580 _LOG.debug("Constraining graph query using %s", constraint)
581 queryArgs['datasets'] = [typ for name, typ in inputs.items() if name in constraint]
582 queryArgs['collections'] = collections
583 else:
584 raise ValueError(f"Unable to handle type {datasetQueryConstraint} given as "
585 "datasetQueryConstraint.")
587 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
588 _LOG.debug("Expanding data IDs.")
589 commonDataIds = commonDataIds.expanded()
590 _LOG.debug("Iterating over query results to associate quanta with datasets.")
591 # Iterate over query results, populating data IDs for datasets and
592 # quanta and then connecting them to each other.
593 n = -1
594 for n, commonDataId in enumerate(commonDataIds):
595 # Create DatasetRefs for all DatasetTypes from this result row,
596 # noting that we might have created some already.
597 # We remember both those that already existed and those that we
598 # create now.
599 refsForRow = {}
600 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
601 for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
602 self.outputs.items()):
603 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)):
604 datasetDataId = commonDataId.subset(datasetType.dimensions)
605 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
606 ref = refs.get(datasetDataId)
607 if ref is None:
608 ref = DatasetRef(datasetType, datasetDataId)
609 refs[datasetDataId] = ref
610 refsForRow[datasetType.name] = ref
611 # Create _QuantumScaffolding objects for all tasks from this
612 # result row, noting that we might have created some already.
613 for task in self.tasks:
614 quantumDataId = commonDataId.subset(task.dimensions)
615 quantum = task.quanta.get(quantumDataId)
616 if quantum is None:
617 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
618 task.quanta[quantumDataId] = quantum
619 # Whether this is a new quantum or an existing one, we can
620 # now associate the DatasetRefs for this row with it. The
621 # fact that a Quantum data ID and a dataset data ID both
622 # came from the same result row is what tells us they
623 # should be associated.
624 # Many of these associations will be duplicates (because
625 # another query row that differed from this one only in
626 # irrelevant dimensions already added them); the nested
627 # dicts are keyed by data ID, so duplicates are simply skipped.
628 for datasetType in task.inputs:
629 ref = refsForRow[datasetType.name]
630 quantum.inputs[datasetType.name][ref.dataId] = ref
631 for datasetType in task.outputs:
632 ref = refsForRow[datasetType.name]
633 quantum.outputs[datasetType.name][ref.dataId] = ref
634 if n < 0:
635 emptiness_explained = False
636 for message in commonDataIds.explain_no_results():
637 _LOG.warning(message)
638 emptiness_explained = True
639 if not emptiness_explained:
640 _LOG.warning("To reproduce this query for debugging purposes, run "
641 "Registry.queryDataIds with these arguments:")
642 # We could just repr() the queryArgs dict to get something
643 # the user could make sense of, but it's friendlier to
644 # put these args in an easier-to-construct equivalent form
645 # so they can read it more easily and copy and paste into
646 # a Python terminal.
647 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names))
648 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName())
649 if queryArgs["where"]:
650 _LOG.warning(" where=%s,", repr(queryArgs["where"]))
651 if "datasets" in queryArgs:
652 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
653 if "collections" in queryArgs:
654 _LOG.warning(" collections=%s,", list(queryArgs["collections"]))
655 _LOG.debug("Finished processing %d rows from data ID query.", n)
656 yield commonDataIds
658 def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None,
659 clobberOutputs=True, constrainedByAllDatasets: bool = True):
660 """Perform follow-up queries for each dataset data ID produced in
661 `connectDataIds`.
663 This method populates `_DatasetScaffolding.refs` (except for those in
664 `prerequisites`).
666 Parameters
667 ----------
668 registry : `lsst.daf.butler.Registry`
669 Registry for the data repository; used for all data ID queries.
670 collections
671 Expressions representing the collections to search for input
672 datasets. May be any of the types accepted by
673 `lsst.daf.butler.CollectionSearch.fromExpression`.
674 run : `str`, optional
675 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
676 output datasets, if it already exists.
677 commonDataIds : \
678 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
679 Result of a previous call to `connectDataIds`.
680 skipExistingIn
681 Expressions representing the collections to search for existing
682 output datasets that should be skipped. May be any of the types
683 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
684 `None` or empty string/sequence disables skipping.
685 clobberOutputs : `bool`, optional
686 If `True` (default), allow quanta to be created even if outputs exist;
687 this requires the same behavior to be enabled when
688 executing. If ``skipExistingIn`` is not `None`, completed quanta
689 (those with metadata, or all outputs if there is no metadata
690 dataset configured) will be skipped rather than clobbered.
691 constrainedByAllDatasets : `bool`, optional
692 Indicates if the commonDataIds were generated with a constraint on
693 all dataset types.
695 Raises
696 ------
697 OutputExistsError
698 Raised if an output dataset already exists in the output run
699 and ``skipExistingIn`` does not include the output run, or if only
700 some outputs are present and ``clobberOutputs`` is `False`.
701 """
702 skipCollections: Optional[CollectionSearch] = None
703 skipExistingInRun = False
704 if skipExistingIn:
705 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
706 if run:
707 # As an optimization, check the explicit list of names first.
708 skipExistingInRun = run in skipCollections.explicitNames()
709 if not skipExistingInRun:
710 # need to flatten it and check again
711 skipExistingInRun = run in registry.queryCollections(
712 skipExistingIn,
713 collectionTypes=CollectionType.RUN,
714 )
716 # Look up [init] intermediate and output datasets in the output
717 # collection, if there is an output collection.
718 if run is not None or skipCollections is not None:
719 for datasetType, refs in itertools.chain(self.initIntermediates.items(),
720 self.initOutputs.items(),
721 self.intermediates.items(),
722 self.outputs.items()):
723 _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
724 len(refs), datasetType.name)
725 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
726 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
728 # look at RUN collection first
729 if run is not None:
730 resolvedRefQueryResults = subset.findDatasets(
731 datasetType,
732 collections=run,
733 findFirst=True
734 )
735 for resolvedRef in resolvedRefQueryResults:
736 # TODO: we could easily support per-DatasetType
737 # skipExisting and I could imagine that being useful -
738 # it's probably required in order to support writing
739 # initOutputs before QuantumGraph generation.
740 assert resolvedRef.dataId in refs
741 if not (skipExistingInRun or isInit or clobberOutputs):
742 raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
743 f"output RUN collection '{run}' with data ID"
744 f" {resolvedRef.dataId}.")
746 # Also check skipExistingIn; if the RUN collection is in it,
747 # that case was already handled above.
748 if skipCollections is not None:
749 resolvedRefQueryResults = subset.findDatasets(
750 datasetType,
751 collections=skipCollections,
752 findFirst=True
753 )
754 for resolvedRef in resolvedRefQueryResults:
755 assert resolvedRef.dataId in refs
756 refs[resolvedRef.dataId] = resolvedRef
758 # Look up input and initInput datasets in the input collection(s).
759 # Container to accumulate unfound refs, used when the common data IDs
760 # were not constrained on dataset type existence.
761 self.unfoundRefs = set()
762 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
763 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
764 resolvedRefQueryResults = commonDataIds.subset(
765 datasetType.dimensions,
766 unique=True
767 ).findDatasets(
768 datasetType,
769 collections=collections,
770 findFirst=True
771 )
772 dataIdsNotFoundYet = set(refs.keys())
773 for resolvedRef in resolvedRefQueryResults:
774 dataIdsNotFoundYet.discard(resolvedRef.dataId)
775 refs[resolvedRef.dataId] = resolvedRef
776 if dataIdsNotFoundYet:
777 if constrainedByAllDatasets:
778 raise RuntimeError(
779 f"{len(dataIdsNotFoundYet)} dataset(s) of type "
780 f"'{datasetType.name}' was/were present in a previous "
781 f"query, but could not be found now. "
782 f"This is either a logic bug in QuantumGraph generation "
783 f"or the input collections have been modified since "
784 f"QuantumGraph generation began."
785 )
786 else:
787 # If the common data IDs were not constrained using all the
788 # input dataset types, it is possible that some data IDs
789 # found don't correspond to existing datasets and will
790 # remain unresolved. Mark these for later pruning from
791 # the quantum graph.
792 for k in dataIdsNotFoundYet:
793 self.unfoundRefs.add(refs[k])
795 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
796 # replacing the unresolved refs there, and then look up prerequisites.
797 for task in self.tasks:
798 _LOG.debug(
799 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
800 len(task.quanta),
801 task.taskDef.label
802 )
803 lookupFunctions = {
804 c.name: c.lookupFunction
805 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
806 if c.lookupFunction is not None
807 }
808 dataIdsFailed = []
809 dataIdsSucceeded = []
810 for quantum in task.quanta.values():
811 # Process output datasets only if skipExistingIn is not None
812 # or there is a run to look for outputs in and clobberOutputs
813 # is True. Note that if skipExistingIn is None, any output
814 # datasets that already exist would have already caused an
815 # exception to be raised. We never update the DatasetRefs in
816 # the quantum because those should never be resolved.
817 if skipCollections is not None or (run is not None and clobberOutputs):
818 resolvedRefs = []
819 unresolvedRefs = []
820 haveMetadata = False
821 for datasetType, originalRefs in quantum.outputs.items():
822 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
823 if ref.id is not None:
824 resolvedRefs.append(ref)
825 if datasetType.name == task.taskDef.metadataDatasetName:
826 haveMetadata = True
827 else:
828 unresolvedRefs.append(ref)
829 if resolvedRefs:
830 if haveMetadata or not unresolvedRefs:
831 dataIdsSucceeded.append(quantum.dataId)
832 if skipCollections is not None:
833 continue
834 else:
835 dataIdsFailed.append(quantum.dataId)
836 if not clobberOutputs:
837 raise OutputExistsError(
838 f"Quantum {quantum.dataId} of task with label "
839 f"'{quantum.task.taskDef.label}' has some outputs that exist "
840 f"({resolvedRefs}) "
841 f"and others that don't ({unresolvedRefs}), with no metadata output, "
842 "and clobbering outputs was not enabled."
843 )
844 # Update the input DatasetRefs to the resolved ones we already
845 # searched for.
846 for datasetType, refs in quantum.inputs.items():
847 for ref in task.inputs.extract(datasetType, refs.keys()):
848 refs[ref.dataId] = ref
849 # Look up prerequisite datasets in the input collection(s).
850 # These may have dimensions that extend beyond those we queried
851 # for originally, because we want to permit those data ID
852 # values to differ across quanta and dataset types.
853 for datasetType in task.prerequisites:
854 lookupFunction = lookupFunctions.get(datasetType.name)
855 if lookupFunction is not None:
856 # PipelineTask has provided its own function to do the
857 # lookup. This always takes precedence.
858 refs = list(
859 lookupFunction(datasetType, registry, quantum.dataId, collections)
860 )
861 elif (datasetType.isCalibration()
862 and datasetType.dimensions <= quantum.dataId.graph
863 and quantum.dataId.graph.temporal):
864 # This is a master calibration lookup, which we have to
865 # handle specially because the query system can't do a
866 # temporal join on a non-dimension-based timespan yet.
867 timespan = quantum.dataId.timespan
868 try:
869 refs = [registry.findDataset(datasetType, quantum.dataId,
870 collections=collections,
871 timespan=timespan)]
872 except KeyError:
873 # This dataset type is not present in the registry,
874 # which just means there are no datasets here.
875 refs = []
876 else:
877 # Most general case.
878 refs = list(registry.queryDatasets(datasetType,
879 collections=collections,
880 dataId=quantum.dataId,
881 findFirst=True).expanded())
882 quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
883 if ref is not None})
884 # Actually remove any quanta that we decided to skip above.
885 if dataIdsSucceeded:
886 if skipCollections is not None:
887 _LOG.debug("Pruning %d successful quanta for task with label '%s' because all of their "
888 "outputs exist or metadata was written successfully.",
889 len(dataIdsSucceeded), task.taskDef.label)
890 for dataId in dataIdsSucceeded:
891 del task.quanta[dataId]
892 elif clobberOutputs:
893 _LOG.info("Found %d successful quanta for task with label '%s' "
894 "that will need to be clobbered during execution.",
895 len(dataIdsSucceeded),
896 task.taskDef.label)
897 else:
898 raise AssertionError("OutputExistsError should have already been raised.")
899 if dataIdsFailed:
900 if clobberOutputs:
901 _LOG.info("Found %d failed/incomplete quanta for task with label '%s' "
902 "that will need to be clobbered during execution.",
903 len(dataIdsFailed),
904 task.taskDef.label)
905 else:
906 raise AssertionError("OutputExistsError should have already been raised.")
908 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
909 """Create a `QuantumGraph` from the quanta already present in
910 the scaffolding data structure.
912 Parameters
913 ----------
914 metadata : Optional Mapping of `str` to primitives
915 This is an optional parameter of extra data to carry with the
916 graph. Entries in this mapping should be able to be serialized in
917 JSON.
919 Returns
920 -------
921 graph : `QuantumGraph`
922 The full `QuantumGraph`.
923 """
924 graphInput: Dict[TaskDef, Set[Quantum]] = {}
925 for task in self.tasks:
926 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs)
927 graphInput[task.taskDef] = qset
929 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs)
930 return graph
933# ------------------------
934# Exported definitions --
935# ------------------------
938class GraphBuilderError(Exception):
939 """Base class for exceptions generated by graph builder.
940 """
941 pass
944class OutputExistsError(GraphBuilderError):
945 """Exception generated when output datasets already exist.
946 """
947 pass
950class PrerequisiteMissingError(GraphBuilderError):
951 """Exception generated when a prerequisite dataset does not exist.
952 """
953 pass
956class GraphBuilder(object):
957 """GraphBuilder is responsible for building a task execution graph
958 from a Pipeline.
960 Parameters
961 ----------
962 registry : `~lsst.daf.butler.Registry`
963 Registry for the data repository; used for all data ID queries.
964 skipExistingIn
965 Expressions representing the collections to search for existing
966 output datasets that should be skipped. May be any of the types
967 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
968 clobberOutputs : `bool`, optional
969 If `True` (default), allow quanta to be created even if partial outputs
970 exist; this requires the same behavior to be enabled when
971 executing.
972 """
974 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
975 self.registry = registry
976 self.dimensions = registry.dimensions
977 self.skipExistingIn = skipExistingIn
978 self.clobberOutputs = clobberOutputs
980 def makeGraph(self, pipeline, collections, run, userQuery,
981 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
982 metadata: Optional[Mapping[str, Any]] = None):
983 """Create execution graph for a pipeline.
985 Parameters
986 ----------
987 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
988 Pipeline definition, task names/classes and their configs.
989 collections
990 Expressions representing the collections to search for input
991 datasets. May be any of the types accepted by
992 `lsst.daf.butler.CollectionSearch.fromExpression`.
993 run : `str`, optional
994 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
995 output datasets, if it already exists.
996 userQuery : `str`
997 String which defines the user's data selection for the registry; should
998 be empty or `None` if there are no restrictions on data selection.
999 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
1000 The query constraint variant that should be used to constrain the
1001 query based on dataset existence, defaults to
1002 `DatasetQueryConstraintVariant.ALL`.
1003 metadata : Optional Mapping of `str` to primitives
1004 This is an optional parameter of extra data to carry with the
1005 graph. Entries in this mapping should be able to be serialized in
1006 JSON.
1008 Returns
1009 -------
1010 graph : `QuantumGraph`
1012 Raises
1013 ------
1014 UserExpressionError
1015 Raised when user expression cannot be parsed.
1016 OutputExistsError
1017 Raised when output datasets already exist.
1018 Exception
1019 Other exceptions types may be raised by underlying registry
1020 classes.
1021 """
1022 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1023 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1024 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1025 instrument = None
1026 if isinstance(pipeline, Pipeline):
1027 instrument = pipeline.getInstrument()
1028 if isinstance(instrument, str):
1029 instrument = doImport(instrument)
1030 pipeline = list(pipeline.toExpandedPipeline())
1031 if instrument is not None:
1032 dataId = DataCoordinate.standardize(instrument=instrument.getName(),
1033 universe=self.registry.dimensions)
1034 else:
1035 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1036 with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId,
1037 datasetQueryConstraint) as commonDataIds:
1038 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1039 scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
1040 skipExistingIn=self.skipExistingIn,
1041 clobberOutputs=self.clobberOutputs,
1042 constrainedByAllDatasets=condition)
1043 return scaffolding.makeQuantumGraph(metadata=metadata)
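# Illustrative sketch, not part of the module: minimal use of GraphBuilder,
# assuming ``butler`` is an instantiated lsst.daf.butler.Butler, ``pipeline``
# is a Pipeline with its task classes importable, and the collection names and
# query string below are made-up examples.
#
#     builder = GraphBuilder(butler.registry, skipExistingIn=None,
#                            clobberOutputs=True)
#     qgraph = builder.makeGraph(pipeline,
#                                collections=["HSC/defaults"],
#                                run="u/example/run",
#                                userQuery="instrument = 'HSC'")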