Coverage for python/lsst/pipe/base/graphBuilder.py: 19%
388 statements
coverage.py v6.4.1, created at 2022-07-09 06:14 -0700
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23"""Module defining GraphBuilder class and related methods.
24"""
26__all__ = ["GraphBuilder"]
28# -------------------------------
29# Imports of standard modules --
30# -------------------------------
31import itertools
32import logging
33from collections import ChainMap
34from contextlib import contextmanager
35from dataclasses import dataclass
36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Union
38from lsst.daf.butler import (
39 CollectionSearch,
40 CollectionType,
41 DataCoordinate,
42 DatasetRef,
43 DatasetType,
44 Datastore,
45 DatastoreRecordData,
46 DimensionGraph,
47 DimensionUniverse,
48 NamedKeyDict,
49 Quantum,
50 Registry,
51)
52from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
53from lsst.utils import doImportType
55from ._datasetQueryConstraints import DatasetQueryConstraintVariant
56from ._status import NoWorkFound
58# -----------------------------
59# Imports for other modules --
60# -----------------------------
61from .connections import AdjustQuantumHelper, iterConnections
62from .graph import QuantumGraph
63from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef
65# ----------------------------------
66# Local non-exported definitions --
67# ----------------------------------
69_LOG = logging.getLogger(__name__)
72class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
73 """A custom dictionary that maps `DatasetType` to a nested dictionary of
74 the known `DatasetRef` instances of that type.
76 Parameters
77 ----------
78 args
79 Positional arguments are forwarded to the `dict` constructor.
80 universe : `DimensionUniverse`
81 Universe of all possible dimensions.
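
    Examples
    --------
    An illustrative sketch of the nesting, assuming ``datasetType``,
    ``dataId``, and ``universe`` are already defined::

        refs = _DatasetDict.fromDatasetTypes([datasetType], universe=universe)
        refs[datasetType][dataId] = DatasetRef(datasetType, dataId)
        ref = refs[datasetType][dataId]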
82 """
84 def __init__(self, *args: Any, universe: DimensionUniverse):
85 super().__init__(*args)
86 self.universe = universe
88 @classmethod
89 def fromDatasetTypes(
90 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse
91 ) -> _DatasetDict:
92 """Construct a dictionary from a flat iterable of `DatasetType` keys.
94 Parameters
95 ----------
96 datasetTypes : `iterable` of `DatasetType`
97 DatasetTypes to use as keys for the dict. Values will be empty
98 dictionaries.
99 universe : `DimensionUniverse`
100 Universe of all possible dimensions.
102 Returns
103 -------
104 dictionary : `_DatasetDict`
105 A new `_DatasetDict` instance.
106 """
107 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
109 @classmethod
110 def fromSubset(
111 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict
112 ) -> _DatasetDict:
113 """Return a new dictionary by extracting items corresponding to the
114 given keys from one or more existing dictionaries.
116 Parameters
117 ----------
118 datasetTypes : `iterable` of `DatasetType`
119 DatasetTypes to use as keys for the dict. Values will be obtained
120 by lookups against ``first`` and ``rest``.
121 first : `_DatasetDict`
122 Another dictionary from which to extract values.
123 rest
124 Additional dictionaries from which to extract values.
126 Returns
127 -------
128 dictionary : `_DatasetDict`
129 A new dictionary instance.
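
        Examples
        --------
        A sketch of how `_TaskScaffolding` uses this method to share nested
        dictionaries with its parent `_PipelineScaffolding` (see the
        ``__init__`` of `_TaskScaffolding`)::

            inputs = _DatasetDict.fromSubset(
                datasetTypes.inputs, parent.inputs, parent.intermediates
            )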
130 """
131 combined = ChainMap(first, *rest)
133 # Dataset types known to match immediately can be processed
134 # without checks.
135 matches = combined.keys() & set(datasetTypes)
136 _dict = {k: combined[k] for k in matches}
138 if len(_dict) < len(datasetTypes):
139 # Work out which ones are missing.
140 missing_datasetTypes = set(datasetTypes) - _dict.keys()
142 # Get the known names for comparison.
143 combined_by_name = {k.name: k for k in combined}
145 missing = set()
146 incompatible = {}
147 for datasetType in missing_datasetTypes:
                # The dataset type is not found. It may not be listed, or it
                # may be present with the same name but a different
                # definition.
                if datasetType.name in combined_by_name:
                    # This implies some inconsistency in the connection
                    # definitions. If storage class conversion is supported
                    # we can let it slide. At this point we do not know where
                    # the inconsistency is, but trust that downstream code
                    # will be more explicit about input vs. output
                    # incompatibilities.
159 existing = combined_by_name[datasetType.name]
160 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing):
161 _LOG.warning(
162 "Dataset type mismatch (%s != %s) but continuing since they are compatible",
163 datasetType,
164 existing,
165 )
166 _dict[datasetType] = combined[existing]
167 else:
168 incompatible[datasetType] = existing
169 else:
170 missing.add(datasetType)
172 if missing or incompatible:
173 reasons = []
174 if missing:
                reasons.append(
                    f"DatasetTypes {', '.join(d.name for d in missing)} not present in list of known types: "
                    + ", ".join(d.name for d in combined)
                )
179 if incompatible:
180 for x, y in incompatible.items():
181 reasons.append(f"{x} incompatible with {y}")
182 raise KeyError("Errors matching dataset types: " + " & ".join(reasons))
184 return cls(_dict, universe=first.universe)
186 @property
187 def dimensions(self) -> DimensionGraph:
188 """The union of all dimensions used by all dataset types in this
189 dictionary, including implied dependencies (`DimensionGraph`).
190 """
191 base = self.universe.empty
192 if len(self) == 0:
193 return base
194 return base.union(*[datasetType.dimensions for datasetType in self.keys()])
196 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
197 """Unpack nested single-element `DatasetRef` dicts into a new
198 mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

203 Returns
204 -------
205 dictionary : `NamedKeyDict`
206 Dictionary mapping `DatasetType` to `DatasetRef`, with both
207 `DatasetType` instances and string names usable as keys.
208 """
210 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
211 (ref,) = refs.values()
212 return ref
214 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})
216 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
217 """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.
220 Returns
221 -------
222 dictionary : `NamedKeyDict`
223 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
224 both `DatasetType` instances and string names usable as keys.
225 """
226 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})
228 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
229 """Iterate over the contained `DatasetRef` instances that match the
230 given `DatasetType` and data IDs.
232 Parameters
233 ----------
234 datasetType : `DatasetType`
235 Dataset type to match.
236 dataIds : `Iterable` [ `DataCoordinate` ]
237 Data IDs to match.
239 Returns
240 -------
241 refs : `Iterator` [ `DatasetRef` ]
242 DatasetRef instances for which ``ref.datasetType == datasetType``
243 and ``ref.dataId`` is in ``dataIds``.
244 """
245 refs = self[datasetType]
246 return (refs[dataId] for dataId in dataIds)
249class _QuantumScaffolding:
250 """Helper class aggregating information about a `Quantum`, used when
251 constructing a `QuantumGraph`.
253 See `_PipelineScaffolding` for a top-down description of the full
254 scaffolding data structure.
256 Parameters
257 ----------
258 task : _TaskScaffolding
259 Back-reference to the helper object for the `PipelineTask` this quantum
260 represents an execution of.
261 dataId : `DataCoordinate`
262 Data ID for this quantum.
263 """
265 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
266 self.task = task
267 self.dataId = dataId
268 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
269 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
270 self.prerequisites = _DatasetDict.fromDatasetTypes(
271 task.prerequisites.keys(), universe=dataId.universe
272 )
274 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")
276 def __repr__(self) -> str:
277 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"
279 task: _TaskScaffolding
280 """Back-reference to the helper object for the `PipelineTask` this quantum
281 represents an execution of.
282 """
284 dataId: DataCoordinate
285 """Data ID for this quantum.
286 """
288 inputs: _DatasetDict
289 """Nested dictionary containing `DatasetRef` inputs to this quantum.
291 This is initialized to map each `DatasetType` to an empty dictionary at
292 construction. Those nested dictionaries are populated (with data IDs as
293 keys) with unresolved `DatasetRef` instances in
294 `_PipelineScaffolding.connectDataIds`.
295 """
297 outputs: _DatasetDict
298 """Nested dictionary containing `DatasetRef` outputs this quantum.
299 """
301 prerequisites: _DatasetDict
302 """Nested dictionary containing `DatasetRef` prerequisite inputs to this
303 quantum.
304 """
306 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum:
307 """Transform the scaffolding object into a true `Quantum` instance.
309 Parameters
310 ----------
311 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
312 If not `None` then fill datastore records in each generated Quantum
313 using the records from this structure.
315 Returns
316 -------
317 quantum : `Quantum`
318 An actual `Quantum` instance.
319 """
320 allInputs = self.inputs.unpackMultiRefs()
321 allInputs.update(self.prerequisites.unpackMultiRefs())
322 # Give the task's Connections class an opportunity to remove some
323 # inputs, or complain if they are unacceptable.
324 # This will raise if one of the check conditions is not met, which is
325 # the intended behavior.
        # If it raises NoWorkFound, there is a bug in the QG algorithm
        # or adjustQuantum is incorrectly trying to make a prerequisite
        # input behave like a regular input; adjustQuantum should only raise
        # NoWorkFound if a regular input is missing, and it shouldn't be
        # possible for us to have generated ``self`` if that's true.
331 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
332 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
333 initInputs = self.task.initInputs.unpackSingleRefs()
334 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None
335 if datastore_records is not None:
336 quantum_records = {}
337 input_refs = list(itertools.chain.from_iterable(helper.inputs.values()))
338 input_refs += list(initInputs.values())
339 input_ids = set(ref.id for ref in input_refs if ref.id is not None)
340 for datastore_name, records in datastore_records.items():
341 matching_records = records.subset(input_ids)
342 if matching_records is not None:
343 quantum_records[datastore_name] = matching_records
344 return Quantum(
345 taskName=self.task.taskDef.taskName,
346 taskClass=self.task.taskDef.taskClass,
347 dataId=self.dataId,
348 initInputs=initInputs,
349 inputs=helper.inputs,
350 outputs=helper.outputs,
351 datastore_records=quantum_records,
352 )
355@dataclass
356class _TaskScaffolding:
357 """Helper class aggregating information about a `PipelineTask`, used when
358 constructing a `QuantumGraph`.
360 See `_PipelineScaffolding` for a top-down description of the full
361 scaffolding data structure.
363 Parameters
364 ----------
365 taskDef : `TaskDef`
366 Data structure that identifies the task class and its config.
367 parent : `_PipelineScaffolding`
368 The parent data structure that will hold the instance being
369 constructed.
370 datasetTypes : `TaskDatasetTypes`
371 Data structure that categorizes the dataset types used by this task.
372 """
374 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
375 universe = parent.dimensions.universe
376 self.taskDef = taskDef
377 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
378 assert self.dimensions.issubset(parent.dimensions)
379 # Initialize _DatasetDicts as subsets of the one or two
380 # corresponding dicts in the parent _PipelineScaffolding.
381 self.initInputs = _DatasetDict.fromSubset(
382 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates
383 )
384 self.initOutputs = _DatasetDict.fromSubset(
385 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs
386 )
387 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
388 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
389 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
390 self.dataIds: Set[DataCoordinate] = set()
391 self.quanta = {}
393 def __repr__(self) -> str:
394 # Default dataclass-injected __repr__ gets caught in an infinite loop
395 # because of back-references.
396 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"
398 taskDef: TaskDef
399 """Data structure that identifies the task class and its config
400 (`TaskDef`).
401 """
403 dimensions: DimensionGraph
404 """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
405 """
407 initInputs: _DatasetDict
408 """Dictionary containing information about datasets used to construct this
409 task (`_DatasetDict`).
410 """
412 initOutputs: _DatasetDict
413 """Dictionary containing information about datasets produced as a
414 side-effect of constructing this task (`_DatasetDict`).
415 """
417 inputs: _DatasetDict
418 """Dictionary containing information about datasets used as regular,
419 graph-constraining inputs to this task (`_DatasetDict`).
420 """
422 outputs: _DatasetDict
423 """Dictionary containing information about datasets produced by this task
424 (`_DatasetDict`).
425 """
427 prerequisites: _DatasetDict
428 """Dictionary containing information about input datasets that must be
429 present in the repository before any Pipeline containing this task is run
430 (`_DatasetDict`).
431 """
433 quanta: Dict[DataCoordinate, _QuantumScaffolding]
434 """Dictionary mapping data ID to a scaffolding object for the Quantum of
435 this task with that data ID.
436 """
438 def makeQuantumSet(
439 self,
440 unresolvedRefs: Optional[Set[DatasetRef]] = None,
441 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
442 ) -> Set[Quantum]:
443 """Create a `set` of `Quantum` from the information in ``self``.
445 Parameters
446 ----------
447 unresolvedRefs : `set` [ `DatasetRef` ], optional
448 Input dataset refs that have not been found.
        datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional
            If not `None`, use these datastore records to fill in the records
            for each generated `Quantum`.

452 Returns
453 -------
454 nodes : `set` of `Quantum`
455 The `Quantum` elements corresponding to this task.
456 """
457 if unresolvedRefs is None:
458 unresolvedRefs = set()
459 outputs = set()
460 for q in self.quanta.values():
461 try:
462 tmpQuanta = q.makeQuantum(datastore_records)
463 outputs.add(tmpQuanta)
464 except (NoWorkFound, FileNotFoundError) as exc:
465 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values())
466 if unresolvedRefs.intersection(refs):
                    # This means the node is known to be pruned later and
                    # should be left in even though some follow-up queries
                    # fail. This allows the pruning to start from this
                    # quantum with known issues, and prune other nodes it
                    # touches.
472 inputs = q.inputs.unpackMultiRefs()
473 inputs.update(q.prerequisites.unpackMultiRefs())
474 tmpQuantum = Quantum(
475 taskName=q.task.taskDef.taskName,
476 taskClass=q.task.taskDef.taskClass,
477 dataId=q.dataId,
478 initInputs=q.task.initInputs.unpackSingleRefs(),
479 inputs=inputs,
480 outputs=q.outputs.unpackMultiRefs(),
481 )
482 outputs.add(tmpQuantum)
483 else:
484 raise exc
485 return outputs
488@dataclass
489class _PipelineScaffolding:
490 """A helper data structure that organizes the information involved in
491 constructing a `QuantumGraph` for a `Pipeline`.
493 Parameters
494 ----------
495 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
496 Sequence of tasks from which a graph is to be constructed. Must
497 have nested task classes already imported.
498 universe : `DimensionUniverse`
499 Universe of all possible dimensions.
501 Notes
502 -----
503 The scaffolding data structure contains nested data structures for both
504 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
505 data structures are shared between the pipeline-level structure (which
506 aggregates all datasets and categorizes them from the perspective of the
507 complete pipeline) and the individual tasks that use them as inputs and
508 outputs.
510 `QuantumGraph` construction proceeds in four steps, with each corresponding
511 to a different `_PipelineScaffolding` method:
513 1. When `_PipelineScaffolding` is constructed, we extract and categorize
514 the DatasetTypes used by the pipeline (delegating to
515 `PipelineDatasetTypes.fromPipeline`), then use these to construct the
516 nested `_TaskScaffolding` and `_DatasetDict` objects.
518 2. In `connectDataIds`, we construct and run the "Big Join Query", which
519 returns related tuples of all dimensions used to identify any regular
520 input, output, and intermediate datasets (not prerequisites). We then
521 iterate over these tuples of related dimensions, identifying the subsets
522 that correspond to distinct data IDs for each task and dataset type,
523 and then create `_QuantumScaffolding` objects.
525 3. In `resolveDatasetRefs`, we run follow-up queries against all of the
526 dataset data IDs previously identified, transforming unresolved
527 DatasetRefs into resolved DatasetRefs where appropriate. We then look
528 up prerequisite datasets for all quanta.
530 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
531 per-task `_QuantumScaffolding` objects.
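
    Examples
    --------
    A condensed sketch of how `GraphBuilder.makeGraph` drives these steps,
    where ``registry``, ``collections``, ``run``, and ``dataId`` are assumed
    to be set up as in that method::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(
            registry, collections, userQuery=None, externalDataId=dataId
        ) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        qgraph = scaffolding.makeQuantumGraph()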
532 """
534 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry):
535 _LOG.debug("Initializing data structures for QuantumGraph generation.")
536 self.tasks = []
537 # Aggregate and categorize the DatasetTypes in the Pipeline.
538 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
539 # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
541 for attr in (
542 "initInputs",
543 "initIntermediates",
544 "initOutputs",
545 "inputs",
546 "intermediates",
547 "outputs",
548 "prerequisites",
549 ):
550 setattr(
551 self,
552 attr,
553 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions),
554 )
555 # Aggregate all dimensions for all non-init, non-prerequisite
556 # DatasetTypes. These are the ones we'll include in the big join
557 # query.
558 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions)
559 # Construct scaffolding nodes for each Task, and add backreferences
560 # to the Task from each DatasetScaffolding node.
561 # Note that there's only one scaffolding node for each DatasetType,
562 # shared by _PipelineScaffolding and all _TaskScaffoldings that
563 # reference it.
564 if isinstance(pipeline, Pipeline):
565 pipeline = pipeline.toExpandedPipeline()
566 self.tasks = [
567 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
568 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values())
569 ]
571 def __repr__(self) -> str:
572 # Default dataclass-injected __repr__ gets caught in an infinite loop
573 # because of back-references.
574 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
576 tasks: List[_TaskScaffolding]
577 """Scaffolding data structures for each task in the pipeline
578 (`list` of `_TaskScaffolding`).
579 """
581 initInputs: _DatasetDict
582 """Datasets consumed but not produced when constructing the tasks in this
583 pipeline (`_DatasetDict`).
584 """
586 initIntermediates: _DatasetDict
587 """Datasets that are both consumed and produced when constructing the tasks
588 in this pipeline (`_DatasetDict`).
589 """
591 initOutputs: _DatasetDict
592 """Datasets produced but not consumed when constructing the tasks in this
593 pipeline (`_DatasetDict`).
594 """
596 inputs: _DatasetDict
597 """Datasets that are consumed but not produced when running this pipeline
598 (`_DatasetDict`).
599 """
601 intermediates: _DatasetDict
602 """Datasets that are both produced and consumed when running this pipeline
603 (`_DatasetDict`).
604 """
606 outputs: _DatasetDict
607 """Datasets produced but not consumed when when running this pipeline
608 (`_DatasetDict`).
609 """
611 prerequisites: _DatasetDict
612 """Datasets that are consumed when running this pipeline and looked up
613 per-Quantum when generating the graph (`_DatasetDict`).
614 """
616 dimensions: DimensionGraph
617 """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
619 Query" (`DimensionGraph`).
621 This is required to be a superset of all task quantum dimensions.
622 """
624 @contextmanager
625 def connectDataIds(
626 self,
627 registry: Registry,
628 collections: Any,
629 userQuery: Optional[str],
630 externalDataId: DataCoordinate,
631 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
632 ) -> Iterator[DataCoordinateQueryResults]:
633 """Query for the data IDs that connect nodes in the `QuantumGraph`.
635 This method populates `_TaskScaffolding.dataIds` and
636 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).
638 Parameters
639 ----------
640 registry : `lsst.daf.butler.Registry`
641 Registry for the data repository; used for all data ID queries.
642 collections
643 Expressions representing the collections to search for input
644 datasets. May be any of the types accepted by
645 `lsst.daf.butler.CollectionSearch.fromExpression`.
646 userQuery : `str` or `None`
647 User-provided expression to limit the data IDs processed.
648 externalDataId : `DataCoordinate`
649 Externally-provided data ID that should be used to restrict the
650 results, just as if these constraints had been included via ``AND``
651 in ``userQuery``. This includes (at least) any instrument named
652 in the pipeline definition.
653 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
            The query constraint variant that should be used to constrain the
            query based on dataset existence; defaults to
656 `DatasetQueryConstraintVariant.ALL`.
658 Returns
659 -------
660 commonDataIds : \
661 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
662 An interface to a database temporary table containing all data IDs
663 that will appear in this `QuantumGraph`. Returned inside a
664 context manager, which will drop the temporary table at the end of
665 the `with` block in which this method is called.
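
        Examples
        --------
        A sketch of running the query without any dataset-existence
        constraint, assuming ``scaffolding``, ``registry``, and ``dataId``
        are set up as in `GraphBuilder.makeGraph` (the collection name is
        hypothetical)::

            with scaffolding.connectDataIds(
                registry,
                ["HSC/defaults"],
                userQuery=None,
                externalDataId=dataId,
                datasetQueryConstraint=DatasetQueryConstraintVariant.OFF,
            ) as commonDataIds:
                ...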
666 """
667 _LOG.debug("Building query for data IDs.")
668 # Initialization datasets always have empty data IDs.
669 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
670 for datasetType, refs in itertools.chain(
671 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items()
672 ):
673 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
674 # Run one big query for the data IDs for task dimensions and regular
675 # inputs and outputs. We limit the query to only dimensions that are
676 # associated with the input dataset types, but don't (yet) try to
677 # obtain the dataset_ids for those inputs.
678 _LOG.debug("Submitting data ID query and materializing results.")
679 queryArgs: Dict[str, Any] = {
680 "dimensions": self.dimensions,
681 "where": userQuery,
682 "dataId": externalDataId,
683 }
684 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL:
685 _LOG.debug("Constraining graph query using all datasets in pipeline.")
686 queryArgs["datasets"] = list(self.inputs)
687 queryArgs["collections"] = collections
688 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF:
689 _LOG.debug("Not using dataset existence to constrain query.")
690 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST:
691 constraint = set(datasetQueryConstraint)
692 inputs = {k.name: k for k in self.inputs.keys()}
693 if remainder := constraint.difference(inputs.keys()):
694 raise ValueError(
695 f"{remainder} dataset type(s) specified as a graph constraint, but"
696 f" do not appear as an input to the specified pipeline: {inputs.keys()}"
697 )
698 _LOG.debug(f"Constraining graph query using {constraint}")
699 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint]
700 queryArgs["collections"] = collections
701 else:
702 raise ValueError(
703 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint."
704 )
706 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds:
707 _LOG.debug("Expanding data IDs.")
708 commonDataIds = commonDataIds.expanded()
709 _LOG.debug("Iterating over query results to associate quanta with datasets.")
710 # Iterate over query results, populating data IDs for datasets and
711 # quanta and then connecting them to each other.
712 n = -1
713 for n, commonDataId in enumerate(commonDataIds):
714 # Create DatasetRefs for all DatasetTypes from this result row,
715 # noting that we might have created some already.
716 # We remember both those that already existed and those that we
717 # create now.
718 refsForRow = {}
719 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}
720 for datasetType, refs in itertools.chain(
721 self.inputs.items(), self.intermediates.items(), self.outputs.items()
722 ):
723 datasetDataId: Optional[DataCoordinate]
724 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None:
725 datasetDataId = commonDataId.subset(datasetType.dimensions)
726 dataIdCacheForRow[datasetType.dimensions] = datasetDataId
727 ref = refs.get(datasetDataId)
728 if ref is None:
729 ref = DatasetRef(datasetType, datasetDataId)
730 refs[datasetDataId] = ref
731 refsForRow[datasetType.name] = ref
732 # Create _QuantumScaffolding objects for all tasks from this
733 # result row, noting that we might have created some already.
734 for task in self.tasks:
735 quantumDataId = commonDataId.subset(task.dimensions)
736 quantum = task.quanta.get(quantumDataId)
737 if quantum is None:
738 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
739 task.quanta[quantumDataId] = quantum
740 # Whether this is a new quantum or an existing one, we can
741 # now associate the DatasetRefs for this row with it. The
742 # fact that a Quantum data ID and a dataset data ID both
743 # came from the same result row is what tells us they
744 # should be associated.
                    # Many of these associations will be duplicates (because
746 # another query row that differed from this one only in
747 # irrelevant dimensions already added them), and we use
748 # sets to skip.
749 for datasetType in task.inputs:
750 ref = refsForRow[datasetType.name]
751 quantum.inputs[datasetType.name][ref.dataId] = ref
752 for datasetType in task.outputs:
753 ref = refsForRow[datasetType.name]
754 quantum.outputs[datasetType.name][ref.dataId] = ref
755 if n < 0:
756 emptiness_explained = False
757 for message in commonDataIds.explain_no_results():
758 _LOG.warning(message)
759 emptiness_explained = True
760 if not emptiness_explained:
761 _LOG.warning(
762 "To reproduce this query for debugging purposes, run "
763 "Registry.queryDataIds with these arguments:"
764 )
765 # We could just repr() the queryArgs dict to get something
766 # the user could make sense of, but it's friendlier to
767 # put these args in an easier-to-construct equivalent form
768 # so they can read it more easily and copy and paste into
769 # a Python terminal.
770 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names))
771 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName())
772 if queryArgs["where"]:
773 _LOG.warning(" where=%s,", repr(queryArgs["where"]))
774 if "datasets" in queryArgs:
775 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]])
776 if "collections" in queryArgs:
777 _LOG.warning(" collections=%s,", list(queryArgs["collections"]))
778 _LOG.debug("Finished processing %d rows from data ID query.", n)
779 yield commonDataIds
781 def resolveDatasetRefs(
782 self,
783 registry: Registry,
784 collections: Any,
785 run: Optional[str],
786 commonDataIds: DataCoordinateQueryResults,
787 *,
788 skipExistingIn: Any = None,
789 clobberOutputs: bool = True,
790 constrainedByAllDatasets: bool = True,
791 ) -> None:
792 """Perform follow up queries for each dataset data ID produced in
793 `fillDataIds`.
795 This method populates `_DatasetScaffolding.refs` (except for those in
796 `prerequisites`).
798 Parameters
799 ----------
800 registry : `lsst.daf.butler.Registry`
801 Registry for the data repository; used for all data ID queries.
802 collections
803 Expressions representing the collections to search for input
804 datasets. May be any of the types accepted by
805 `lsst.daf.butler.CollectionSearch.fromExpression`.
806 run : `str`, optional
807 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
808 output datasets, if it already exists.
809 commonDataIds : \
810 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
811 Result of a previous call to `connectDataIds`.
812 skipExistingIn
813 Expressions representing the collections to search for existing
814 output datasets that should be skipped. May be any of the types
815 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
816 `None` or empty string/sequence disables skipping.
817 clobberOutputs : `bool`, optional
            If `True` (default), allow quanta to be created even if outputs
            exist; this requires the same behavior to be enabled when
820 executing. If ``skipExistingIn`` is not `None`, completed quanta
821 (those with metadata, or all outputs if there is no metadata
822 dataset configured) will be skipped rather than clobbered.
823 constrainedByAllDatasets : `bool`, optional
824 Indicates if the commonDataIds were generated with a constraint on
825 all dataset types.
827 Raises
828 ------
829 OutputExistsError
830 Raised if an output dataset already exists in the output run
831 and ``skipExistingIn`` does not include output run, or if only
832 some outputs are present and ``clobberOutputs`` is `False`.
833 """
834 skipCollections: Optional[CollectionSearch] = None
835 skipExistingInRun = False
836 if skipExistingIn:
837 skipCollections = CollectionSearch.fromExpression(skipExistingIn)
838 if run:
                # As an optimization, check the explicit list of names first.
840 skipExistingInRun = run in skipCollections.explicitNames()
841 if not skipExistingInRun:
842 # need to flatten it and check again
843 skipExistingInRun = run in registry.queryCollections(
844 skipExistingIn,
845 collectionTypes=CollectionType.RUN,
846 )
848 # Look up [init] intermediate and output datasets in the output
849 # collection, if there is an output collection.
850 if run is not None or skipCollections is not None:
851 for datasetType, refs in itertools.chain(
852 self.initIntermediates.items(),
853 self.initOutputs.items(),
854 self.intermediates.items(),
855 self.outputs.items(),
856 ):
857 _LOG.debug(
858 "Resolving %d datasets for intermediate and/or output dataset %s.",
859 len(refs),
860 datasetType.name,
861 )
862 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
863 subset = commonDataIds.subset(datasetType.dimensions, unique=True)
865 # look at RUN collection first
866 if run is not None:
867 resolvedRefQueryResults = subset.findDatasets(
868 datasetType, collections=run, findFirst=True
869 )
870 for resolvedRef in resolvedRefQueryResults:
871 # TODO: we could easily support per-DatasetType
872 # skipExisting and I could imagine that being useful -
873 # it's probably required in order to support writing
874 # initOutputs before QuantumGraph generation.
875 assert resolvedRef.dataId in refs
876 if not (skipExistingInRun or isInit or clobberOutputs):
877 raise OutputExistsError(
878 f"Output dataset {datasetType.name} already exists in "
879 f"output RUN collection '{run}' with data ID"
880 f" {resolvedRef.dataId}."
881 )
                # Also check skipExistingIn; if the RUN collection is in it,
                # that case was already handled above.
885 if skipCollections is not None:
886 resolvedRefQueryResults = subset.findDatasets(
887 datasetType, collections=skipCollections, findFirst=True
888 )
889 for resolvedRef in resolvedRefQueryResults:
890 assert resolvedRef.dataId in refs
891 refs[resolvedRef.dataId] = resolvedRef
893 # Look up input and initInput datasets in the input collection(s).
        # Container to accumulate unfound refs, if the common data IDs were
        # not constrained on dataset type existence.
896 self.unfoundRefs = set()
897 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
898 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
899 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets(
900 datasetType, collections=collections, findFirst=True
901 )
902 dataIdsNotFoundYet = set(refs.keys())
903 for resolvedRef in resolvedRefQueryResults:
904 dataIdsNotFoundYet.discard(resolvedRef.dataId)
905 refs[resolvedRef.dataId] = resolvedRef
906 if dataIdsNotFoundYet:
907 if constrainedByAllDatasets:
                    raise RuntimeError(
                        f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                        f"'{datasetType.name}' was/were present in a previous "
                        "query, but could not be found now. "
                        "This is either a logic bug in QuantumGraph generation "
                        "or the input collections have been modified since "
                        "QuantumGraph generation began."
                    )
916 else:
                    # If the common data IDs were not constrained using all
                    # the input dataset types, it is possible that some data
                    # IDs found do not correspond to existing datasets and
                    # will be unresolved. Mark these for later pruning from
                    # the quantum graph.
922 for k in dataIdsNotFoundYet:
923 self.unfoundRefs.add(refs[k])
925 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
926 # replacing the unresolved refs there, and then look up prerequisites.
927 for task in self.tasks:
928 _LOG.debug(
929 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
930 len(task.quanta),
931 task.taskDef.label,
932 )
933 # The way iterConnections is designed makes it impossible to
934 # annotate precisely enough to satisfy MyPy here.
935 lookupFunctions = {
936 c.name: c.lookupFunction # type: ignore
937 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
938 if c.lookupFunction is not None # type: ignore
939 }
940 dataIdsFailed = []
941 dataIdsSucceeded = []
942 for quantum in task.quanta.values():
                # Process output datasets only if skipExistingIn is not None
944 # or there is a run to look for outputs in and clobberOutputs
945 # is True. Note that if skipExistingIn is None, any output
946 # datasets that already exist would have already caused an
947 # exception to be raised. We never update the DatasetRefs in
948 # the quantum because those should never be resolved.
949 if skipCollections is not None or (run is not None and clobberOutputs):
950 resolvedRefs = []
951 unresolvedRefs = []
952 haveMetadata = False
953 for datasetType, originalRefs in quantum.outputs.items():
954 for ref in task.outputs.extract(datasetType, originalRefs.keys()):
955 if ref.id is not None:
956 resolvedRefs.append(ref)
957 if datasetType.name == task.taskDef.metadataDatasetName:
958 haveMetadata = True
959 else:
960 unresolvedRefs.append(ref)
961 if resolvedRefs:
962 if haveMetadata or not unresolvedRefs:
963 dataIdsSucceeded.append(quantum.dataId)
964 if skipCollections is not None:
965 continue
966 else:
967 dataIdsFailed.append(quantum.dataId)
968 if not clobberOutputs:
969 raise OutputExistsError(
970 f"Quantum {quantum.dataId} of task with label "
971 f"'{quantum.task.taskDef.label}' has some outputs that exist "
972 f"({resolvedRefs}) "
973 f"and others that don't ({unresolvedRefs}), with no metadata output, "
974 "and clobbering outputs was not enabled."
975 )
976 # Update the input DatasetRefs to the resolved ones we already
977 # searched for.
978 for datasetType, input_refs in quantum.inputs.items():
979 for ref in task.inputs.extract(datasetType, input_refs.keys()):
980 input_refs[ref.dataId] = ref
981 # Look up prerequisite datasets in the input collection(s).
982 # These may have dimensions that extend beyond those we queried
983 # for originally, because we want to permit those data ID
984 # values to differ across quanta and dataset types.
985 for datasetType in task.prerequisites:
986 lookupFunction = lookupFunctions.get(datasetType.name)
987 if lookupFunction is not None:
988 # PipelineTask has provided its own function to do the
989 # lookup. This always takes precedence.
990 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections))
991 elif (
992 datasetType.isCalibration()
993 and datasetType.dimensions <= quantum.dataId.graph
994 and quantum.dataId.graph.temporal
995 ):
996 # This is a master calibration lookup, which we have to
997 # handle specially because the query system can't do a
998 # temporal join on a non-dimension-based timespan yet.
999 timespan = quantum.dataId.timespan
1000 try:
1001 prereq_refs = [
1002 registry.findDataset(
1003 datasetType, quantum.dataId, collections=collections, timespan=timespan
1004 )
1005 ]
1006 except KeyError:
1007 # This dataset type is not present in the registry,
1008 # which just means there are no datasets here.
1009 prereq_refs = []
1010 else:
1011 # Most general case.
1012 prereq_refs = list(
1013 registry.queryDatasets(
1014 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True
1015 ).expanded()
1016 )
1017 quantum.prerequisites[datasetType].update(
1018 {ref.dataId: ref for ref in prereq_refs if ref is not None}
1019 )
1020 # Actually remove any quanta that we decided to skip above.
1021 if dataIdsSucceeded:
1022 if skipCollections is not None:
1023 _LOG.debug(
1024 "Pruning successful %d quanta for task with label '%s' because all of their "
1025 "outputs exist or metadata was written successfully.",
1026 len(dataIdsSucceeded),
1027 task.taskDef.label,
1028 )
1029 for dataId in dataIdsSucceeded:
1030 del task.quanta[dataId]
1031 elif clobberOutputs:
1032 _LOG.info(
1033 "Found %d successful quanta for task with label '%s' "
1034 "that will need to be clobbered during execution.",
1035 len(dataIdsSucceeded),
1036 task.taskDef.label,
1037 )
1038 else:
1039 raise AssertionError("OutputExistsError should have already been raised.")
1040 if dataIdsFailed:
1041 if clobberOutputs:
1042 _LOG.info(
1043 "Found %d failed/incomplete quanta for task with label '%s' "
1044 "that will need to be clobbered during execution.",
1045 len(dataIdsFailed),
1046 task.taskDef.label,
1047 )
1048 else:
1049 raise AssertionError("OutputExistsError should have already been raised.")
1051 def makeQuantumGraph(
1052 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None
1053 ) -> QuantumGraph:
1054 """Create a `QuantumGraph` from the quanta already present in
1055 the scaffolding data structure.

        Parameters
        ----------
        metadata : `Mapping` [ `str`, `Any` ], optional
            Optional extra metadata to carry with the graph. Entries in this
            mapping should be able to be serialized in JSON.
1063 datastore : `Datastore`, optional
1064 If not `None` then fill datastore records in each generated
1065 Quantum.
1067 Returns
1068 -------
1069 graph : `QuantumGraph`
1070 The full `QuantumGraph`.
1071 """
1073 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]:
1074 """Extract all DatasetRefs from the dictionaries"""
1075 for ref_dict in dataset_dict.values():
1076 yield from ref_dict.values()
1078 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None
1079 if datastore is not None:
1080 datastore_records = datastore.export_records(
1081 itertools.chain(
1082 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites)
1083 )
1084 )
1086 graphInput: Dict[TaskDef, Set[Quantum]] = {}
1087 for task in self.tasks:
1088 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records)
1089 graphInput[task.taskDef] = qset
1091 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs)
1092 return graph
1095# ------------------------
1096# Exported definitions --
1097# ------------------------
1100class GraphBuilderError(Exception):
1101 """Base class for exceptions generated by graph builder."""
1103 pass
1106class OutputExistsError(GraphBuilderError):
1107 """Exception generated when output datasets already exist."""
1109 pass
1112class PrerequisiteMissingError(GraphBuilderError):
1113 """Exception generated when a prerequisite dataset does not exist."""
1115 pass
1118class GraphBuilder:
1119 """GraphBuilder class is responsible for building task execution graph from
1120 a Pipeline.
1122 Parameters
1123 ----------
1124 registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID queries.
1126 skipExistingIn
1127 Expressions representing the collections to search for existing
1128 output datasets that should be skipped. May be any of the types
1129 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
1130 clobberOutputs : `bool`, optional
        If `True` (default), allow quanta to be created even if partial
        outputs exist; this requires the same behavior to be enabled when
1133 executing.
1134 datastore : `Datastore`, optional
1135 If not `None` then fill datastore records in each generated Quantum.
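
    Examples
    --------
    A minimal usage sketch, assuming ``butler`` is an existing
    `~lsst.daf.butler.Butler` and ``pipeline`` is a `Pipeline` whose task
    classes have already been imported; the collection, run, and query
    values here are hypothetical::

        builder = GraphBuilder(butler.registry)
        qgraph = builder.makeGraph(
            pipeline,
            collections=["HSC/defaults"],
            run="u/someone/test-run",
            userQuery="instrument = 'HSC'",
        )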
1136 """
1138 def __init__(
1139 self,
1140 registry: Registry,
1141 skipExistingIn: Any = None,
1142 clobberOutputs: bool = True,
1143 datastore: Optional[Datastore] = None,
1144 ):
1145 self.registry = registry
1146 self.dimensions = registry.dimensions
1147 self.skipExistingIn = skipExistingIn
1148 self.clobberOutputs = clobberOutputs
1149 self.datastore = datastore
1151 def makeGraph(
1152 self,
1153 pipeline: Union[Pipeline, Iterable[TaskDef]],
1154 collections: Any,
1155 run: Optional[str],
1156 userQuery: Optional[str],
1157 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
1158 metadata: Optional[Mapping[str, Any]] = None,
1159 ) -> QuantumGraph:
1160 """Create execution graph for a pipeline.
1162 Parameters
1163 ----------
1164 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
1165 Pipeline definition, task names/classes and their configs.
1166 collections
1167 Expressions representing the collections to search for input
1168 datasets. May be any of the types accepted by
1169 `lsst.daf.butler.CollectionSearch.fromExpression`.
1170 run : `str`, optional
1171 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
1172 output datasets, if it already exists.
        userQuery : `str` or `None`
            String which defines the user's selection for the registry; should
            be empty or `None` if there are no restrictions on data selection.
1176 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional
            The query constraint variant that should be used to constrain the
            query based on dataset existence; defaults to
1179 `DatasetQueryConstraintVariant.ALL`.
        metadata : `Mapping` [ `str`, `Any` ], optional
            Optional extra metadata to carry with the graph. Entries in this
            mapping should be able to be serialized in JSON.
1185 Returns
1186 -------
        graph : `QuantumGraph`
            The constructed execution graph.

1189 Raises
1190 ------
1191 UserExpressionError
1192 Raised when user expression cannot be parsed.
1193 OutputExistsError
1194 Raised when output datasets already exist.
1195 Exception
1196 Other exceptions types may be raised by underlying registry
1197 classes.
1198 """
1199 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
1200 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
1201 raise ValueError("Pipeline requires input datasets but no input collections provided.")
1202 instrument_class: Optional[Any] = None
1203 if isinstance(pipeline, Pipeline):
1204 instrument_class_name = pipeline.getInstrument()
1205 if instrument_class_name is not None:
1206 instrument_class = doImportType(instrument_class_name)
1207 pipeline = list(pipeline.toExpandedPipeline())
1208 if instrument_class is not None:
1209 dataId = DataCoordinate.standardize(
1210 instrument=instrument_class.getName(), universe=self.registry.dimensions
1211 )
1212 else:
1213 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
1214 with scaffolding.connectDataIds(
1215 self.registry, collections, userQuery, dataId, datasetQueryConstraint
1216 ) as commonDataIds:
1217 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL
1218 scaffolding.resolveDatasetRefs(
1219 self.registry,
1220 collections,
1221 run,
1222 commonDataIds,
1223 skipExistingIn=self.skipExistingIn,
1224 clobberOutputs=self.clobberOutputs,
1225 constrainedByAllDatasets=condition,
1226 )
1227 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)