Coverage for python/lsst/pipe/base/all_dimensions_quantum_graph_builder.py: 18%
197 statements
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The standard, general-purpose implementation of the QuantumGraph-generation
algorithm.
"""
from __future__ import annotations

__all__ = ("AllDimensionsQuantumGraphBuilder", "DatasetQueryConstraintVariant")

import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, final

from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)

if TYPE_CHECKING:
    from lsst.daf.butler import Butler, DimensionGroup
    from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
    from lsst.utils.logging import LsstLogAdapter

    from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode

@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph.  Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository.  Should be read-only.
    where : `str`, optional
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID query.  Not including an important
        constraint can result in catastrophically large query results that
        take too long to process, while including too many makes the query
        much more complex, increasing the chances that the database will
        choose a bad (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem from its most common variant: the ``where`` string
    and general data ID intersection rules apply to *all* data IDs in the
    graph.  For example, if a ``tract`` constraint is present in the ``where``
    string or an overall-input dataset, then it is impossible for any data ID
    that does not overlap that tract to be present anywhere in the pipeline,
    such as a ``{visit, detector}`` combination where the ``visit`` overlaps
    the ``tract`` but the ``detector`` does not.
    """
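
    # A minimal construction sketch (illustrative only): ``pipeline_graph``
    # and ``butler`` are assumed to already exist, the trailing keyword
    # arguments are assumed to be among those accepted by the
    # `QuantumGraphBuilder` base class via ``**kwargs``, and the collection
    # and run names are hypothetical.
    #
    #     builder = AllDimensionsQuantumGraphBuilder(
    #         pipeline_graph,
    #         butler,
    #         where="instrument = 'HSC' AND visit = my_visit",
    #         bind={"my_visit": 903334},
    #         dataset_query_constraint=DatasetQueryConstraintVariant.ALL,
    #         input_collections=["HSC/defaults"],
    #         output_run="u/example/run",
    #     )
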
    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        where: str = "",
        dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ):
        super().__init__(pipeline_graph, butler, **kwargs)
        self.where = where
        self.dataset_query_constraint = dataset_query_constraint
        self.bind = bind

    @timeMethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        # Docstring inherited.
        # There is some chance that the dimension query for one subgraph would
        # be the same as or a dimension-subset of another.  This is an
        # optimization opportunity we're not currently taking advantage of.
        with _AllDimensionsQuery.from_builder(self, subgraph) as query:
            skeleton = self._make_subgraph_skeleton(query)
            self._find_followup_datasets(query, skeleton)
        return skeleton

    @timeMethod
    def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
        """Build a `QuantumGraphSkeleton` by iterating over the result rows
        of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.

        Returns
        -------
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph.
        """
        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID.  We'll copy those to initialize
        # the containers of keys for each result row.  We don't ever
        # explicitly add nodes to the skeleton for these, and that's okay
        # because networkx adds nodes implicitly when an edge to that node is
        # added, and we don't want to add nodes for init datasets here.
        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other.  This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them.  The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated.  Many of
            # these associations will be duplicates (because another query row
            # that differed from this one only in irrelevant dimensions
            # already added them), and our use of sets should take care of
            # that.
            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph, updated in place with edges to any
            prerequisite inputs found here.
        """
        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward
            # follow-up queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue
                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; we only need
                    # to find these if they're from previously executed quanta
                    # that we might skip...
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
                del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:
                        if task_prerequisite_info.bounds.temporal_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.findRelatedDatasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.required_values)
                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap some common skypix
                        # ID).  We don't want to add quanta with those data
                        # IDs here, which is why we pass
                        # ignore_unrecognized_quanta=True.
                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )

@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager.  This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
    """
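
    # Typical lifecycle, mirroring `process_subgraph` above; the context
    # manager keeps the materialized temporary table alive while the skeleton
    # is built and follow-up datasets are found:
    #
    #     with _AllDimensionsQuery.from_builder(builder, subgraph) as query:
    #         skeleton = builder._make_subgraph_skeleton(query)
    #         builder._find_followup_datasets(query, skeleton)
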
    subgraph: PipelineGraph
    """Graph of this subset of the pipeline."""

    grouped_by_dimensions: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = (
        dataclasses.field(default_factory=dict)
    )
    """The tasks and dataset types of this subset of the pipeline, grouped
    by their dimensions.

    The tasks and dataset types with empty dimensions are not included;
    they're in other attributes since they are usually used differently.
    Prerequisite dataset types are also not included.
    """

    empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
    """The tasks of this subset of the pipeline that have empty dimensions."""

    empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """The dataset types of this subset of the pipeline that have empty
    dimensions.

    Prerequisite dataset types are not included.
    """

    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
    dataset types for this subset of the pipeline.
    """

    query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
    """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
    """

    common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
    """Results of the materialized initial data ID query."""

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """
        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty.as_group())
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.conform(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "dataId": result.subgraph.data_id,
            "bind": builder.bind,
        }
        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)
            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()
            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )
        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose("    dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose("    dataId=%s,", dict(result.query_args["dataId"].required))
        if result.query_args["where"]:
            builder.log.verbose("    where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose("    datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose("    collections=%s,", list(result.query_args["collections"]))
        with builder.butler.registry.caching_context():
            with builder.butler.registry.queryDataIds(**result.query_args).materialize() as common_data_ids:
                builder.log.debug("Expanding data IDs.")
                result.common_data_ids = common_data_ids.expanded()
                yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to
        explain why the initial data ID query returned no rows.

        Parameters
        ----------
        log : `lsst.utils.logging.LsstLogAdapter`
            The logger to use to emit log messages.
        """
        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )
        # We could just repr() the query_args dict to get something the user
        # could make sense of, but it's friendlier to put these args in an
        # easier-to-reconstruct equivalent form so they can read it more
        # easily and copy and paste into a Python terminal.
        log.critical("    dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical("    dataId=%s,", dict(self.query_args["dataId"].required))
        if self.query_args["where"]:
            log.critical("    where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical("    datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical("    collections=%s,", list(self.query_args["collections"]))
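        # A user can then reproduce the query with a call of roughly this
        # shape (illustrative sketch; the concrete values come from the log
        # lines emitted above, and ``bind`` may also be needed if it was part
        # of the original ``query_args``):
        #
        #     butler.registry.queryDataIds(
        #         dimensions=[...],
        #         dataId={...},
        #         where="...",
        #         datasets=[...],
        #         collections=[...],
        #     )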