# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The standard, general-purpose implementation of the QuantumGraph-generation
algorithm.
"""

from __future__ import annotations

__all__ = ("AllDimensionsQuantumGraphBuilder",)

import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, final

from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)

if TYPE_CHECKING:
    from lsst.daf.butler import Butler, DataCoordinateQueryResults, DimensionGroup
    from lsst.utils.logging import LsstLogAdapter

    from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode


@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    where : `str`
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID queries. Not including an important
        constraint can result in catastrophically large query results that
        take too long to process, while including too many makes the query
        much more complex, increasing the chances that the database will
        choose a bad (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem from its most common variant: the ``where`` string
    and general data ID intersection rules apply to *all* data IDs in the
    graph. For example, if a ``tract`` constraint is present in the ``where``
    string or an overall-input dataset, then it is impossible for any data ID
    that does not overlap that tract to be present anywhere in the pipeline,
    such as a ``{visit, detector}`` combination where the ``visit`` overlaps
    the ``tract`` even if the ``detector`` does not.
    """
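
    # A minimal usage sketch. Here ``butler`` and ``pipeline_graph`` are assumed
    # to be an existing read-only `~lsst.daf.butler.Butler` client and a
    # `PipelineGraph`, and ``build()`` comes from the `QuantumGraphBuilder` base
    # class (its exact signature is not part of this module), so treat the
    # details as illustrative only:
    #
    #     builder = AllDimensionsQuantumGraphBuilder(
    #         pipeline_graph,
    #         butler,
    #         where="instrument = 'HSC' AND visit = 903334",
    #     )
    #     qg = builder.build()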

    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        where: str,
        dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ):
        super().__init__(pipeline_graph, butler, **kwargs)
        self.where = where
        self.dataset_query_constraint = dataset_query_constraint
        self.bind = bind

    @timeMethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        # Docstring inherited.
        # There is some chance that the dimension query for one subgraph would
        # be the same as or a dimension-subset of another. This is an
        # optimization opportunity we're not currently taking advantage of.
        with _AllDimensionsQuery.from_builder(self, subgraph) as query:
            skeleton = self._make_subgraph_skeleton(query)
            self._find_followup_datasets(query, skeleton)
        return skeleton

    @timeMethod
    def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
        """Build a `QuantumGraphSkeleton` by iterating over the result rows
        of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.

        Returns
        -------
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph.
        """
        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID. We'll copy those to initialize the
        # containers of keys for each result row. We don't ever explicitly add
        # nodes to the skeleton for these, and that's okay because networkx
        # adds nodes implicitly when an edge to that node is added, and we
        # don't want to add nodes for init datasets here.
        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other. This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them. The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated. Many of
            # these associations will be duplicates (because another query row
            # that differed from this one only in irrelevant dimensions already
            # added them), and our use of sets should take care of that.
            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        """
        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward follow-up
            # queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.find_datasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue
                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; we only need
                    # to find these if they're from previously executed quanta
                    # that we might skip...
                    count = 0
                    try:
                        for ref in data_ids.find_datasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.find_datasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
            del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:
                        if task_prerequisite_info.bounds.temporal_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.find_related_datasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.required_values)
                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap some common skypix
                        # ID). We don't want to add quanta with those data IDs
                        # here, which is why we pass
                        # ignore_unrecognized_quanta=True here.
                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )


@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager. This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
    """
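
    # Intended usage, mirroring `process_subgraph` above (a sketch; ``builder``
    # is an `AllDimensionsQuantumGraphBuilder` and ``subgraph`` a per-subgraph
    # `PipelineGraph`):
    #
    #     with _AllDimensionsQuery.from_builder(builder, subgraph) as query:
    #         for common_data_id in query.common_data_ids:
    #             ...  # the materialized temporary table is still alive here
    #     # exiting the context drops the temporary table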

    subgraph: PipelineGraph
    """Graph of this subset of the pipeline."""

    grouped_by_dimensions: dict[
        DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]
    ] = dataclasses.field(default_factory=dict)
    """The tasks and dataset types of this subset of the pipeline, grouped
    by their dimensions.

    The tasks and dataset types with empty dimensions are not included; they're
    in other attributes since they are usually used differently. Prerequisite
    dataset types are also not included.
    """

    empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
    """The tasks of this subset of the pipeline that have empty dimensions."""

    empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """The dataset types of this subset of the pipeline that have empty
    dimensions.

    Prerequisite dataset types are not included.
    """

    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
    dataset types for this subset of the pipeline.
    """

    query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
    """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
    """

    common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
    """Results of the materialized initial data ID query."""

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """
        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty.as_group())
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.conform(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "data_id": result.subgraph.data_id,
            "bind": builder.bind,
        }
        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)
            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()
            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )
        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose("    dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose("    data_id=%s,", dict(result.query_args["data_id"].required))
        if result.query_args["where"]:
            builder.log.verbose("    where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose("    datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose("    collections=%s,", list(result.query_args["collections"]))
        with builder.butler._query() as query:
            with query.data_ids(**result.query_args).materialize() as common_data_ids:
                builder.log.debug("Expanding data IDs.")
                result.common_data_ids = common_data_ids.expanded()
                yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to
        explain why the initial data ID query returned no rows.

        Parameters
        ----------
        log : `lsst.utils.logging.LsstLogAdapter`
            The logger to use to emit log messages.
        """
        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )
        # We could just repr() the query_args dict to get something
        # the user could make sense of, but it's friendlier to
        # put these args in an easier-to-reconstruct equivalent form
        # so they can read it more easily and copy and paste into
        # a Python terminal.
        log.critical("    dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical("    data_id=%s,", dict(self.query_args["data_id"].required))
        if self.query_args["where"]:
            log.critical("    where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical("    datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical("    collections=%s,", list(self.query_args["collections"]))