Coverage for python/lsst/pipe/base/all_dimensions_quantum_graph_builder.py: 19%
200 statements
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""The standard, general-purpose implementation of the QuantumGraph-generation
29algorithm.
30"""
32from __future__ import annotations
34__all__ = ("AllDimensionsQuantumGraphBuilder",)
import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import Any, final

from lsst.daf.butler import Butler, DimensionGraph
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
from lsst.utils.logging import LsstLogAdapter
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)


@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph.  Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository.  Should be read-only.
    where : `str`
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID queries.  Not including an important
        constraint can result in catastrophically large query results that
        take too long to process, while including too many makes the query
        much more complex, increasing the chances that the database will
        choose a bad (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem from its most common variant: the ``where`` string
    and general data ID intersection rules apply to *all* data IDs in the
    graph.  For example, if a ``tract`` constraint is present in the ``where``
    string or an overall-input dataset, then it is impossible for any data ID
    that does not overlap that tract to be present anywhere in the pipeline,
    such as a ``{visit, detector}`` combination where the ``visit`` overlaps
    the ``tract`` even if the ``detector`` does not.
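
    Examples
    --------
    A minimal sketch of constructing and running this builder.  The
    ``input_collections`` and ``output_run`` keywords and the ``build``
    method are assumed to come from the `QuantumGraphBuilder` base class,
    and all repository, pipeline, collection, and ``where`` values below are
    placeholders:

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.pipe.base import Pipeline
        from lsst.pipe.base.all_dimensions_quantum_graph_builder import (
            AllDimensionsQuantumGraphBuilder,
        )

        # Placeholder repository and pipeline locations.
        butler = Butler("/path/to/repo")
        pipeline_graph = Pipeline.from_uri("my_pipeline.yaml").to_graph()
        builder = AllDimensionsQuantumGraphBuilder(
            pipeline_graph,
            butler,
            where="instrument = 'HSC' AND visit = 12345",
            input_collections=["HSC/defaults"],
            output_run="u/someone/example_run",
        )
        quantum_graph = builder.build()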
100 """
    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        where: str,
        dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ):
        super().__init__(pipeline_graph, butler, **kwargs)
        self.where = where
        self.dataset_query_constraint = dataset_query_constraint
        self.bind = bind

    @timeMethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        # Docstring inherited.
        # There is some chance that the dimension query for one subgraph would
        # be the same as or a dimension-subset of another.  This is an
        # optimization opportunity we're not currently taking advantage of.
        with _AllDimensionsQuery.from_builder(self, subgraph) as query:
            skeleton = self._make_subgraph_skeleton(query)
            self._find_followup_datasets(query, skeleton)
        return skeleton

    @timeMethod
    def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
        """Build a `QuantumGraphSkeleton` by iterating over the result rows
        of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.

        Returns
        -------
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph.
        """
        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID.  We'll copy those to initialize
        # the containers of keys for each result row.  We don't ever
        # explicitly add nodes to the skeleton for these, and that's okay
        # because networkx adds nodes implicitly when an edge to that node is
        # added, and we don't want to add nodes for init datasets here.
        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other.  This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them.  The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated.  Many of
            # these associations will be duplicates (because another query
            # row that differed from this one only in irrelevant dimensions
            # already added them), and our use of sets should take care of
            # that.
            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query "
                "row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph, updated in place with prerequisite
            input edges.
        """
        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward followup
            # queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue
                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; we only need
                    # to find these if they're from previously executed quanta
                    # that we might skip...
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
            del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:
                        if task_prerequisite_info.bounds.temporal_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.findRelatedDatasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.values_tuple())
                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap some common skypix
                        # ID).  We don't want to add quanta with those data
                        # IDs here, which is why we pass
                        # ignore_unrecognized_quanta=True.
                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )


@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager.  This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
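
    The expected usage pattern, mirroring
    `AllDimensionsQuantumGraphBuilder.process_subgraph`, is roughly:

    .. code-block:: python

        with _AllDimensionsQuery.from_builder(builder, subgraph) as query:
            # ``query.common_data_ids`` is backed by a temporary table that
            # only exists while this block is active.
            skeleton = builder._make_subgraph_skeleton(query)
            builder._find_followup_datasets(query, skeleton)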
379 """
381 subgraph: PipelineGraph
382 """Graph of this subset of the pipeline."""
384 grouped_by_dimensions: dict[
385 DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]
386 ] = dataclasses.field(default_factory=dict)
387 """The tasks and dataset types of this subset of the pipeline, grouped
388 by their dimensions.
390 The tasks and dataset types with empty dimensions are not included; they're
391 in other attributes since they are usually used differently. Prerequisite
392 dataset types are also not included.
393 """
395 empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
396 """The tasks of this subset of this pipeline that have empty dimensions."""
398 empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
399 """The dataset types of this subset of this pipeline that have empty
400 dimensions.
402 Prerequisite dataset types are not included.
403 """
405 overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
406 """Pipeline graph nodes for all non-prerequisite, non-init overall-input
407 dataset types for this subset of the pipeline.
408 """
410 query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
411 """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
412 """
414 common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
415 """Results of the materialized initial data ID query."""

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """
        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty)
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.extract(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "dataId": result.subgraph.data_id,
            "bind": builder.bind,
        }
        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)
            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()
            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )
        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose(" dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose(" dataId=%s,", result.query_args["dataId"].byName())
        if result.query_args["where"]:
            builder.log.verbose(" where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose(" datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose(" collections=%s,", list(result.query_args["collections"]))
        with builder.butler.registry.queryDataIds(**result.query_args).materialize() as common_data_ids:
            builder.log.debug("Expanding data IDs.")
            result.common_data_ids = common_data_ids.expanded()
            yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to
        explain why the initial data ID query returned no rows.
        """
        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )
        # We could just repr() the query_args dict to get something the user
        # could make sense of, but it's friendlier to put these args in an
        # easier-to-reconstruct equivalent form so they can read it more
        # easily and copy and paste into a Python terminal.
        log.critical(" dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical(" dataId=%s,", self.query_args["dataId"].byName())
        if self.query_args["where"]:
            log.critical(" where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical(" datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical(" collections=%s,", list(self.query_args["collections"]))