Coverage for python/lsst/pipe/base/all_dimensions_quantum_graph_builder.py: 19%
200 statements · coverage.py v7.3.0, created at 2023-08-31 09:39 +0000
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The standard, general-purpose implementation of the QuantumGraph-generation
algorithm.
"""

from __future__ import annotations

__all__ = ("AllDimensionsQuantumGraphBuilder",)

import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import Any, final

from lsst.daf.butler import Butler, DimensionGraph
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
from lsst.utils.logging import LsstLogAdapter
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)


@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    where : `str`
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID queries. Not including an important
        constraint can result in catastrophically large query results that take
        too long to process, while including too many makes the query much more
        complex, increasing the chances that the database will choose a bad
        (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem after its most common variant: the ``where`` string
    and general data ID intersection rules apply to *all* data IDs in the
    graph. For example, if a ``tract`` constraint is present in the ``where``
    string or an overall-input dataset, then it is impossible for any data ID
    that does not overlap that tract to be present anywhere in the pipeline,
    such as a ``{visit, detector}`` combination where the ``visit`` overlaps
    the ``tract`` even if the ``detector`` does not.
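
    Examples
    --------
    A minimal usage sketch. The repository path, collection names, and
    ``where`` expression are placeholders; ``pipeline_graph`` is assumed to be
    an already-constructed `.pipeline_graph.PipelineGraph`; and
    ``input_collections`` and ``output_run`` are assumed to be among the
    keyword arguments accepted by `QuantumGraphBuilder`, whose inherited
    ``build`` method runs the full algorithm::

        from lsst.daf.butler import Butler

        # Read-only butler client for the data repository.
        butler = Butler("/path/to/repo", writeable=False)
        builder = AllDimensionsQuantumGraphBuilder(
            pipeline_graph,
            butler,
            where="instrument = 'HSC' AND visit = 12345",
            input_collections=["HSC/defaults"],
            output_run="u/someone/example-run",
        )
        quantum_graph = builder.build()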
    """

    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        where: str,
        dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ):
        super().__init__(pipeline_graph, butler, **kwargs)
        self.where = where
        self.dataset_query_constraint = dataset_query_constraint
        self.bind = bind

    @timeMethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        # Docstring inherited.
        # There is some chance that the dimension query for one subgraph would
        # be the same as or a dimension-subset of another. This is an
        # optimization opportunity we're not currently taking advantage of.
        with _AllDimensionsQuery.from_builder(self, subgraph) as query:
            skeleton = self._make_subgraph_skeleton(query)
            self._find_followup_datasets(query, skeleton)
        return skeleton

    @timeMethod
    def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
        """Build a `QuantumGraphSkeleton` by iterating over the result rows
        of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.

        Returns
        -------
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph.
        """
        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID. We'll copy those to initialize the
        # containers of keys for each result row. We don't ever explicitly add
        # nodes to the skeleton for these, and that's okay because networkx
        # adds nodes implicitly when an edge to that node is added, and we
        # don't want to add nodes for init datasets here.
        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other. This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them. The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated. Many of
            # these associations will be duplicates (because another query row
            # that differed from this one only in irrelevant dimensions already
            # added them), and our use of sets should take care of that.
            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph to update in place with prerequisite
            input edges.
        """
        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward follow-up
            # queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue
                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; we only need
                    # to find these if they're from previously executed quanta
                    # that we might skip...
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
            del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:
                        if task_prerequisite_info.bounds.temporal_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.findRelatedDatasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.values_tuple())
                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap the same common skypix
                        # ID). We don't want to add quanta with those data IDs
                        # here, which is why we pass
                        # ignore_unrecognized_quanta=True here.
                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )


@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager. This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
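
    Examples
    --------
    A sketch of the intended use, mirroring
    `AllDimensionsQuantumGraphBuilder.process_subgraph` (``builder`` and
    ``subgraph`` stand in for a builder instance and a pipeline subgraph)::

        with _AllDimensionsQuery.from_builder(builder, subgraph) as query:
            for common_data_id in query.common_data_ids:
                ...  # use the query results while the temporary table exists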
    """

    subgraph: PipelineGraph
    """Graph of this subset of the pipeline."""

    grouped_by_dimensions: dict[
        DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]
    ] = dataclasses.field(default_factory=dict)
    """The tasks and dataset types of this subset of the pipeline, grouped
    by their dimensions.

    The tasks and dataset types with empty dimensions are not included; they're
    in other attributes since they are usually used differently. Prerequisite
    dataset types are also not included.
    """

    empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
    """The tasks of this subset of the pipeline that have empty dimensions."""

    empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """The dataset types of this subset of the pipeline that have empty
    dimensions.

    Prerequisite dataset types are not included.
    """

    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
    dataset types for this subset of the pipeline.
    """

    query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
    """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
    """

    common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
    """Results of the materialized initial data ID query."""

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """
        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty)
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.extract(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "dataId": result.subgraph.data_id,
            "bind": builder.bind,
        }
        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)
            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()
            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}.")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )
        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose(" dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose(" dataId=%s,", result.query_args["dataId"].byName())
        if result.query_args["where"]:
            builder.log.verbose(" where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose(" datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose(" collections=%s,", list(result.query_args["collections"]))
        with builder.butler.registry.queryDataIds(**result.query_args).materialize() as common_data_ids:
            builder.log.debug("Expanding data IDs.")
            result.common_data_ids = common_data_ids.expanded()
            yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to
        explain why the initial data ID query returned no rows.

        Parameters
        ----------
        log : `lsst.utils.logging.LsstLogAdapter`
            Logger to send messages to.
        """
        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )
        # We could just repr() the query_args dict to get something
        # the user could make sense of, but it's friendlier to
        # put these args in an easier-to-reconstruct equivalent form
        # so they can read it more easily and copy and paste into
        # a Python terminal.
        log.critical(" dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical(" dataId=%s,", self.query_args["dataId"].byName())
        if self.query_args["where"]:
            log.critical(" where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical(" datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical(" collections=%s,", list(self.query_args["collections"]))