Coverage for python/lsst/pipe/base/quantum_graph_builder.py: 25%
375 statements
coverage.py v7.4.4, created at 2024-04-19 11:28 +0000
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The base class for the QuantumGraph-generation algorithm and various
helper classes.
"""

from __future__ import annotations

__all__ = (
    "QuantumGraphBuilder",
    "ExistingDatasets",
    "QuantumGraphBuilderError",
    "OutputExistsError",
    "PrerequisiteMissingError",
)

import dataclasses
from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, final

from deprecated.sphinx import deprecated
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    Quantum,
)
from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
from lsst.utils.logging import LsstLogAdapter, getLogger
from lsst.utils.timer import timeMethod

from . import automatic_connection_constants as acc
from ._status import NoWorkFound
from ._task_metadata import TaskMetadata
from .connections import AdjustQuantumHelper
from .graph import QuantumGraph
from .pipeline_graph import PipelineGraph, TaskNode
from .prerequisite_helpers import PrerequisiteInfo, SkyPixBoundsBuilder, TimespanBuilder
from .quantum_graph_skeleton import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphSkeleton,
    QuantumKey,
    TaskInitKey,
)

if TYPE_CHECKING:
    from .pipeline import TaskDef


class QuantumGraphBuilderError(Exception):
    """Base class for exceptions generated by QuantumGraphBuilder."""

    pass


# TODO: remove class and switch downstream inheritance to just
# QuantumGraphBuilderError on DM-40443.
@deprecated(
    "Deprecated in favor of QuantumGraphBuilderError and will be removed after v27.",
    version="v27.0",
    category=FutureWarning,
)
class GraphBuilderError(QuantumGraphBuilderError):
    """Backwards-compatibility near-alias for QuantumGraphBuilderError."""

    pass


# Inherit from backwards-compatibility alias for backwards-compatibility.
class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist."""

    pass


# Inherit from backwards-compatibility alias for backwards-compatibility.
class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist."""

    pass


class InitInputMissingError(QuantumGraphBuilderError):
    """Exception generated when an init-input dataset does not exist."""

    pass


class QuantumGraphBuilder(ABC):
    """An abstract base class for building `QuantumGraph` objects from a
    pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    input_collections : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for overall-input datasets. If not provided,
        ``butler.collections`` is used (and must not be empty).
    output_run : `str`, optional
        Output `~lsst.daf.butler.CollectionType.RUN` collection. If not
        provided, ``butler.run`` is used (and must not be `None`).
    skip_existing_in : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for outputs that already exist for the purpose
        of skipping quanta that have already been run.
    clobber : `bool`, optional
        Whether to raise if predicted outputs already exist in ``output_run``
        (not including those quanta that would be skipped because they've
        already been run). This never actually clobbers outputs; it just
        informs the graph generation algorithm whether execution will run with
        clobbering enabled. This is ignored if ``output_run`` does not exist.

    Notes
    -----
    Constructing a `QuantumGraphBuilder` will run queries for existing
    datasets with empty data IDs (including but not limited to init inputs
    and outputs), in addition to resolving the given pipeline graph and
    testing for existence of the ``output_run`` collection.

    The `build` method splits the pipeline graph into independent subgraphs,
    then calls the abstract method `process_subgraph` on each, to allow
    concrete implementations to populate the rough graph structure (the
    `~quantum_graph_skeleton.QuantumGraphSkeleton` class) and search for
    existing datasets (further populating the builder's `existing_datasets`
    struct). The `build` method then:

    - assembles `lsst.daf.butler.Quantum` instances from all data IDs in the
      skeleton;
    - looks for existing outputs found in ``skip_existing_in`` to see if any
      quanta should be skipped;
    - calls `PipelineTaskConnections.adjustQuantum` on all quanta, adjusting
      downstream quanta appropriately when preliminary predicted outputs are
      rejected (pruning nodes that will not have the inputs they need to run);
    - attaches datastore records and registry dataset types to the graph.

    In addition to implementing `process_subgraph`, derived classes are
    generally expected to add new construction keyword-only arguments to
    control the data IDs of the quantum graph, while forwarding all of the
    arguments defined in the base class to `super`.
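
    Examples
    --------
    A sketch of typical driver code, assuming a hypothetical concrete
    subclass ``MyQuantumGraphBuilder`` and a `~pipeline_graph.PipelineGraph`
    already in hand; the repository path and collection names below are
    placeholders::

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", collections=["my/input/collections"])
        builder = MyQuantumGraphBuilder(
            pipeline_graph,
            butler,
            output_run="u/someone/demo",
        )
        quantum_graph = builder.build()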
175 """
177 def __init__(
178 self,
179 pipeline_graph: PipelineGraph,
180 butler: Butler,
181 *,
182 input_collections: Sequence[str] | None = None,
183 output_run: str | None = None,
184 skip_existing_in: Sequence[str] = (),
185 clobber: bool = False,
186 ):
187 self.log = getLogger(__name__)
188 self.metadata = TaskMetadata()
189 self._pipeline_graph = pipeline_graph
190 self.butler = butler
191 if input_collections is None:
192 input_collections = butler.collections
193 if not input_collections:
194 raise ValueError("No input collections provided.")
195 self.input_collections = input_collections
196 if output_run is None:
197 output_run = butler.run
198 if not output_run:
199 raise ValueError("No output RUN collection provided.")
200 self.output_run = output_run
201 self.skip_existing_in = skip_existing_in
202 self.empty_data_id = DataCoordinate.make_empty(butler.dimensions)
203 self.clobber = clobber
204 # See whether the output run already exists.
205 self.output_run_exists = False
206 try:
207 if self.butler.registry.getCollectionType(self.output_run) is not CollectionType.RUN:
208 raise RuntimeError(f"{self.output_run!r} is not a RUN collection.")
209 self.output_run_exists = True
210 except MissingCollectionError:
211 # If the run doesn't exist we never need to clobber. This is not
212 # an error so you can run with clobber=True the first time you
213 # attempt some processing as well as all subsequent times, instead
214 # of forcing the user to make the first attempt different.
215 self.clobber = False
216 # We need to know whether the skip_existing_in collection sequence
217 # starts with the output run collection, as an optimization to avoid
218 # queries later.
219 try:
220 skip_existing_in_flat = self.butler.registry.queryCollections(
221 self.skip_existing_in, flattenChains=True
222 )
223 except MissingCollectionError:
224 skip_existing_in_flat = []
225 if not skip_existing_in_flat:
226 self.skip_existing_in = []
227 if self.skip_existing_in and self.output_run_exists:
228 self.skip_existing_starts_with_output_run = self.output_run == skip_existing_in_flat[0]
229 else:
230 self.skip_existing_starts_with_output_run = False
231 self.existing_datasets = ExistingDatasets()
232 try:
233 packages_storage_class = butler.get_dataset_type(acc.PACKAGES_INIT_OUTPUT_NAME).storageClass_name
234 except MissingDatasetTypeError:
235 packages_storage_class = acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS
236 self._global_init_output_types = {
237 acc.PACKAGES_INIT_OUTPUT_NAME: DatasetType(
238 acc.PACKAGES_INIT_OUTPUT_NAME,
239 self.universe.empty,
240 packages_storage_class,
241 )
242 }
243 with self.butler.registry.caching_context():
244 self._pipeline_graph.resolve(self.butler.registry)
245 self._find_empty_dimension_datasets()
246 self.prerequisite_info = {
247 task_node.label: PrerequisiteInfo(task_node, self._pipeline_graph)
248 for task_node in pipeline_graph.tasks.values()
249 }

    log: LsstLogAdapter
    """Logger to use for all quantum-graph generation messages.

    General and per-task status messages should be logged at `~logging.INFO`
    level or higher, per-dataset-type status messages should be logged at
    `~lsst.utils.logging.VERBOSE` or higher, and per-data-ID status messages
    should be logged at `~logging.DEBUG` or higher.
    """

    metadata: TaskMetadata
    """Metadata to store in the QuantumGraph.

    The `TaskMetadata` class is used here primarily in order to enable
    resource-usage collection with the `lsst.utils.timer.timeMethod` decorator.
    """

    butler: Butler
    """Client for the data repository.

    Should be read-only.
    """

    input_collections: Sequence[str]
    """Collections to search for overall-input datasets.
    """

    output_run: str
    """Output `~lsst.daf.butler.CollectionType.RUN` collection.
    """

    skip_existing_in: Sequence[str]
    """Collections to search for outputs that already exist for the purpose
    of skipping quanta that have already been run.
    """

    clobber: bool
    """Whether to raise if predicted outputs already exist in ``output_run``.

    This never actually clobbers outputs; it just informs the graph generation
    algorithm whether execution will run with clobbering enabled. This is
    always `False` if `output_run_exists` is `False`.
    """

    empty_data_id: DataCoordinate
    """An empty data ID in the data repository's dimension universe.
    """

    output_run_exists: bool
    """Whether the output run exists in the data repository already.
    """

    skip_existing_starts_with_output_run: bool
    """Whether the `skip_existing_in` sequence begins with `output_run`.

    If this is true, any dataset found in `output_run` can be used to
    short-circuit queries in `skip_existing_in`.
    """

    existing_datasets: ExistingDatasets
    """Struct holding datasets that have already been found in the data
    repository.

    This is updated in-place as the `QuantumGraph` generation algorithm
    proceeds.
    """

    prerequisite_info: Mapping[str, PrerequisiteInfo]
    """Helper objects for finding prerequisite inputs, organized by task label.

    Subclasses that find prerequisites should remove the
    covered `~prerequisite_helpers.PrerequisiteFinder` objects from this
    attribute.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """Definitions of all data dimensions."""
        return self.butler.dimensions

    @final
    @timeMethod
    def build(
        self, metadata: Mapping[str, Any] | None = None, attach_datastore_records: bool = True
    ) -> QuantumGraph:
        """Build the quantum graph.

        Parameters
        ----------
        metadata : `~collections.abc.Mapping`, optional
            Flexible metadata to add to the quantum graph.
        attach_datastore_records : `bool`, optional
            Whether to include datastore records in the graph. Required for
            `lsst.daf.butler.QuantumBackedButler` execution.

        Returns
        -------
        quantum_graph : `QuantumGraph`
            DAG describing processing to be performed.

        Notes
        -----
        External code is expected to construct a `QuantumGraphBuilder` and then
        call this method exactly once. See class documentation for details on
        what it does.
        """
        with self.butler.registry.caching_context():
            full_skeleton = QuantumGraphSkeleton(self._pipeline_graph.tasks)
            subgraphs = list(self._pipeline_graph.split_independent())
            for i, subgraph in enumerate(subgraphs):
                self.log.info(
                    "Processing pipeline subgraph %d of %d with %d task(s).",
                    i + 1,
                    len(subgraphs),
                    len(subgraph.tasks),
                )
                self.log.verbose("Subgraph tasks: [%s]", ", ".join(label for label in subgraph.tasks))
                subgraph_skeleton = self.process_subgraph(subgraph)
                full_skeleton.update(subgraph_skeleton)
            # Loop over tasks. The pipeline graph must be topologically
            # sorted, so a quantum is only processed after any quantum that
            # provides its inputs has been processed.
            for task_node in self._pipeline_graph.tasks.values():
                self._resolve_task_quanta(task_node, full_skeleton)
            # Add global init-outputs to the skeleton.
            for dataset_type in self._global_init_output_types.values():
                dataset_key = full_skeleton.add_dataset_node(
                    dataset_type.name, self.empty_data_id, is_global_init_output=True
                )
                ref = self.existing_datasets.outputs_in_the_way.get(dataset_key)
                if ref is None:
                    ref = DatasetRef(dataset_type, self.empty_data_id, run=self.output_run)
                full_skeleton[dataset_key]["ref"] = ref
            # Remove dataset nodes with no edges that are not global init
            # outputs, which are generally overall-inputs whose original quanta
            # end up skipped or with no work to do (we can't remove these along
            # with the quanta because no quantum knows if it's the only
            # consumer).
            full_skeleton.remove_orphan_datasets()
            if attach_datastore_records:
                self._attach_datastore_records(full_skeleton)
            # TODO: initialize most metadata here instead of in ctrl_mpexec.
            if metadata is None:
                metadata = {}
            return self._construct_quantum_graph(full_skeleton, metadata)

    @abstractmethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        """Build the rough structure for an independent subset of the
        `QuantumGraph` and query for relevant existing datasets.

        Parameters
        ----------
        subgraph : `.pipeline_graph.PipelineGraph`
            Subset of the pipeline graph that should be processed by this call.
            This is always resolved and topologically sorted. It should not be
            modified.

        Returns
        -------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Class representing an initial quantum graph. See
            `quantum_graph_skeleton.QuantumGraphSkeleton` docs for details.
            After this is returned, the object may be modified in-place in
            unspecified ways.

        Notes
        -----
        In addition to returning a
        `quantum_graph_skeleton.QuantumGraphSkeleton`, this method should
        populate the `existing_datasets` structure by querying for all relevant
        datasets with non-empty data IDs (those with empty data IDs will
        already be present). In particular:

        - `~ExistingDatasets.inputs` must always be populated with all
          overall-input datasets (but not prerequisites), by querying
          `input_collections`;
        - `~ExistingDatasets.outputs_for_skip` must be populated with any
          intermediate or output datasets present in `skip_existing_in` (it
          can be ignored if `skip_existing_in` is empty);
        - `~ExistingDatasets.outputs_in_the_way` must be populated with any
          intermediate or output datasets present in `output_run`, if
          `output_run_exists` (it can be ignored if `output_run_exists` is
          `False`). Note that the presence of such datasets is not
          automatically an error, even if `clobber` is `False`, as these may
          be outputs of quanta that will be skipped;
        - `~ExistingDatasets.inputs` must be populated with all
          prerequisite-input datasets that were included in the skeleton, by
          querying `input_collections` (not all prerequisite inputs need to be
          included in the skeleton, but the base class can only use per-quantum
          queries to find them, and that can be slow when there are many
          quanta).

        Dataset types should never be components and should always use the
        "common" storage class definition in `pipeline_graph.DatasetTypeNode`
        (which is the data repository definition when the dataset type is
        registered).
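
        Examples
        --------
        A minimal sketch of an implementation, assuming a pipeline in which
        every task and dataset type has empty dimensions, so every data ID is
        ``self.empty_data_id`` and all overall inputs are found by the base
        class; it also assumes a ``QuantumGraphSkeleton.add_quantum_node``
        method for adding quantum nodes (real implementations must also run
        data ID queries and populate `existing_datasets`)::

            def process_subgraph(self, subgraph):
                skeleton = QuantumGraphSkeleton(subgraph.tasks)
                for task_node in subgraph.tasks.values():
                    # One quantum per task, with an empty data ID.
                    quantum_key = skeleton.add_quantum_node(
                        task_node.label, self.empty_data_id
                    )
                    # Connect the quantum to dataset nodes for all of its
                    # regular inputs and predicted outputs.
                    for read_edge in task_node.iter_all_inputs():
                        dataset_key = skeleton.add_dataset_node(
                            read_edge.parent_dataset_type_name, self.empty_data_id
                        )
                        skeleton.add_input_edge(quantum_key, dataset_key)
                    for write_edge in task_node.iter_all_outputs():
                        dataset_key = skeleton.add_dataset_node(
                            write_edge.parent_dataset_type_name, self.empty_data_id
                        )
                        skeleton.add_output_edge(quantum_key, dataset_key)
                return skeleton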
        """
        raise NotImplementedError()

    @final
    @timeMethod
    def _resolve_task_quanta(self, task_node: TaskNode, skeleton: QuantumGraphSkeleton) -> None:
        """Process the quanta for one task in a skeleton graph to skip those
        that have already completed and adjust those that request it.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.

        Notes
        -----
        This method modifies ``skeleton`` in-place in several ways:

        - It adds a "ref" attribute to dataset nodes, using the contents of
          `existing_datasets`. This ensures producing and consuming tasks
          start from the same `DatasetRef`.
        - It adds "inputs", "outputs", and "init_inputs" attributes to the
          quantum nodes, holding the same `NamedKeyDict` objects needed to
          construct actual `Quantum` instances.
        - It removes quantum nodes that are to be skipped because their outputs
          already exist in `skip_existing_in`. It also removes their outputs
          from `ExistingDatasets.outputs_in_the_way`.
        - It adds prerequisite dataset nodes and edges that connect them to the
          quanta that consume them.
        - It removes quantum nodes whose
          `~PipelineTaskConnections.adjustQuantum` calls raise `NoWorkFound` or
          predict no outputs;
        - It removes the nodes of output datasets that are "adjusted away".
        - It removes the edges of input datasets that are "adjusted away".

        The difference between how adjusted inputs and outputs are handled
        reflects the fact that many quanta can share the same input, but only
        one produces each output. This can lead to the graph having
        superfluous isolated nodes after processing is complete, but these
        should only be removed after all the quanta from all tasks have been
        processed.
        """
        # Extract the helper object for the prerequisite inputs of this task,
        # and tell it to prepare to construct skypix bounds and timespans for
        # each quantum (these will automatically do nothing if nothing needs
        # those bounds).
        task_prerequisite_info = self.prerequisite_info[task_node.label]
        task_prerequisite_info.update_bounds()
        # Loop over all quanta for this task, remembering the ones we've
        # gotten rid of.
        skipped_quanta = []
        no_work_quanta = []
        for quantum_key in skeleton.get_quanta(task_node.label):
            if self._skip_quantum_if_metadata_exists(task_node, quantum_key, skeleton):
                skipped_quanta.append(quantum_key)
                continue
            quantum_data_id = skeleton[quantum_key]["data_id"]
            skypix_bounds_builder = task_prerequisite_info.bounds.make_skypix_bounds_builder(quantum_data_id)
            timespan_builder = task_prerequisite_info.bounds.make_timespan_builder(quantum_data_id)
            adjusted_outputs = self._gather_quantum_outputs(
                task_node, quantum_key, skeleton, skypix_bounds_builder, timespan_builder
            )
            adjusted_inputs = self._gather_quantum_inputs(
                task_node,
                quantum_key,
                skeleton,
                task_prerequisite_info,
                skypix_bounds_builder,
                timespan_builder,
            )
            # Give the task's Connections class an opportunity to remove
            # some inputs, or complain if they are unacceptable. This will
            # raise if one of the check conditions is not met, which is the
            # intended behavior.
            helper = AdjustQuantumHelper(inputs=adjusted_inputs, outputs=adjusted_outputs)
            try:
                helper.adjust_in_place(task_node.get_connections(), task_node.label, quantum_data_id)
            except NoWorkFound as err:
                # Do not generate this quantum; it would not produce any
                # outputs. Remove it and all of the outputs it might have
                # produced from the skeleton.
                try:
                    _, connection_name, _ = err.args
                    details = f"not enough datasets for connection {connection_name}."
                except ValueError:
                    details = str(err)
                self.log.debug(
                    "No work found for quantum %s of task %s: %s",
                    quantum_key.data_id_values,
                    quantum_key.task_label,
                    details,
                )
                no_work_quanta.append(quantum_key)
                continue
            if helper.outputs_adjusted:
                if not any(adjusted_refs for adjusted_refs in helper.outputs.values()):
                    # No outputs also means we don't generate this quantum.
                    self.log.debug(
                        "No outputs predicted for quantum %s of task %s.",
                        quantum_key.data_id_values,
                        quantum_key.task_label,
                    )
                    no_work_quanta.append(quantum_key)
                    continue
                # Remove output nodes that were not retained by
                # adjustQuantum.
                skeleton.remove_dataset_nodes(
                    self._find_removed(skeleton.iter_outputs_of(quantum_key), helper.outputs)
                )
            if helper.inputs_adjusted:
                if not any(bool(adjusted_refs) for adjusted_refs in helper.inputs.values()):
                    raise QuantumGraphBuilderError(
                        f"adjustQuantum implementation for {task_node.label}@{quantum_key.data_id_values} "
                        "returned outputs but no inputs."
                    )
                # Remove input dataset edges that were not retained by
                # adjustQuantum. We can't remove the input dataset nodes
                # because some other quantum might still want them.
                skeleton.remove_input_edges(
                    quantum_key, self._find_removed(skeleton.iter_inputs_of(quantum_key), helper.inputs)
                )
            # Save the adjusted inputs and outputs to the quantum node's
            # state so we don't have to regenerate those data structures
            # from the graph.
            skeleton[quantum_key]["inputs"] = helper.inputs
            skeleton[quantum_key]["outputs"] = helper.outputs
        for no_work_quantum in no_work_quanta:
            skeleton.remove_quantum_node(no_work_quantum, remove_outputs=True)
        for skipped_quantum in skipped_quanta:
            skeleton.remove_quantum_node(skipped_quantum, remove_outputs=False)
        remaining_quanta = skeleton.get_quanta(task_node.label)
        self._resolve_task_init(task_node, skeleton, bool(skipped_quanta))
        message_terms = []
        if no_work_quanta:
            message_terms.append(f"{len(no_work_quanta)} had no work to do")
        if skipped_quanta:
            message_terms.append(f"{len(skipped_quanta)} previously succeeded")
        message_parenthetical = f" ({', '.join(message_terms)})" if message_terms else ""
        if remaining_quanta:
            self.log.info(
                "Generated %s for task %s%s.",
                _quantum_or_quanta(len(remaining_quanta)),
                task_node.label,
                message_parenthetical,
            )
        else:
            self.log.info(
                "Dropping task %s because no quanta remain%s.", task_node.label, message_parenthetical
            )
            skeleton.remove_task(task_node.label)

    def _skip_quantum_if_metadata_exists(
        self, task_node: TaskNode, quantum_key: QuantumKey, skeleton: QuantumGraphSkeleton
    ) -> bool:
        """Identify and drop quanta that should be skipped because their
        metadata datasets already exist.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.

        Returns
        -------
        skipped : `bool`
            `True` if the quantum is being skipped and has been removed from
            the graph, `False` otherwise.

        Notes
        -----
        If the metadata dataset for this quantum exists in
        `ExistingDatasets.outputs_for_skip`, the quantum will be skipped. This
        causes the quantum node to be removed from the graph. Dataset nodes
        that were previously the outputs of this quantum will have their "ref"
        attribute set from `ExistingDatasets.outputs_for_skip`, or will be
        removed if there is no such dataset there. Any output dataset in
        `ExistingDatasets.outputs_in_the_way` will be removed.
        """
        metadata_dataset_key = DatasetKey(
            task_node.metadata_output.parent_dataset_type_name, quantum_key.data_id_values
        )
        if metadata_dataset_key in self.existing_datasets.outputs_for_skip:
            # This quantum's metadata is already present in the
            # skip_existing_in collections; we'll skip it. But the presence of
            # the metadata dataset doesn't guarantee that all of the other
            # outputs we predicted are present; we have to check.
            for output_dataset_key in list(skeleton.iter_outputs_of(quantum_key)):
                if (
                    output_ref := self.existing_datasets.outputs_for_skip.get(output_dataset_key)
                ) is not None:
                    # Populate the skeleton graph's node attributes
                    # with the existing DatasetRef, just like a
                    # predicted output of a non-skipped quantum.
                    skeleton[output_dataset_key]["ref"] = output_ref
                else:
                    # Remove this dataset from the skeleton graph,
                    # because the quantum that would have produced it
                    # is being skipped and it doesn't already exist.
                    skeleton.remove_dataset_nodes([output_dataset_key])
                # If this dataset was "in the way" (i.e. already in the
                # output run), it isn't anymore.
                self.existing_datasets.outputs_in_the_way.pop(output_dataset_key, None)
            # Removing the quantum node from the graph will happen outside
            # this function.
            return True
        return False

    @final
    def _gather_quantum_outputs(
        self,
        task_node: TaskNode,
        quantum_key: QuantumKey,
        skeleton: QuantumGraphSkeleton,
        skypix_bounds_builder: SkyPixBoundsBuilder,
        timespan_builder: TimespanBuilder,
    ) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Collect or generate output datasets for a preliminary quantum and
        put them in the form used by `~lsst.daf.butler.Quantum` and
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.
        skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder`
            An object that accumulates the appropriate spatial bounds for a
            quantum.
        timespan_builder : `~prerequisite_helpers.TimespanBuilder`
            An object that accumulates the appropriate timespan for a quantum.

        Returns
        -------
        outputs : `~lsst.daf.butler.NamedKeyDict` [ \
                `~lsst.daf.butler.DatasetType`, `list` [ \
                `~lsst.daf.butler.DatasetRef` ] ]
            All outputs to the task, using the storage class and components
            defined by the task's own connections.

        Notes
        -----
        This first looks for outputs already present in the `output_run` by
        looking in `ExistingDatasets.outputs_in_the_way`; if it finds something
        and `clobber` is `True`, it uses that ref (it's not ideal that both the
        original dataset and its replacement will have the same UUID, but we
        don't have space in the quantum graph for two UUIDs, and we need the
        datastore records of the original there). If `clobber` is `False`,
        `OutputExistsError` is raised. If there is no output already present,
        a new one with a random UUID is generated. In all cases the "ref"
        attribute of the dataset node in the skeleton is set.
        """
        outputs_by_type: dict[str, list[DatasetRef]] = {}
        dataset_key: DatasetKey
        for dataset_key in skeleton.iter_outputs_of(quantum_key):
            dataset_data_id = skeleton[dataset_key]["data_id"]
            dataset_type_node = self._pipeline_graph.dataset_types[dataset_key.parent_dataset_type_name]
            if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None:
                ref = DatasetRef(dataset_type_node.dataset_type, dataset_data_id, run=self.output_run)
            elif not self.clobber:
                # We intentionally raise here, before running adjustQuantum,
                # because it'd be weird if we left an old potential output of a
                # task sitting there in the output collection, just because the
                # task happened to not actually produce it.
                raise OutputExistsError(
                    f"Potential output dataset {ref} already exists in the output run "
                    f"{self.output_run}, but clobbering outputs was not expected to be necessary."
                )
            skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id)
            timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id)
            skeleton[dataset_key]["ref"] = ref
            outputs_by_type.setdefault(dataset_key.parent_dataset_type_name, []).append(ref)
        adapted_outputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for write_edge in task_node.iter_all_outputs():
            dataset_type_node = self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name]
            edge_dataset_type = write_edge.adapt_dataset_type(dataset_type_node.dataset_type)
            adapted_outputs[edge_dataset_type] = [
                write_edge.adapt_dataset_ref(ref)
                for ref in sorted(outputs_by_type.get(write_edge.parent_dataset_type_name, []))
            ]
        return adapted_outputs

    @final
    def _gather_quantum_inputs(
        self,
        task_node: TaskNode,
        quantum_key: QuantumKey,
        skeleton: QuantumGraphSkeleton,
        task_prerequisite_info: PrerequisiteInfo,
        skypix_bounds_builder: SkyPixBoundsBuilder,
        timespan_builder: TimespanBuilder,
    ) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Collect input datasets for a preliminary quantum and put them in the
        form used by `~lsst.daf.butler.Quantum` and
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.
        task_prerequisite_info : `~prerequisite_helpers.PrerequisiteInfo`
            Helper object for finding prerequisite inputs for this task.
        skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder`
            An object that accumulates the appropriate spatial bounds for a
            quantum.
        timespan_builder : `~prerequisite_helpers.TimespanBuilder`
            An object that accumulates the appropriate timespan for a quantum.

        Returns
        -------
        inputs : `~lsst.daf.butler.NamedKeyDict` [ \
                `~lsst.daf.butler.DatasetType`, `list` [ \
                `~lsst.daf.butler.DatasetRef` ] ]
            All regular and prerequisite inputs to the task, using the storage
            class and components defined by the task's own connections.

        Notes
        -----
        On return, the dataset nodes that represent inputs to this quantum will
        either have their "ref" attribute set (using the common dataset type,
        not the task-specific one) or will be removed from the graph.

        For regular inputs, usually an existing "ref" (corresponding to an
        output of another quantum) will be found and left unchanged. When
        there is no existing "ref" attribute, `ExistingDatasets.inputs` is
        searched next; if there is nothing there, the input will be removed.

        Prerequisite inputs are always queried for directly here (delegating to
        `_find_prerequisite_inputs`). They are never produced by other tasks,
        and cannot in general be queried for in advance when
        `ExistingDatasets.inputs` is populated.
        """
        quantum_data_id = skeleton[quantum_key]["data_id"]
        inputs_by_type: dict[str, set[DatasetRef]] = {}
        dataset_key: DatasetKey | PrerequisiteDatasetKey
        # Process inputs already present in the skeleton - this should include
        # all regular inputs (including intermediates) and may include some
        # prerequisites.
        for dataset_key in list(skeleton.iter_inputs_of(quantum_key)):
            if (ref := skeleton[dataset_key].get("ref")) is None:
                # This dataset is an overall input - if it was an intermediate,
                # we would have already either removed the node or set the
                # "ref" attribute when processing its producing quantum - and
                # this is the first time we're trying to resolve it.
                if (ref := self.existing_datasets.inputs.get(dataset_key)) is None:
                    # It also doesn't exist in the input collections, so we
                    # remove its node in the skeleton graph (so other consumers
                    # won't have to check for it).
                    skeleton.remove_dataset_nodes([dataset_key])
                    continue
                skeleton[dataset_key]["ref"] = ref
            inputs_by_type.setdefault(dataset_key.parent_dataset_type_name, set()).add(ref)
            skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId)
            timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId)
        # Query for any prerequisites not handled by process_subgraph. Note
        # that these were not already in the skeleton graph, so we add them
        # now.
        skypix_bounds = skypix_bounds_builder.finish()
        timespan = timespan_builder.finish()
        for finder in task_prerequisite_info.finders.values():
            inputs_for_type = inputs_by_type.setdefault(finder.dataset_type_node.name, set())
            dataset_keys = []
            for ref in finder.find(
                self.butler, self.input_collections, quantum_data_id, skypix_bounds, timespan
            ):
                dataset_key = skeleton.add_prerequisite_node(ref.datasetType.name, ref=ref)
                dataset_keys.append(dataset_key)
                inputs_for_type.add(ref)
            skeleton.add_input_edges(quantum_key, dataset_keys)
        adapted_inputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for read_edge in task_node.iter_all_inputs():
            dataset_type_node = self._pipeline_graph.dataset_types[read_edge.parent_dataset_type_name]
            edge_dataset_type = read_edge.adapt_dataset_type(dataset_type_node.dataset_type)
            if (current_dataset_type := adapted_inputs.keys().get(edge_dataset_type.name)) is None:
                adapted_inputs[edge_dataset_type] = [
                    read_edge.adapt_dataset_ref(ref)
                    for ref in sorted(inputs_by_type.get(read_edge.parent_dataset_type_name, frozenset()))
                ]
            elif current_dataset_type != edge_dataset_type:
                raise NotImplementedError(
                    f"Task {task_node.label!r} has {edge_dataset_type.name!r} as an input via "
                    "two different connections, with two different storage class overrides. "
                    "This is not yet supported due to limitations in the Quantum data structure."
                )
            # If neither the `if` nor the `elif` above match, it means
            # multiple input connections have exactly the same dataset
            # type, and hence nothing to do after the first one.
        return adapted_inputs

    @final
    def _resolve_task_init(
        self, task_node: TaskNode, skeleton: QuantumGraphSkeleton, has_skipped_quanta: bool
    ) -> None:
        """Add init-input and init-output dataset nodes and edges for a task to
        the skeleton.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Pipeline graph description of the task.
        skeleton : `QuantumGraphSkeleton`
            In-progress quantum graph data structure to update in-place.
        has_skipped_quanta : `bool`
            Whether any of this task's quanta were skipped because they had
            already succeeded.
        """
        quanta = skeleton.get_quanta(task_node.label)
        task_init_key = TaskInitKey(task_node.label)
        if quanta:
            adapted_inputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict()
            # Process init-inputs.
            input_keys: list[DatasetKey] = []
            for read_edge in task_node.init.iter_all_inputs():
                dataset_key = skeleton.add_dataset_node(
                    read_edge.parent_dataset_type_name, self.empty_data_id
                )
                skeleton.add_input_edge(task_init_key, dataset_key)
                if (ref := skeleton[dataset_key].get("ref")) is None:
                    try:
                        ref = self.existing_datasets.inputs[dataset_key]
                    except KeyError:
                        raise InitInputMissingError(
                            f"Overall init-input dataset {read_edge.parent_dataset_type_name!r} "
                            f"needed by task {task_node.label!r} not found in input collection(s) "
                            f"{self.input_collections}."
                        ) from None
                    skeleton[dataset_key]["ref"] = ref
                for quantum_key in skeleton.get_quanta(task_node.label):
                    skeleton.add_input_edge(quantum_key, dataset_key)
                input_keys.append(dataset_key)
                adapted_ref = read_edge.adapt_dataset_ref(ref)
                adapted_inputs[adapted_ref.datasetType] = adapted_ref
            # Save the quantum-adapted init inputs to each quantum, and add
            # skeleton edges connecting the init inputs to each quantum.
            for quantum_key in skeleton.get_quanta(task_node.label):
                skeleton[quantum_key]["init_inputs"] = adapted_inputs
            # Process init-outputs.
            adapted_outputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict()
            for write_edge in task_node.init.iter_all_outputs():
                dataset_key = skeleton.add_dataset_node(
                    write_edge.parent_dataset_type_name, self.empty_data_id
                )
                if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None:
                    ref = DatasetRef(
                        self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name].dataset_type,
                        self.empty_data_id,
                        run=self.output_run,
                    )
                skeleton[dataset_key]["ref"] = ref
                skeleton.add_output_edge(task_init_key, dataset_key)
                adapted_ref = write_edge.adapt_dataset_ref(ref)
                adapted_outputs[adapted_ref.datasetType] = adapted_ref
            skeleton[task_init_key]["inputs"] = adapted_inputs
            skeleton[task_init_key]["outputs"] = adapted_outputs
        elif has_skipped_quanta:
            # No quanta remain for this task, but at least one quantum was
            # skipped because its outputs were present in the skip_existing_in
            # collections. This means all init outputs should be present in
            # the skip_existing_in collections, too, and we need to put those
            # refs in the graph.
            for write_edge in task_node.init.iter_all_outputs():
                dataset_key = skeleton.add_dataset_node(
                    write_edge.parent_dataset_type_name, self.empty_data_id
                )
                if (ref := self.existing_datasets.outputs_for_skip.get(dataset_key)) is None:
                    raise InitInputMissingError(
                        f"Init-output dataset {write_edge.parent_dataset_type_name!r} of skipped task "
                        f"{task_node.label!r} not found in skip-existing-in collection(s) "
                        f"{self.skip_existing_in}."
                    ) from None
                skeleton[dataset_key]["ref"] = ref
                # If this dataset was "in the way" (i.e. already in the output
                # run), it isn't anymore.
                self.existing_datasets.outputs_in_the_way.pop(dataset_key, None)
        # No quanta remain in this task, but none were skipped; this means
        # they all got pruned because of NoWorkFound conditions. This
        # dooms all downstream quanta to the same fate, so we don't bother
        # doing anything with the task's init-outputs, since nothing is
        # going to consume them.

    @final
    @timeMethod
    def _find_empty_dimension_datasets(self) -> None:
        """Query for all dataset types with no dimensions, updating
        `existing_datasets` in-place.

        This includes but is not limited to init inputs and init outputs.
        """
944 """
945 _, dataset_type_nodes = self._pipeline_graph.group_by_dimensions()[self.universe.empty.as_group()]
946 dataset_types = [node.dataset_type for node in dataset_type_nodes.values()]
947 dataset_types.extend(self._global_init_output_types.values())
948 for dataset_type in dataset_types:
949 key = DatasetKey(dataset_type.name, self.empty_data_id.required_values)
950 if (
951 self._pipeline_graph.producer_of(dataset_type.name) is None
952 and dataset_type.name not in self._global_init_output_types
953 ):
954 # Dataset type is an overall input; we always need to try to
955 # find these.
956 try:
957 ref = self.butler.find_dataset(dataset_type.name, collections=self.input_collections)
958 except MissingDatasetTypeError:
959 ref = None
960 if ref is not None:
961 self.existing_datasets.inputs[key] = ref
962 elif self.skip_existing_in:
963 # Dataset type is an intermediate or output; need to find these
964 # if only they're from previously executed quanta that we might
965 # skip...
966 try:
967 ref = self.butler.find_dataset(dataset_type.name, collections=self.skip_existing_in)
968 except MissingDatasetTypeError:
969 ref = None
970 if ref is not None:
971 self.existing_datasets.outputs_for_skip[key] = ref
972 if ref.run == self.output_run:
973 self.existing_datasets.outputs_in_the_way[key] = ref
974 if self.output_run_exists and not self.skip_existing_starts_with_output_run:
975 # ...or if they're in the way and would need to be clobbered
976 # (and we haven't already found them in the previous block).
977 try:
978 ref = self.butler.find_dataset(dataset_type.name, collections=[self.output_run])
979 except MissingDatasetTypeError:
980 ref = None
981 if ref is not None:
982 self.existing_datasets.outputs_in_the_way[key] = ref

    @final
    @timeMethod
    def _attach_datastore_records(self, skeleton: QuantumGraphSkeleton) -> None:
        """Add datastore records for all overall inputs to a preliminary
        quantum graph.

        Parameters
        ----------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph to update in place.

        Notes
        -----
        On return, all quantum nodes in the skeleton graph will have a
        "datastore_records" attribute that is a mapping from datastore name
        to `lsst.daf.butler.DatastoreRecordData`, as used by
        `lsst.daf.butler.Quantum`.
        """
        overall_inputs = skeleton.extract_overall_inputs()
        exported_records = self.butler._datastore.export_records(overall_inputs.values())
        for quantum_key in skeleton.iter_all_quanta():
            quantum_records = {}
            input_ids = {
                ref.id
                for dataset_key in skeleton.iter_inputs_of(quantum_key)
                if (ref := overall_inputs.get(dataset_key)) is not None
            }
            if input_ids:
                for datastore_name, records in exported_records.items():
                    matching_records = records.subset(input_ids)
                    if matching_records is not None:
                        quantum_records[datastore_name] = matching_records
            skeleton[quantum_key]["datastore_records"] = quantum_records

    @final
    @timeMethod
    def _construct_quantum_graph(
        self, skeleton: QuantumGraphSkeleton, metadata: Mapping[str, Any]
    ) -> QuantumGraph:
        """Construct a `QuantumGraph` object from the contents of a
        fully-processed `quantum_graph_skeleton.QuantumGraphSkeleton`.

        Parameters
        ----------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph. Must have "init_inputs", "inputs", and
            "outputs" attributes on all quantum nodes, as added by
            `_resolve_task_quanta`, as well as a "datastore_records" attribute
            as added by `_attach_datastore_records`.
        metadata : `Mapping`
            Flexible metadata to add to the graph.

        Returns
        -------
        quantum_graph : `QuantumGraph`
            DAG describing processing to be performed.
        """
        quanta: dict[TaskDef, set[Quantum]] = {}
        init_inputs: dict[TaskDef, Iterable[DatasetRef]] = {}
        init_outputs: dict[TaskDef, Iterable[DatasetRef]] = {}
        for task_def in self._pipeline_graph._iter_task_defs():
            if not skeleton.has_task(task_def.label):
                continue
            task_node = self._pipeline_graph.tasks[task_def.label]
            task_init_key = skeleton.get_task_init_node(task_def.label)
            init_inputs[task_def] = skeleton[task_init_key]["inputs"].values()
            init_outputs[task_def] = skeleton[task_init_key]["outputs"].values()
            quanta_for_task: set[Quantum] = set()
            for quantum_key in skeleton.get_quanta(task_node.label):
                node_state = skeleton[quantum_key]
                quanta_for_task.add(
                    Quantum(
                        taskName=task_node.task_class_name,
                        taskClass=task_node.task_class,
                        dataId=node_state["data_id"],
                        initInputs=node_state["init_inputs"],
                        inputs=node_state["inputs"],
                        outputs=node_state["outputs"],
                        datastore_records=node_state.get("datastore_records"),
                    )
                )
            quanta[task_def] = quanta_for_task

        registry_dataset_types: list[DatasetType] = [
            node.dataset_type for node in self._pipeline_graph.dataset_types.values()
        ]

        all_metadata = self.metadata.to_dict()
        all_metadata.update(metadata)
        return QuantumGraph(
            quanta,
            metadata=all_metadata,
            universe=self.universe,
            initInputs=init_inputs,
            initOutputs=init_outputs,
            globalInitOutputs=[skeleton[key]["ref"] for key in skeleton.global_init_outputs],
            registryDatasetTypes=registry_dataset_types,
        )

    @staticmethod
    @final
    def _find_removed(
        original: Iterable[DatasetKey | PrerequisiteDatasetKey],
        adjusted: NamedKeyMapping[DatasetType, Sequence[DatasetRef]],
    ) -> set[DatasetKey | PrerequisiteDatasetKey]:
        """Identify skeleton-graph dataset nodes that have been removed by
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        original : `~collections.abc.Iterable` [ `DatasetKey` or \
                `PrerequisiteDatasetKey` ]
            Identifiers for the dataset nodes that were the original neighbors
            (inputs or outputs) of a quantum.
        adjusted : `~lsst.daf.butler.NamedKeyMapping` [ \
                `~lsst.daf.butler.DatasetType`, \
                `~collections.abc.Sequence` [ `lsst.daf.butler.DatasetRef` ] ]
            Adjusted neighbors, in the form used by `lsst.daf.butler.Quantum`.

        Returns
        -------
        removed : `set` [ `DatasetKey` ]
            Datasets in ``original`` that have no counterpart in ``adjusted``.
        """
        result = set(original)
        for dataset_type, kept_refs in adjusted.items():
            parent_dataset_type_name, _ = DatasetType.splitDatasetTypeName(dataset_type.name)
            for kept_ref in kept_refs:
                # We don't know if this was a DatasetKey or a
                # PrerequisiteDatasetKey; just try both.
                result.discard(DatasetKey(parent_dataset_type_name, kept_ref.dataId.required_values))
                result.discard(PrerequisiteDatasetKey(parent_dataset_type_name, kept_ref.id.bytes))
        return result


@dataclasses.dataclass(eq=False, order=False)
class ExistingDatasets:
    """Struct that holds the results of dataset queries for
    `QuantumGraphBuilder`.
    """

    inputs: dict[DatasetKey | PrerequisiteDatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Overall-input datasets found in `QuantumGraphBuilder.input_collections`.

    This may include prerequisite inputs. It does include init-inputs.
    It does not include intermediates.
    """

    outputs_for_skip: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Output datasets found in `QuantumGraphBuilder.skip_existing_in`.

    It is unspecified whether this includes init-outputs; there is no
    concept of skipping at the init stage, so this is not expected to
    matter.
    """

    outputs_in_the_way: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Output datasets found in `QuantumGraphBuilder.output_run`.

    This includes regular outputs and init-outputs.
    """
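

# Illustrative sketch (an editorial addition, not part of the original
# module): one way a concrete builder's process_subgraph could populate
# ExistingDatasets.inputs for a single dataset type, assuming standard
# lsst.daf.butler.Registry.queryDatasets semantics.
def _example_find_overall_inputs(
    builder: QuantumGraphBuilder, dataset_type: DatasetType
) -> None:
    """Sketch of populating `ExistingDatasets.inputs` for one dataset type.

    This helper is hypothetical and exists only to illustrate the contract
    described in `QuantumGraphBuilder.process_subgraph`.
    """
    for ref in builder.butler.registry.queryDatasets(
        dataset_type, collections=builder.input_collections, findFirst=True
    ):
        # Key the result the same way QuantumGraphBuilder does, by parent
        # dataset type name and required data ID values.
        key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
        builder.existing_datasets.inputs[key] = ref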


def _quantum_or_quanta(n: int) -> str:
    """Correctly pluralize 'quantum' if needed."""
    return f"{n} quanta" if n != 1 else "1 quantum"