# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The base class for the QuantumGraph-generation algorithm and various
helper classes.
"""

from __future__ import annotations

__all__ = (
    "QuantumGraphBuilder",
    "ExistingDatasets",
    "QuantumGraphBuilderError",
    "OutputExistsError",
    "PrerequisiteMissingError",
)

import dataclasses
from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, final

from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    Quantum,
)
from lsst.daf.butler.core.named import NamedKeyDict, NamedKeyMapping
from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
from lsst.utils.logging import LsstLogAdapter, getLogger
from lsst.utils.timer import timeMethod

from . import automatic_connection_constants as acc
from ._status import NoWorkFound
from ._task_metadata import TaskMetadata
from .connections import AdjustQuantumHelper
from .graph import QuantumGraph
from .pipeline_graph import PipelineGraph, TaskNode
from .prerequisite_helpers import PrerequisiteInfo, SkyPixBoundsBuilder, TimespanBuilder
from .quantum_graph_skeleton import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphSkeleton,
    QuantumKey,
    TaskInitKey,
)

if TYPE_CHECKING:
    from .pipeline import TaskDef


class QuantumGraphBuilderError(Exception):
    """Base class for exceptions generated by QuantumGraphBuilder."""

    pass


class GraphBuilderError(QuantumGraphBuilderError):
    """Backwards-compatibility near-alias for QuantumGraphBuilderError."""

    pass


# Inherit from backwards-compatibility alias for backwards-compatibility.
class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist."""

    pass


# Inherit from backwards-compatibility alias for backwards-compatibility.
class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist."""

    pass


class InitInputMissingError(QuantumGraphBuilderError):
    """Exception generated when an init-input dataset does not exist."""

    pass


class QuantumGraphBuilder(ABC):
    """An abstract base class for building `QuantumGraph` objects from a
    pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    input_collections : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for overall-input datasets. If not provided,
        ``butler.collections`` is used (and must not be empty).
    output_run : `str`, optional
        Output `~lsst.daf.butler.CollectionType.RUN` collection. If not
        provided, ``butler.run`` is used (and must not be `None`).
    skip_existing_in : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for outputs that already exist for the purpose
        of skipping quanta that have already been run.
    clobber : `bool`, optional
        Whether execution will be permitted to overwrite predicted outputs
        that already exist in ``output_run`` (not including those quanta that
        would be skipped because they've already been run). If `False`,
        graph generation raises `OutputExistsError` when such outputs are
        found. This never actually clobbers outputs; it just informs the
        graph generation algorithm whether execution will run with clobbering
        enabled. This is ignored if ``output_run`` does not exist.

    Notes
    -----
    Constructing a `QuantumGraphBuilder` will run queries for existing
    datasets with empty data IDs (including but not limited to init inputs
    and outputs), in addition to resolving the given pipeline graph and
    testing for existence of the ``output_run`` collection.

    The `build` method splits the pipeline graph into independent subgraphs,
    then calls the abstract method `process_subgraph` on each, to allow
    concrete implementations to populate the rough graph structure (the
    `~quantum_graph_skeleton.QuantumGraphSkeleton` class) and search for
    existing datasets (further populating the builder's `existing_datasets`
    struct). The `build` method then:

    - assembles `lsst.daf.butler.Quantum` instances from all data IDs in the
      skeleton;
    - looks for existing outputs found in ``skip_existing_in`` to see if any
      quanta should be skipped;
    - calls `PipelineTaskConnections.adjustQuantum` on all quanta, adjusting
      downstream quanta appropriately when preliminary predicted outputs are
      rejected (pruning nodes that will not have the inputs they need to
      run);
    - attaches datastore records and registry dataset types to the graph.

    In addition to implementing `process_subgraph`, derived classes are
    generally expected to add new construction keyword-only arguments to
    control the data IDs of the quantum graph, while forwarding all of the
    arguments defined in the base class to `super`.
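
    Examples
    --------
    A minimal sketch of a concrete builder (the subclass name, its extra
    ``where`` argument, and the body of `process_subgraph` here are
    hypothetical, shown only to illustrate the expected structure)::

        class SketchQuantumGraphBuilder(QuantumGraphBuilder):
            def __init__(self, pipeline_graph, butler, *, where="", **kwargs):
                super().__init__(pipeline_graph, butler, **kwargs)
                self.where = where  # hypothetical data-ID constraint

            def process_subgraph(self, subgraph):
                skeleton = QuantumGraphSkeleton(subgraph.tasks)
                # Add quantum and dataset nodes/edges to ``skeleton`` and
                # update ``self.existing_datasets`` in place here, e.g. by
                # running data-ID queries constrained by ``self.where``.
                return skeleton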
    """

    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        input_collections: Sequence[str] | None = None,
        output_run: str | None = None,
        skip_existing_in: Sequence[str] = (),
        clobber: bool = False,
    ):
        self.log = getLogger(__name__)
        self.metadata = TaskMetadata()
        self._pipeline_graph = pipeline_graph
        self.butler = butler
        self._pipeline_graph.resolve(self.butler.registry)
        if input_collections is None:
            input_collections = butler.collections
        if not input_collections:
            raise ValueError("No input collections provided.")
        self.input_collections = input_collections
        if output_run is None:
            output_run = butler.run
        if not output_run:
            raise ValueError("No output RUN collection provided.")
        self.output_run = output_run
        self.skip_existing_in = skip_existing_in
        self.empty_data_id = DataCoordinate.makeEmpty(butler.dimensions)
        self.clobber = clobber
        # See whether the output run already exists.
        self.output_run_exists = False
        try:
            if self.butler.registry.getCollectionType(self.output_run) is not CollectionType.RUN:
                raise RuntimeError(f"{self.output_run!r} is not a RUN collection.")
            self.output_run_exists = True
        except MissingCollectionError:
            # If the run doesn't exist we never need to clobber. This is not
            # an error so you can run with clobber=True the first time you
            # attempt some processing as well as all subsequent times, instead
            # of forcing the user to make the first attempt different.
            self.clobber = False
        # We need to know whether the skip_existing_in collection sequence
        # starts with the output run collection, as an optimization to avoid
        # queries later.
        try:
            skip_existing_in_flat = self.butler.registry.queryCollections(
                self.skip_existing_in, flattenChains=True
            )
        except MissingCollectionError:
            skip_existing_in_flat = []
        if not skip_existing_in_flat:
            self.skip_existing_in = []
        if self.skip_existing_in and self.output_run_exists:
            self.skip_existing_starts_with_output_run = self.output_run == skip_existing_in_flat[0]
        else:
            self.skip_existing_starts_with_output_run = False
        self.existing_datasets = ExistingDatasets()
        try:
            packages_storage_class = butler.registry.getDatasetType(
                acc.PACKAGES_INIT_OUTPUT_NAME
            ).storageClass_name
        except MissingDatasetTypeError:
            packages_storage_class = acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS
        self._global_init_output_types = {
            acc.PACKAGES_INIT_OUTPUT_NAME: DatasetType(
                acc.PACKAGES_INIT_OUTPUT_NAME,
                self.universe.empty,
                packages_storage_class,
            )
        }
        self._find_empty_dimension_datasets()
        self.prerequisite_info = {
            task_node.label: PrerequisiteInfo(task_node, self._pipeline_graph)
            for task_node in pipeline_graph.tasks.values()
        }

    log: LsstLogAdapter
    """Logger to use for all quantum-graph generation messages.

    General and per-task status messages should be logged at `~logging.INFO`
    level or higher, per-dataset-type status messages should be logged at
    `~lsst.utils.logging.VERBOSE` or higher, and per-data-ID status messages
    should be logged at `~logging.DEBUG` or higher.
    """

    metadata: TaskMetadata
    """Metadata to store in the QuantumGraph.

    The `TaskMetadata` class is used here primarily in order to enable
    resource-usage collection with the `lsst.utils.timer.timeMethod`
    decorator.
    """

    butler: Butler
    """Client for the data repository.

    Should be read-only.
    """

    input_collections: Sequence[str]
    """Collections to search for overall-input datasets.
    """

    output_run: str
    """Output `~lsst.daf.butler.CollectionType.RUN` collection.
    """

    skip_existing_in: Sequence[str]
    """Collections to search for outputs that already exist for the purpose
    of skipping quanta that have already been run.
    """

    clobber: bool
    """Whether execution will be permitted to overwrite predicted outputs
    that already exist in ``output_run``.

    This never actually clobbers outputs; it just informs the graph
    generation algorithm whether execution will run with clobbering enabled.
    This is always `False` if `output_run_exists` is `False`.
    """

    empty_data_id: DataCoordinate
    """An empty data ID in the data repository's dimension universe.
    """

    output_run_exists: bool
    """Whether the output run exists in the data repository already.
    """

    skip_existing_starts_with_output_run: bool
    """Whether the `skip_existing_in` sequence begins with `output_run`.

    If this is true, any dataset found in `output_run` can be used to
    short-circuit queries in `skip_existing_in`.
    """

    existing_datasets: ExistingDatasets
    """Struct holding datasets that have already been found in the data
    repository.

    This is updated in-place as the `QuantumGraph` generation algorithm
    proceeds.
    """

    prerequisite_info: Mapping[str, PrerequisiteInfo]
    """Helper objects for finding prerequisite inputs, organized by task
    label.

    Subclasses that find prerequisites should remove the covered
    `~prerequisite_helpers.PrerequisiteFinder` objects from this attribute.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """Definitions of all data dimensions."""
        return self.butler.dimensions

    @final
    @timeMethod
    def build(self, metadata: Mapping[str, Any] | None = None) -> QuantumGraph:
        """Build the quantum graph.

        Parameters
        ----------
        metadata : `~collections.abc.Mapping`, optional
            Flexible metadata to add to the quantum graph.

        Returns
        -------
        quantum_graph : `QuantumGraph`
            DAG describing processing to be performed.

        Notes
        -----
        External code is expected to construct a `QuantumGraphBuilder` and
        then call this method exactly once. See class documentation for
        details on what it does.
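
        Examples
        --------
        Typical driver code, assuming some concrete subclass (the name
        ``MyQuantumGraphBuilder`` and its constructor arguments are
        hypothetical)::

            builder = MyQuantumGraphBuilder(pipeline_graph, butler)
            quantum_graph = builder.build(metadata={"comment": "an example"})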
        """
        full_skeleton = QuantumGraphSkeleton(self._pipeline_graph.tasks)
        subgraphs = list(self._pipeline_graph.split_independent())
        for i, subgraph in enumerate(subgraphs):
            self.log.info(
                "Processing pipeline subgraph %d of %d with %d task(s).",
                i + 1,
                len(subgraphs),
                len(subgraph.tasks),
            )
            self.log.verbose("Subgraph tasks: [%s]", ", ".join(label for label in subgraph.tasks))
            subgraph_skeleton = self.process_subgraph(subgraph)
            full_skeleton.update(subgraph_skeleton)
        # Loop over tasks. The pipeline graph must be topologically sorted,
        # so a quantum is only processed after any quantum that provides its
        # inputs has been processed.
        for task_node in self._pipeline_graph.tasks.values():
            self._resolve_task_quanta(task_node, full_skeleton)
        # Add global init-outputs to the skeleton.
        for dataset_type in self._global_init_output_types.values():
            dataset_key = full_skeleton.add_dataset_node(
                dataset_type.name, self.empty_data_id, is_global_init_output=True
            )
            ref = self.existing_datasets.outputs_in_the_way.get(dataset_key)
            if ref is None:
                ref = DatasetRef(dataset_type, self.empty_data_id, run=self.output_run)
            full_skeleton[dataset_key]["ref"] = ref
        # Remove dataset nodes with no edges that are not global init outputs,
        # which are generally overall-inputs whose original quanta end up
        # skipped or with no work to do (we can't remove these along with the
        # quanta because no quantum knows if it's the only consumer).
        full_skeleton.remove_orphan_datasets()
        self._attach_datastore_records(full_skeleton)
        # TODO: initialize most metadata here instead of in ctrl_mpexec.
        if metadata is None:
            metadata = {}
        return self._construct_quantum_graph(full_skeleton, metadata)

    @abstractmethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        """Build the rough structure for an independent subset of the
        `QuantumGraph` and query for relevant existing datasets.

        Parameters
        ----------
        subgraph : `.pipeline_graph.PipelineGraph`
            Subset of the pipeline graph that should be processed by this
            call. This is always resolved and topologically sorted. It
            should not be modified.

        Returns
        -------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Class representing an initial quantum graph. See
            `quantum_graph_skeleton.QuantumGraphSkeleton` docs for details.
            After this is returned, the object may be modified in-place in
            unspecified ways.

        Notes
        -----
        In addition to returning a
        `quantum_graph_skeleton.QuantumGraphSkeleton`, this method should
        populate the `existing_datasets` structure by querying for all
        relevant datasets with non-empty data IDs (those with empty data IDs
        will already be present). In particular:

        - `~ExistingDatasets.inputs` must always be populated with all
          overall-input datasets (but not prerequisites), by querying
          `input_collections`;
        - `~ExistingDatasets.outputs_for_skip` must be populated with any
          intermediate or output datasets present in `skip_existing_in` (it
          can be ignored if `skip_existing_in` is empty);
        - `~ExistingDatasets.outputs_in_the_way` must be populated with any
          intermediate or output datasets present in `output_run`, if
          `output_run_exists` (it can be ignored if `output_run_exists` is
          `False`). Note that the presence of such datasets is not
          automatically an error, even if `clobber` is `False`, as these may
          belong to quanta that will be skipped;
        - `~ExistingDatasets.inputs` must be populated with all
          prerequisite-input datasets that were included in the skeleton, by
          querying `input_collections` (not all prerequisite inputs need to
          be included in the skeleton, but the base class can only use
          per-quantum queries to find them, and that can be slow when there
          are many quanta).

        Dataset types should never be components and should always use the
        "common" storage class definition in `pipeline_graph.DatasetTypeNode`
        (which is the data repository definition when the dataset type is
        registered).
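
        Examples
        --------
        An implementation typically builds the skeleton along these lines (a
        sketch; how the data IDs are obtained is up to the subclass, and
        ``iter_data_ids_somehow`` is hypothetical)::

            skeleton = QuantumGraphSkeleton(subgraph.tasks)
            for task_node in subgraph.tasks.values():
                for data_id in iter_data_ids_somehow(task_node):
                    skeleton.add_quantum_node(task_node.label, data_id)
            return skeleton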
        """
        raise NotImplementedError()

    @final
    @timeMethod
    def _resolve_task_quanta(self, task_node: TaskNode, skeleton: QuantumGraphSkeleton) -> None:
        """Process the quanta for one task in a skeleton graph to skip those
        that have already completed and adjust those that request it.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.

        Notes
        -----
        This method modifies ``skeleton`` in-place in several ways:

        - It adds a "ref" attribute to dataset nodes, using the contents of
          `existing_datasets`. This ensures producing and consuming tasks
          start from the same `DatasetRef`.
        - It adds "inputs", "outputs", and "init_inputs" attributes to the
          quantum nodes, holding the same `NamedKeyDict` objects needed to
          construct an actual `Quantum` instance.
        - It removes quantum nodes that are to be skipped because their
          outputs already exist in `skip_existing_in`. It also removes their
          outputs from `ExistingDatasets.outputs_in_the_way`.
        - It adds prerequisite dataset nodes and edges that connect them to
          the quanta that consume them.
        - It removes quantum nodes whose
          `~PipelineTaskConnections.adjustQuantum` calls raise `NoWorkFound`
          or predict no outputs.
        - It removes the nodes of output datasets that are "adjusted away".
        - It removes the edges of input datasets that are "adjusted away".

        The difference between how adjusted inputs and outputs are handled
        reflects the fact that many quanta can share the same input, but only
        one produces each output. This can lead to the graph having
        superfluous isolated nodes after processing is complete, but these
        should only be removed after all the quanta from all tasks have been
        processed.
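
        Examples
        --------
        Tasks opt out of individual quanta by raising `NoWorkFound` from
        `~PipelineTaskConnections.adjustQuantum`, which this method catches.
        A sketch of a hypothetical connections class doing so::

            class SketchConnections(PipelineTaskConnections, dimensions=("visit",)):
                def adjustQuantum(self, inputs, outputs, label, data_id):
                    inputs, outputs = super().adjustQuantum(inputs, outputs, label, data_id)
                    if nothing_to_do(inputs):  # hypothetical predicate
                        raise NoWorkFound("Nothing to do for this data ID.")
                    return inputs, outputs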
        """
        # Extract the helper object for the prerequisite inputs of this task,
        # and tell it to prepare to construct skypix bounds and timespans for
        # each quantum (these will automatically do nothing if nothing needs
        # those bounds).
        task_prerequisite_info = self.prerequisite_info[task_node.label]
        task_prerequisite_info.update_bounds()
        # Loop over all quanta for this task, remembering the ones we've
        # gotten rid of.
        skipped_quanta = []
        no_work_quanta = []
        for quantum_key in skeleton.get_quanta(task_node.label):
            if self._skip_quantum_if_metadata_exists(task_node, quantum_key, skeleton):
                skipped_quanta.append(quantum_key)
                continue
            quantum_data_id = skeleton[quantum_key]["data_id"]
            skypix_bounds_builder = task_prerequisite_info.bounds.make_skypix_bounds_builder(quantum_data_id)
            timespan_builder = task_prerequisite_info.bounds.make_timespan_builder(quantum_data_id)
            adjusted_outputs = self._gather_quantum_outputs(
                task_node, quantum_key, skeleton, skypix_bounds_builder, timespan_builder
            )
            adjusted_inputs = self._gather_quantum_inputs(
                task_node,
                quantum_key,
                skeleton,
                task_prerequisite_info,
                skypix_bounds_builder,
                timespan_builder,
            )
            # Give the task's Connections class an opportunity to remove
            # some inputs, or complain if they are unacceptable. This will
            # raise if one of the check conditions is not met, which is the
            # intended behavior.
            helper = AdjustQuantumHelper(inputs=adjusted_inputs, outputs=adjusted_outputs)
            try:
                helper.adjust_in_place(
                    task_node._get_imported_data().connections, task_node.label, quantum_data_id
                )
            except NoWorkFound as err:
                # Do not generate this quantum; it would not produce any
                # outputs. Remove it and all of the outputs it might have
                # produced from the skeleton.
                try:
                    _, connection_name, _ = err.args
                    details = f"not enough datasets for connection {connection_name}."
                except ValueError:
                    details = str(err)
                self.log.debug(
                    "No work found for quantum %s of task %s: %s",
                    quantum_key.data_id_values,
                    quantum_key.task_label,
                    details,
                )
                no_work_quanta.append(quantum_key)
                continue
            if helper.outputs_adjusted:
                if not any(adjusted_refs for adjusted_refs in helper.outputs.values()):
                    # No outputs also means we don't generate this quantum.
                    self.log.debug(
                        "No outputs predicted for quantum %s of task %s.",
                        quantum_key.data_id_values,
                        quantum_key.task_label,
                    )
                    no_work_quanta.append(quantum_key)
                    continue
                # Remove output nodes that were not retained by
                # adjustQuantum.
                skeleton.remove_dataset_nodes(
                    self._find_removed(skeleton.iter_outputs_of(quantum_key), helper.outputs)
                )
            if helper.inputs_adjusted:
                if not any(bool(adjusted_refs) for adjusted_refs in helper.inputs.values()):
                    raise QuantumGraphBuilderError(
                        f"adjustQuantum implementation for {task_node.label}@{quantum_key.data_id_values} "
                        "returned outputs but no inputs."
                    )
                # Remove input dataset edges that were not retained by
                # adjustQuantum. We can't remove the input dataset nodes
                # because some other quantum might still want them.
                skeleton.remove_input_edges(
                    quantum_key, self._find_removed(skeleton.iter_inputs_of(quantum_key), helper.inputs)
                )
            # Save the adjusted inputs and outputs to the quantum node's
            # state so we don't have to regenerate those data structures
            # from the graph.
            skeleton[quantum_key]["inputs"] = helper.inputs
            skeleton[quantum_key]["outputs"] = helper.outputs
        for no_work_quantum in no_work_quanta:
            skeleton.remove_quantum_node(no_work_quantum, remove_outputs=True)
        for skipped_quantum in skipped_quanta:
            skeleton.remove_quantum_node(skipped_quantum, remove_outputs=False)
        remaining_quanta = skeleton.get_quanta(task_node.label)
        self._resolve_task_init(task_node, skeleton, bool(skipped_quanta))
        message_terms = []
        if no_work_quanta:
            message_terms.append(f"{len(no_work_quanta)} had no work to do")
        if skipped_quanta:
            message_terms.append(f"{len(skipped_quanta)} previously succeeded")
        message_parenthetical = f" ({', '.join(message_terms)})" if message_terms else ""
        if remaining_quanta:
            self.log.info(
                "Generated %s for task %s%s.",
                _quantum_or_quanta(len(remaining_quanta)),
                task_node.label,
                message_parenthetical,
            )
        else:
            self.log.info(
                "Dropping task %s because no quanta remain%s.", task_node.label, message_parenthetical
            )
            skeleton.remove_task(task_node.label)

    def _skip_quantum_if_metadata_exists(
        self, task_node: TaskNode, quantum_key: QuantumKey, skeleton: QuantumGraphSkeleton
    ) -> bool:
        """Identify and drop quanta that should be skipped because their
        metadata datasets already exist.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.

        Returns
        -------
        skipped : `bool`
            `True` if the quantum is being skipped and has been removed from
            the graph, `False` otherwise.

        Notes
        -----
        If the metadata dataset for this quantum exists in
        `ExistingDatasets.outputs_for_skip`, the quantum will be skipped.
        This causes the quantum node to be removed from the graph. Dataset
        nodes that were previously the outputs of this quantum will have
        their "ref" attribute set from `ExistingDatasets.outputs_for_skip`,
        or will be removed if there is no such dataset there. Any output
        dataset in `ExistingDatasets.outputs_in_the_way` will be removed.
        """
        metadata_dataset_key = DatasetKey(
            task_node.metadata_output.parent_dataset_type_name, quantum_key.data_id_values
        )
        if metadata_dataset_key in self.existing_datasets.outputs_for_skip:
            # This quantum's metadata is already present in the
            # skip_existing_in collections; we'll skip it. But the presence
            # of the metadata dataset doesn't guarantee that all of the other
            # outputs we predicted are present; we have to check.
            for output_dataset_key in list(skeleton.iter_outputs_of(quantum_key)):
                if (
                    output_ref := self.existing_datasets.outputs_for_skip.get(output_dataset_key)
                ) is not None:
                    # Populate the skeleton graph's node attributes with the
                    # existing DatasetRef, just like a predicted output of a
                    # non-skipped quantum.
                    skeleton[output_dataset_key]["ref"] = output_ref
                else:
                    # Remove this dataset from the skeleton graph, because
                    # the quantum that would have produced it is being
                    # skipped and it doesn't already exist.
                    skeleton.remove_dataset_nodes([output_dataset_key])
                # If this dataset was "in the way" (i.e. already in the
                # output run), it isn't anymore.
                self.existing_datasets.outputs_in_the_way.pop(output_dataset_key, None)
            # Removing the quantum node from the graph will happen outside
            # this function.
            return True
        return False

    @final
    def _gather_quantum_outputs(
        self,
        task_node: TaskNode,
        quantum_key: QuantumKey,
        skeleton: QuantumGraphSkeleton,
        skypix_bounds_builder: SkyPixBoundsBuilder,
        timespan_builder: TimespanBuilder,
    ) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Collect or generate the output datasets for a preliminary quantum
        and put them in the form used by `~lsst.daf.butler.Quantum` and
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.
        skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder`
            An object that accumulates the appropriate spatial bounds for a
            quantum.
        timespan_builder : `~prerequisite_helpers.TimespanBuilder`
            An object that accumulates the appropriate timespan for a
            quantum.

        Returns
        -------
        outputs : `~lsst.daf.butler.NamedKeyDict` [ \
                `~lsst.daf.butler.DatasetType`, `list` [ \
                `~lsst.daf.butler.DatasetRef` ] ]
            All outputs to the task, using the storage class and components
            defined by the task's own connections.

        Notes
        -----
        This first looks for outputs already present in the `output_run` by
        looking in `ExistingDatasets.outputs_in_the_way`; if it finds
        something and `clobber` is `True`, it uses that ref (it's not ideal
        that both the original dataset and its replacement will have the same
        UUID, but we don't have space in the quantum graph for two UUIDs, and
        we need the datastore records of the original there). If `clobber`
        is `False`, `OutputExistsError` is raised. If there is no output
        already present, a new one with a random UUID is generated. In all
        cases the "ref" attribute of the dataset node in the skeleton is set.
        """
        outputs_by_type: dict[str, list[DatasetRef]] = {}
        dataset_key: DatasetKey
        for dataset_key in skeleton.iter_outputs_of(quantum_key):
            dataset_data_id = skeleton[dataset_key]["data_id"]
            dataset_type_node = self._pipeline_graph.dataset_types[dataset_key.parent_dataset_type_name]
            if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None:
                ref = DatasetRef(dataset_type_node.dataset_type, dataset_data_id, run=self.output_run)
            elif not self.clobber:
                # We intentionally raise here, before running adjustQuantum,
                # because it'd be weird if we left an old potential output of
                # a task sitting there in the output collection, just because
                # the task happened to not actually produce it.
                raise OutputExistsError(
                    f"Potential output dataset {ref} already exists in the output run "
                    f"{self.output_run}, but clobbering outputs was not expected to be necessary."
                )
            skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id)
            timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id)
            skeleton[dataset_key]["ref"] = ref
            outputs_by_type.setdefault(dataset_key.parent_dataset_type_name, []).append(ref)
        adapted_outputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for write_edge in task_node.iter_all_outputs():
            dataset_type_node = self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name]
            edge_dataset_type = write_edge.adapt_dataset_type(dataset_type_node.dataset_type)
            adapted_outputs[edge_dataset_type] = [
                write_edge.adapt_dataset_ref(ref)
                for ref in sorted(outputs_by_type.get(write_edge.parent_dataset_type_name, []))
            ]
        return adapted_outputs

    @final
    def _gather_quantum_inputs(
        self,
        task_node: TaskNode,
        quantum_key: QuantumKey,
        skeleton: QuantumGraphSkeleton,
        task_prerequisite_info: PrerequisiteInfo,
        skypix_bounds_builder: SkyPixBoundsBuilder,
        timespan_builder: TimespanBuilder,
    ) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Collect input datasets for a preliminary quantum and put them in
        the form used by `~lsst.daf.butler.Quantum` and
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.
        task_prerequisite_info : `~prerequisite_helpers.PrerequisiteInfo`
            Helper object for finding this task's prerequisite inputs.
        skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder`
            An object that accumulates the appropriate spatial bounds for a
            quantum.
        timespan_builder : `~prerequisite_helpers.TimespanBuilder`
            An object that accumulates the appropriate timespan for a
            quantum.

        Returns
        -------
        inputs : `~lsst.daf.butler.NamedKeyDict` [ \
                `~lsst.daf.butler.DatasetType`, `list` [ \
                `~lsst.daf.butler.DatasetRef` ] ]
            All regular and prerequisite inputs to the task, using the
            storage class and components defined by the task's own
            connections.

        Notes
        -----
        On return, the dataset nodes that represent inputs to this quantum
        will either have their "ref" attribute set (using the common dataset
        type, not the task-specific one) or will be removed from the graph.

        For regular inputs, usually an existing "ref" (corresponding to an
        output of another quantum) will be found and left unchanged. When
        there is no existing "ref" attribute, `ExistingDatasets.inputs` is
        searched next; if there is nothing there, the input will be removed.

        Prerequisite inputs are always queried for directly here (delegating
        to the task's `~prerequisite_helpers.PrerequisiteFinder` objects in
        `prerequisite_info`). They are never produced by other tasks, and
        cannot in general be queried for in advance when
        `ExistingDatasets.inputs` is populated.
        """
        quantum_data_id = skeleton[quantum_key]["data_id"]
        inputs_by_type: dict[str, set[DatasetRef]] = {}
        dataset_key: DatasetKey | PrerequisiteDatasetKey
        # Process inputs already present in the skeleton - this should include
        # all regular inputs (including intermediates) and may include some
        # prerequisites.
        for dataset_key in list(skeleton.iter_inputs_of(quantum_key)):
            if (ref := skeleton[dataset_key].get("ref")) is None:
                # This dataset is an overall input - if it were an
                # intermediate, we would have already either removed the node
                # or set the "ref" attribute when processing its producing
                # quantum - and this is the first time we're trying to
                # resolve it.
                if (ref := self.existing_datasets.inputs.get(dataset_key)) is None:
                    # It also doesn't exist in the input collections, so we
                    # remove its node in the skeleton graph (so other
                    # consumers won't have to check for it).
                    skeleton.remove_dataset_nodes([dataset_key])
                    continue
                skeleton[dataset_key]["ref"] = ref
            inputs_by_type.setdefault(dataset_key.parent_dataset_type_name, set()).add(ref)
            skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId)
            timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId)
        # Query for any prerequisites not handled by process_subgraph. Note
        # that these were not already in the skeleton graph, so we add them
        # now.
        skypix_bounds = skypix_bounds_builder.finish()
        timespan = timespan_builder.finish()
        for finder in task_prerequisite_info.finders.values():
            inputs_for_type = inputs_by_type.setdefault(finder.dataset_type_node.name, set())
            dataset_keys = []
            for ref in finder.find(
                self.butler, self.input_collections, quantum_data_id, skypix_bounds, timespan
            ):
                dataset_key = skeleton.add_prerequisite_node(ref.datasetType.name, ref=ref)
                dataset_keys.append(dataset_key)
                inputs_for_type.add(ref)
            skeleton.add_input_edges(quantum_key, dataset_keys)
        adapted_inputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for read_edge in task_node.iter_all_inputs():
            dataset_type_node = self._pipeline_graph.dataset_types[read_edge.parent_dataset_type_name]
            edge_dataset_type = read_edge.adapt_dataset_type(dataset_type_node.dataset_type)
            if (current_dataset_type := adapted_inputs.keys().get(edge_dataset_type.name)) is None:
                adapted_inputs[edge_dataset_type] = [
                    read_edge.adapt_dataset_ref(ref)
                    for ref in sorted(inputs_by_type.get(read_edge.parent_dataset_type_name, frozenset()))
                ]
            elif current_dataset_type != edge_dataset_type:
                raise NotImplementedError(
                    f"Task {task_node.label!r} has {edge_dataset_type.name!r} as an input via "
                    "two different connections, with two different storage class overrides. "
                    "This is not yet supported due to limitations in the Quantum data structure."
                )
            # If neither the `if` nor the `elif` above match, it means
            # multiple input connections have exactly the same dataset type,
            # and hence there is nothing to do after the first one.
        return adapted_inputs

    @final
    def _resolve_task_init(
        self, task_node: TaskNode, skeleton: QuantumGraphSkeleton, has_skipped_quanta: bool
    ) -> None:
        """Add init-input and init-output dataset nodes and edges for a task
        to the skeleton.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Pipeline graph description of the task.
        skeleton : `QuantumGraphSkeleton`
            In-progress quantum graph data structure to update in-place.
        has_skipped_quanta : `bool`
            Whether any of this task's quanta were skipped because they had
            already succeeded.
        """
        quanta = skeleton.get_quanta(task_node.label)
        task_init_key = TaskInitKey(task_node.label)
        if quanta:
            adapted_inputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict()
            # Process init-inputs.
            input_keys: list[DatasetKey] = []
            for read_edge in task_node.init.iter_all_inputs():
                dataset_key = skeleton.add_dataset_node(
                    read_edge.parent_dataset_type_name, self.empty_data_id
                )
                skeleton.add_input_edge(task_init_key, dataset_key)
                if (ref := skeleton[dataset_key].get("ref")) is None:
                    try:
                        ref = self.existing_datasets.inputs[dataset_key]
                    except KeyError:
                        raise InitInputMissingError(
                            f"Overall init-input dataset {read_edge.parent_dataset_type_name!r} "
                            f"needed by task {task_node.label!r} not found in input collection(s) "
                            f"{self.input_collections}."
                        ) from None
                    skeleton[dataset_key]["ref"] = ref
                for quantum_key in skeleton.get_quanta(task_node.label):
                    skeleton.add_input_edge(quantum_key, dataset_key)
                input_keys.append(dataset_key)
                adapted_ref = read_edge.adapt_dataset_ref(ref)
                adapted_inputs[adapted_ref.datasetType] = adapted_ref
            # Save the quantum-adapted init inputs to each quantum, now that
            # the skeleton edges connecting the init inputs to each quantum
            # have been added above.
            for quantum_key in skeleton.get_quanta(task_node.label):
                skeleton[quantum_key]["init_inputs"] = adapted_inputs
            # Process init-outputs.
            adapted_outputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict()
            for write_edge in task_node.init.iter_all_outputs():
                dataset_key = skeleton.add_dataset_node(
                    write_edge.parent_dataset_type_name, self.empty_data_id
                )
                if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None:
                    ref = DatasetRef(
                        self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name].dataset_type,
                        self.empty_data_id,
                        run=self.output_run,
                    )
                skeleton[dataset_key]["ref"] = ref
                skeleton.add_output_edge(task_init_key, dataset_key)
                adapted_ref = write_edge.adapt_dataset_ref(ref)
                adapted_outputs[adapted_ref.datasetType] = adapted_ref
            skeleton[task_init_key]["inputs"] = adapted_inputs
            skeleton[task_init_key]["outputs"] = adapted_outputs
        elif has_skipped_quanta:
            # No quanta remain for this task, but at least one quantum was
            # skipped because its outputs were present in the skip_existing_in
            # collections. This means all init outputs should be present in
            # the skip_existing_in collections, too, and we need to put those
            # refs in the graph.
            for write_edge in task_node.init.iter_all_outputs():
                dataset_key = skeleton.add_dataset_node(
                    write_edge.parent_dataset_type_name, self.empty_data_id
                )
                if (ref := self.existing_datasets.outputs_for_skip.get(dataset_key)) is None:
                    raise InitInputMissingError(
                        f"Init-output dataset {write_edge.parent_dataset_type_name!r} of skipped task "
                        f"{task_node.label!r} not found in skip-existing-in collection(s) "
                        f"{self.skip_existing_in}."
                    ) from None
                skeleton[dataset_key]["ref"] = ref
                # If this dataset was "in the way" (i.e. already in the
                # output run), it isn't anymore.
                self.existing_datasets.outputs_in_the_way.pop(dataset_key, None)
        # If no quanta remain and none were skipped, they all got pruned
        # because of NoWorkFound conditions. This dooms all downstream quanta
        # to the same fate, so we don't bother doing anything with the task's
        # init-outputs, since nothing is going to consume them.

    @final
    @timeMethod
    def _find_empty_dimension_datasets(self) -> None:
        """Query for all dataset types with no dimensions, updating
        `existing_datasets` in-place.

        This includes but is not limited to init inputs and init outputs.
        """
        _, dataset_type_nodes = self._pipeline_graph.group_by_dimensions()[self.universe.empty]
        dataset_types = [node.dataset_type for node in dataset_type_nodes.values()]
        dataset_types.extend(self._global_init_output_types.values())
        for dataset_type in dataset_types:
            key = DatasetKey(dataset_type.name, self.empty_data_id.values_tuple())
            if (
                self._pipeline_graph.producer_of(dataset_type.name) is None
                and dataset_type.name not in self._global_init_output_types
            ):
                # Dataset type is an overall input; we always need to try to
                # find these.
                try:
                    ref = self.butler.registry.findDataset(
                        dataset_type.name, collections=self.input_collections
                    )
                except MissingDatasetTypeError:
                    ref = None
                if ref is not None:
                    self.existing_datasets.inputs[key] = ref
            elif self.skip_existing_in:
                # Dataset type is an intermediate or output; we only need to
                # find these if they're from previously executed quanta that
                # we might skip...
                try:
                    ref = self.butler.registry.findDataset(
                        dataset_type.name, collections=self.skip_existing_in
                    )
                except MissingDatasetTypeError:
                    ref = None
                if ref is not None:
                    self.existing_datasets.outputs_for_skip[key] = ref
                    if ref.run == self.output_run:
                        self.existing_datasets.outputs_in_the_way[key] = ref
            if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                # ...or if they're in the way and would need to be clobbered
                # (and we haven't already found them in the previous block).
                try:
                    ref = self.butler.registry.findDataset(dataset_type.name, collections=[self.output_run])
                except MissingDatasetTypeError:
                    ref = None
                if ref is not None:
                    self.existing_datasets.outputs_in_the_way[key] = ref

    @final
    @timeMethod
    def _attach_datastore_records(self, skeleton: QuantumGraphSkeleton) -> None:
        """Add datastore records for all overall inputs to a preliminary
        quantum graph.

        Parameters
        ----------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph to update in place.

        Notes
        -----
        On return, all quantum nodes in the skeleton graph will have a
        "datastore_records" attribute that is a mapping from datastore name
        to `lsst.daf.butler.DatastoreRecordData`, as used by
        `lsst.daf.butler.Quantum`.
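
        Examples
        --------
        The attached attribute has this rough shape (a sketch; the datastore
        name shown is hypothetical)::

            skeleton[quantum_key]["datastore_records"] = {
                "FileDatastore@<butlerRoot>": DatastoreRecordData(...),
            }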
        """
        overall_inputs = skeleton.extract_overall_inputs()
        exported_records = self.butler._datastore.export_records(overall_inputs.values())
        for quantum_key in skeleton.iter_all_quanta():
            quantum_records = {}
            input_ids = {
                ref.id
                for dataset_key in skeleton.iter_inputs_of(quantum_key)
                if (ref := overall_inputs.get(dataset_key)) is not None
            }
            if input_ids:
                for datastore_name, records in exported_records.items():
                    matching_records = records.subset(input_ids)
                    if matching_records is not None:
                        quantum_records[datastore_name] = matching_records
            skeleton[quantum_key]["datastore_records"] = quantum_records

    @final
    @timeMethod
    def _construct_quantum_graph(
        self, skeleton: QuantumGraphSkeleton, metadata: Mapping[str, Any]
    ) -> QuantumGraph:
        """Construct a `QuantumGraph` object from the contents of a
        fully-processed `quantum_graph_skeleton.QuantumGraphSkeleton`.

        Parameters
        ----------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph. Must have "init_inputs", "inputs",
            and "outputs" attributes on all quantum nodes, as added by
            `_resolve_task_quanta`, as well as a "datastore_records"
            attribute as added by `_attach_datastore_records`.
        metadata : `~collections.abc.Mapping`
            Flexible metadata to add to the graph.

        Returns
        -------
        quantum_graph : `QuantumGraph`
            DAG describing processing to be performed.
        """
        quanta: dict[TaskDef, set[Quantum]] = {}
        init_inputs: dict[TaskDef, Iterable[DatasetRef]] = {}
        init_outputs: dict[TaskDef, Iterable[DatasetRef]] = {}
        for task_def in self._pipeline_graph._iter_task_defs():
            if not skeleton.has_task(task_def.label):
                continue
            task_node = self._pipeline_graph.tasks[task_def.label]
            task_init_key = skeleton.get_task_init_node(task_def.label)
            init_inputs[task_def] = skeleton[task_init_key]["inputs"].values()
            init_outputs[task_def] = skeleton[task_init_key]["outputs"].values()
            quanta_for_task: set[Quantum] = set()
            for quantum_key in skeleton.get_quanta(task_node.label):
                node_state = skeleton[quantum_key]
                quanta_for_task.add(
                    Quantum(
                        taskName=task_node.task_class_name,
                        taskClass=task_node.task_class,
                        dataId=node_state["data_id"],
                        initInputs=node_state["init_inputs"],
                        inputs=node_state["inputs"],
                        outputs=node_state["outputs"],
                        datastore_records=node_state.get("datastore_records"),
                    )
                )
            quanta[task_def] = quanta_for_task
        registry_dataset_types: list[DatasetType] = [
            node.dataset_type for node in self._pipeline_graph.dataset_types.values()
        ]
        all_metadata = self.metadata.to_dict()
        all_metadata.update(metadata)
        return QuantumGraph(
            quanta,
            metadata=all_metadata,
            universe=self.universe,
            initInputs=init_inputs,
            initOutputs=init_outputs,
            globalInitOutputs=[skeleton[key]["ref"] for key in skeleton.global_init_outputs],
            registryDatasetTypes=registry_dataset_types,
        )

    @staticmethod
    @final
    def _find_removed(
        original: Iterable[DatasetKey | PrerequisiteDatasetKey],
        adjusted: NamedKeyMapping[DatasetType, Sequence[DatasetRef]],
    ) -> set[DatasetKey | PrerequisiteDatasetKey]:
        """Identify skeleton-graph dataset nodes that have been removed by
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        original : `~collections.abc.Iterable` [ `DatasetKey` or \
                `PrerequisiteDatasetKey` ]
            Identifiers for the dataset nodes that were the original
            neighbors (inputs or outputs) of a quantum.
        adjusted : `~lsst.daf.butler.NamedKeyMapping` [ \
                `~lsst.daf.butler.DatasetType`, \
                `~collections.abc.Sequence` [ `lsst.daf.butler.DatasetRef` ] ]
            Adjusted neighbors, in the form used by `lsst.daf.butler.Quantum`.

        Returns
        -------
        removed : `set` [ `DatasetKey` or `PrerequisiteDatasetKey` ]
            Datasets in ``original`` that have no counterpart in ``adjusted``.
        """
        result = set(original)
        for dataset_type, kept_refs in adjusted.items():
            parent_dataset_type_name, _ = DatasetType.splitDatasetTypeName(dataset_type.name)
            for kept_ref in kept_refs:
                result.remove(DatasetKey(parent_dataset_type_name, kept_ref.dataId.values_tuple()))
        return result


@dataclasses.dataclass(eq=False, order=False)
class ExistingDatasets:
    """Struct that holds the results of dataset queries for
    `QuantumGraphBuilder`.
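
    Examples
    --------
    `QuantumGraphBuilder.process_subgraph` implementations update an instance
    of this struct in place, along these lines (a sketch; the dataset type
    queried and the ``builder`` variable are illustrative)::

        for ref in butler.registry.queryDatasets("raw", collections=builder.input_collections):
            key = DatasetKey(ref.datasetType.name, ref.dataId.values_tuple())
            builder.existing_datasets.inputs[key] = ref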
    """

    inputs: dict[DatasetKey | PrerequisiteDatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Overall-input datasets found in `QuantumGraphBuilder.input_collections`.

    This may include prerequisite inputs. It does include init-inputs.
    It does not include intermediates.
    """

    outputs_for_skip: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Output datasets found in `QuantumGraphBuilder.skip_existing_in`.

    It is unspecified whether this includes init-outputs; there is no concept
    of skipping at the init stage, so this is not expected to matter.
    """

    outputs_in_the_way: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Output datasets found in `QuantumGraphBuilder.output_run`.

    This includes regular outputs and init-outputs.
    """


def _quantum_or_quanta(n: int) -> str:
    """Correctly pluralize 'quantum' if needed.
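
    Examples
    --------
    >>> _quantum_or_quanta(1)
    '1 quantum'
    >>> _quantum_or_quanta(3)
    '3 quanta'
    """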
    return f"{n} quanta" if n != 1 else "1 quantum"