Coverage for python/lsst/analysis/tools/tasks/gatherResourceUsage.py: 21% (227 statements)
# This file is part of analysis_tools.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = (
    "ConsolidateResourceUsageConfig",
    "ConsolidateResourceUsageConnections",
    "ConsolidateResourceUsageTask",
    "GatherResourceUsageConfig",
    "GatherResourceUsageConnections",
    "GatherResourceUsageTask",
    "ResourceUsageQuantumGraphBuilder",
)

import argparse
import dataclasses
import datetime
import logging
import re
from collections.abc import Iterable, Sequence
from typing import Any

import numpy as np
import pandas as pd
from lsst.daf.butler import Butler, DatasetRef, DatasetType
from lsst.daf.butler.utils import globToRegex
from lsst.pex.config import Field, ListField
from lsst.pipe.base import (
    Instrument,
    PipelineTask,
    PipelineTaskConfig,
    PipelineTaskConnections,
    QuantumGraph,
    Struct,
)
from lsst.pipe.base import connectionTypes as cT
from lsst.pipe.base.pipeline_graph import PipelineGraph
from lsst.pipe.base.quantum_graph_builder import QuantumGraphBuilder
from lsst.pipe.base.quantum_graph_skeleton import DatasetKey, QuantumGraphSkeleton

# It's not great to be importing a private symbol, but this is a temporary
# workaround for the fact that prior to w.2022.10, the units for memory values
# written in task metadata were platform-dependent. Once we no longer care
# about older runs, this import and the code that uses it can be removed.
from lsst.utils.usage import _RUSAGE_MEMORY_MULTIPLIER

_LOG = logging.getLogger(__name__)


class ConsolidateResourceUsageConnections(PipelineTaskConnections, dimensions=()):
    """Connection definitions for `ConsolidateResourceUsageTask`."""

    output_table = cT.Output(
        name="ResourceUsageSummary",
        storageClass="DataFrame",
        dimensions=(),
        doc="Consolidated table of resource usage statistics. One row per task label",
    )

    def __init__(self, *, config):
        super().__init__(config=config)
        for name in self.config.input_names:
            setattr(
                self,
                name,
                cT.Input(
                    name,
                    storageClass="DataFrame",
                    dimensions=(),
                    doc="Resource usage statistics for a task.",
                ),
            )
            self.inputs.add(name)


class ConsolidateResourceUsageConfig(
    PipelineTaskConfig, pipelineConnections=ConsolidateResourceUsageConnections
):
    """Configuration definitions for `ConsolidateResourceUsageTask`."""

    input_names = ListField[str](
        doc="Input resource usage dataset type names",
        default=[],
    )


class ConsolidateResourceUsageTask(PipelineTask):
    """A `PipelineTask` that summarizes task resource usage into a single
    table with per-task rows.

    Notes
    -----
    This is an unusual `PipelineTask` in that its input connection has
    dynamic dimensions, and its quanta are generally built via a custom
    quantum-graph builder defined in the same module.
    """

    ConfigClass = ConsolidateResourceUsageConfig
    _DefaultName = "consolidateResourceUsage"

    def run(self, **kwargs: Any) -> Struct:
        quantiles = []
        for input_name, ru_table in kwargs.items():
            if not input_name.endswith("resource_usage"):
                continue
            else:
                df = ru_table.quantile(
                    [0.0, 0.01, 0.05, 0.32, 0.50, 0.68, 0.95, 0.99, 1.0],
                    numeric_only=True,
                ).reset_index()
                df["task"] = input_name.replace("_resource_usage", "")
                df["quanta"] = len(ru_table)
                df["integrated_runtime"] = ru_table["run_time"].sum()

                quantiles.append(
                    df[
                        [
                            "index",
                            "quanta",
                            "task",
                            "memory",
                            "init_time",
                            "run_time",
                            "integrated_runtime",
                        ]
                    ]
                )
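
        # Concatenate the per-task quantile frames, label each quantile with a
        # percentile name like "p050", and derive more readable units (memory
        # in GB, integrated runtime in hours) before pivoting to wide form.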
        full_quantiles = pd.concat(quantiles)
        full_quantiles["percentile"] = (full_quantiles["index"] * 100).astype(int)
        full_quantiles["percentile_name"] = "p" + full_quantiles["percentile"].astype(str).str.zfill(3)
        full_quantiles["memoryGB"] = full_quantiles["memory"] / 1024 / 1024 / 1024
        full_quantiles["integrated_runtime_hrs"] = full_quantiles["integrated_runtime"] / 3600.0
        memoryGB = pd.pivot_table(
            full_quantiles, values="memoryGB", columns=["percentile_name"], index=["task"]
        ).add_prefix("mem_GB_")
        runtime = pd.pivot_table(
            full_quantiles, values="run_time", columns=["percentile_name"], index=["task"]
        ).add_prefix("runtime_s_")
        memrun = pd.merge(
            memoryGB.reset_index(),
            runtime.reset_index(),
            left_on="task",
            right_on="task",
        )
        memrun = pd.merge(
            full_quantiles[["task", "quanta", "integrated_runtime_hrs"]]
            .drop_duplicates()
            .sort_values("task"),
            memrun,
        )
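
        # The result has one row per task label, with "quanta" and
        # "integrated_runtime_hrs" columns plus per-percentile columns such as
        # "mem_GB_p050" and "runtime_s_p095".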
        return Struct(output_table=memrun)


class GatherResourceUsageConnections(
    PipelineTaskConnections, dimensions=(), defaultTemplates={"input_task_label": "PLACEHOLDER"}
):
    """Connection definitions for `GatherResourceUsageTask`."""

    output_table = cT.Output(
        "{input_task_label}_resource_statistics",  # Should always be overridden.
        storageClass="DataFrame",
        dimensions=(),
        doc=(
            "Table that aggregates memory and CPU usage statistics from one "
            "or more tasks. "
            "This will have one row for each data ID, with columns for each "
            "task or method's memory usage and runtime."
        ),
    )
    input_metadata = cT.Input(
        "{input_task_label}_metadata",  # Should always be overridden.
        storageClass="TaskMetadata",
        dimensions=(),  # Actually set in __init__, according to configuration.
        doc="Metadata dataset for another task to gather resource usage from.",
        multiple=True,
        deferLoad=True,
    )

    def __init__(self, *, config):
        super().__init__(config=config)
        if "PLACEHOLDER" in self.output_table.name:
            raise ValueError("Connection configuration for output_table must be overridden.")
        if "PLACEHOLDER" in self.input_metadata.name:
            raise ValueError("Connection configuration for input_metadata must be overridden.")
        # Override the empty dimensions the connection was defined with, using
        # those the task was configured with.
        self.input_metadata = dataclasses.replace(
            self.input_metadata,
            dimensions=list(self.config.dimensions),
        )


class GatherResourceUsageConfig(PipelineTaskConfig, pipelineConnections=GatherResourceUsageConnections):
    """Configuration definitions for `GatherResourceUsageTask`."""

    dimensions = ListField[str](
        doc=(
            "The quantum dimensions for the input metadata connection, and "
            "the columns (after expansion to include implied dimensions) used "
            "to identify rows in the output table."
        ),
    )
    memory = Field[bool](
        doc=(
            "Whether to extract peak memory usage (maximum resident set size) "
            "for this task. "
            "Note that memory usage cannot be further subdivided because only "
            "a per-process peak is available (and hence if multiple quanta "
            "are run in one quantum, even per-quantum values may be "
            "misleading)."
        ),
        default=True,
    )
    prep_time = Field[bool](
        doc=(
            "Whether to extract the CPU time duration for the work the "
            "middleware does prior to initializing the task (mostly checking "
            "for input dataset existence)."
        ),
        default=False,
    )
    init_time = Field[bool](
        doc=("Whether to extract the CPU time duration for actually " "constructing the task."),
        default=True,
    )
    run_time = Field[bool](
        doc=("Whether to extract the CPU time duration for actually " "executing the task."),
        default=True,
    )
    method_times = ListField[str](
        doc=(
            "Names of @lsst.utils.timer.timeMethod-decorated methods for "
            "which CPU time durations should also be extracted. Use '.' "
            "separators to refer to subtask methods at arbitrary depth."
        ),
        optional=False,
        default=[],
    )
    input_task_label = Field[str](
        doc=(
            "Label for the top-level task whose metadata is being processed "
            "within its own metadata file, if this differs from the prefix of "
            "connections.input_metadata."
        ),
        default=None,
        optional=True,
    )


class GatherResourceUsageTask(PipelineTask):
    """A `PipelineTask` that gathers resource usage statistics from task
    metadata.

    Notes
    -----
    This is an unusual `PipelineTask` in that its input connection has
    dynamic dimensions.

    Its output table has columns for each of the dimensions of the input
    metadata's data ID, as well as (subject to configuration):

    - ``memory``: the maximum resident set size for the entire quantum
      (in bytes);
    - ``prep_time``: the time spent in the pre-initialization step in
      which the middleware checks which of the quantum's inputs are available;
    - ``init_time``: the time spent in task construction;
    - ``run_time``: the time spent executing the task's runQuantum
      method.
    - ``{method}``: the time spent in a particular task or subtask
      method decorated with `lsst.utils.timer.timeMethod`.

    All time durations are CPU times in seconds, and all columns are 64-bit
    floating point. Methods or steps that did not run are given a duration of
    zero.

    It is expected that this task will be configured to run multiple times in
    most pipelines, often once for each other task in the pipeline.
    """

    ConfigClass = GatherResourceUsageConfig
    _DefaultName = "gatherResourceUsage"

    def runQuantum(
        self,
        butlerQC,
        inputRefs,
        outputRefs,
    ):
        # Docstring inherited.
        # This override exists just so we can pass the butler registry's
        # DimensionUniverse to run in order to standardize the dimensions.
        inputs = butlerQC.get(inputRefs)
        outputs = self.run(butlerQC.dimensions, **inputs)
        butlerQC.put(outputs, outputRefs)

    def run(self, universe, input_metadata):
        """Gather resource usage statistics from per-quantum metadata.

        Parameters
        ----------
        universe : `DimensionUniverse`
            Object managing all dimensions recognized by the butler; used to
            standardize and expand `GatherResourceUsageConfig.dimensions`.
        input_metadata : `list` [ `DeferredDatasetHandle` ]
            List of `lsst.daf.butler.DeferredDatasetHandle` that can be used to
            load all input metadata datasets.

        Returns
        -------
        result : `Struct`
            Structure with a single element:

            - ``output_table``: a `pandas.DataFrame` that aggregates the
              configured resource usage statistics.
        """
        dimensions = universe.conform(self.config.dimensions)
        # Transform input list into a dict keyed by data ID.
        handles_by_data_id = {}
        for handle in input_metadata:
            handles_by_data_id[handle.dataId] = handle
        n_rows = len(handles_by_data_id)
        # Create a dict of empty column arrays that we'll ultimately make into
        # a table.
        columns = {
            d: np.zeros(n_rows, dtype=_dtype_from_field_spec(universe.dimensions[d].primaryKey))
            for d in dimensions.names
        }
        for attr_name in ("memory", "prep_time", "init_time", "run_time"):
            if getattr(self.config, attr_name):
                columns[attr_name] = np.zeros(n_rows, dtype=float)
        for method_name in self.config.method_times:
            columns[method_name] = np.zeros(n_rows, dtype=float)
        # Populate the table, one row at a time.
        warned_about_metadata_version = False
        for index, (data_id, handle) in enumerate(handles_by_data_id.items()):
            # Fill in the data ID columns.
            for k, v in data_id.mapping.items():
                columns[k][index] = v
            # Load the metadata dataset and fill in the columns derived from
            # it.
            metadata = handle.get()
            try:
                quantum_metadata = metadata["quantum"]
            except KeyError:
                self.log.warning(
                    "Metadata dataset %s @ %s has no 'quantum' key.",
                    handle.ref.datasetType.name,
                    handle.dataId,
                )
            else:
                if self.config.memory:
                    columns["memory"][index], warned_about_metadata_version = self._extract_memory(
                        quantum_metadata,
                        handle,
                        warned_about_metadata_version,
                    )
                for key, value in self._extract_quantum_timing(quantum_metadata).items():
                    columns[key][index] = value
            for key, value in self._extract_method_timing(metadata, handle).items():
                columns[key][index] = value
        return Struct(output_table=pd.DataFrame(columns, copy=False))

    def _extract_memory(self, quantum_metadata, handle, warned_about_metadata_version):
        """Extract maximum memory usage from quantum metadata.

        Parameters
        ----------
        quantum_metadata : `lsst.pipe.base.TaskMetadata`
            The nested metadata associated with the label "quantum" inside a
            PipelineTask's metadata.
        handle : `lsst.daf.butler.DeferredDatasetHandle`
            Butler handle for the metadata dataset; used to identify the
            metadata in diagnostic messages only.
        warned_about_metadata_version : `bool`
            Whether we have already emitted at least one warning about old
            metadata versions.

        Returns
        -------
        memory : `float`
            Maximum memory usage in bytes.
        warned_about_metadata_version : `bool`
            Whether we have now emitted at least one warning about old
            metadata versions.
        """
        # Attempt to work around memory units being
        # platform-dependent for metadata written prior to
        # w.2022.10.
        memory_multiplier = 1
        if quantum_metadata.get("__version__", 0) < 1:
            memory_multiplier = _RUSAGE_MEMORY_MULTIPLIER
            msg = (
                "Metadata dataset %s @ %s is too old; guessing memory units by "
                "assuming the platform has not changed"
            )
            if not warned_about_metadata_version:
                self.log.warning(msg, handle.ref.datasetType.name, handle.dataId)
                self.log.warning(
                    "Warnings about memory units for other inputs " "will be emitted only at DEBUG level."
                )
                warned_about_metadata_version = True
            else:
                self.log.debug(msg, handle.ref.datasetType.name, handle.dataId)
        return (
            quantum_metadata["endMaxResidentSetSize"] * memory_multiplier,
            warned_about_metadata_version,
        )

    def _extract_quantum_timing(self, quantum_metadata):
        """Extract timing for standard PipelineTask quantum-execution steps
        from metadata.

        Parameters
        ----------
        quantum_metadata : `lsst.pipe.base.TaskMetadata`
            The nested metadata associated with the label "quantum" inside a
            PipelineTask's metadata.

        Returns
        -------
        timing : `dict` [ `str`, `float` ]
            CPU times in seconds, for all stages enabled in configuration.
        """
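        # The quantum metadata records cumulative CPU timestamps; each stage
        # duration below is the difference between consecutive timestamps. A
        # stage missing from the metadata defaults to the end time, so its
        # duration comes out as zero.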
        end_time = quantum_metadata["endCpuTime"]
        times = [
            quantum_metadata["prepCpuTime"],
            quantum_metadata.get("initCpuTime", end_time),
            quantum_metadata.get("startCpuTime", end_time),
            end_time,
        ]
        return {
            attr_name: end - begin
            for attr_name, begin, end in zip(
                ["prep_time", "init_time", "run_time"],
                times[:-1],
                times[1:],
            )
            if getattr(self.config, attr_name)
        }

    def _extract_method_timing(self, metadata, handle):
        """Extract timing for `~lsst.utils.timer.timeMethod`-decorated task
        and subtask methods from metadata.

        Parameters
        ----------
        metadata : `lsst.pipe.base.TaskMetadata`
            The full metadata dataset for a PipelineTask quantum.
        handle : `lsst.daf.butler.DeferredDatasetHandle`
            Butler handle for the metadata dataset; used to infer the prefix
            used for method names within the metadata.

        Returns
        -------
        timing : `dict` [ `str`, `float` ]
            CPU times in seconds, for all methods enabled in configuration.
        """
        if self.config.input_task_label is not None:
            task_label = self.config.input_task_label
        else:
            task_label = handle.ref.datasetType.name[: -len("_metadata")]
        result = {}
        for method_name in self.config.method_times:
            terms = [task_label] + list(method_name.split("."))
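            # Metadata keys use ":" to separate nested task labels and "." to
            # separate the final method name, e.g. "task:subtask.methodName".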
            metadata_method_name = ":".join(terms[:-1]) + "." + terms[-1]
            try:
                method_start_time = metadata[f"{metadata_method_name}StartCpuTime"]
                method_end_time = metadata[f"{metadata_method_name}EndCpuTime"]
            except KeyError:
                # A method missing from the metadata is not a problem;
                # it's reasonable for configuration or even runtime
                # logic to result in a method not being called. When
                # that happens, we just let the times stay zero.
                pass
            else:
                result[f"{task_label}.{method_name}"] = method_end_time - method_start_time
        return result


def _dtype_from_field_spec(field_spec):
    """Return the `np.dtype` that can be used to hold the values of a butler
    dimension field.

    Parameters
    ----------
    field_spec : `lsst.daf.butler.core.ddl.FieldSpec`
        Object describing the field in a SQL-friendly sense.

    Returns
    -------
    dtype : `np.dtype`
        Numpy data type description.
    """
    python_type = field_spec.getPythonType()
    if python_type is str:
        return np.dtype((str, field_spec.length))
    else:
        return np.dtype(python_type)


class ResourceUsageQuantumGraphBuilder(QuantumGraphBuilder):
    """Custom quantum graph generator and pipeline builder for resource
    usage summary tasks.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        Butler client to query for inputs and dataset types.
    dataset_type_names : `~collections.abc.Iterable` [ `str` ], optional
        Iterable of dataset type names or shell-style glob patterns for the
        metadata datasets to be used as input. Default is all datasets ending
        with ``_metadata`` (other than the resource-usage summary tasks' own
        metadata outputs, which are always ignored). A gather-resource task
        with a single quantum is created for each matching metadata dataset
        type.
    where : `str`, optional
        Data ID expression that constrains the input metadata datasets.
    input_collections : `~collections.abc.Sequence` [ `str` ], optional
        Sequence of collections to search for inputs. If not provided,
        ``butler.collections`` is used and must not be empty.
    output_run : `str`, optional
        Output `~lsst.daf.butler.CollectionType.RUN` collection name. If not
        provided, ``butler.run`` is used and must not be `None`.
    skip_existing_in : `~collections.abc.Sequence` [ `str` ], optional
        Sequence of collections to search for outputs, allowing quanta whose
        outputs exist to be skipped.
    clobber : `bool`, optional
        Whether *execution* of this quantum graph will permit clobbering. If
        `False` (default), existing outputs in ``output_run`` are an error
        unless ``skip_existing_in`` will cause those quanta to be skipped.

    Notes
    -----
    The resource usage summary tasks cannot easily be added to a regular
    pipeline, as it's much more natural to have the gather tasks run
    automatically on all *other* tasks. And we can generate a quantum graph
    for these particular tasks much more efficiently than the general-purpose
    algorithm could.
    """

    def __init__(
        self,
        butler: Butler,
        *,
        dataset_type_names: Iterable[str] | None = None,
        where: str = "",
        input_collections: Sequence[str] | None = None,
        output_run: str | None = None,
        skip_existing_in: Sequence[str] = (),
        clobber: bool = False,
    ):
        # Start by querying for metadata datasets, since we'll need to know
        # which dataset types exist in the input collections in order to
        # build the pipeline.
        input_dataset_types: Any
        if not dataset_type_names:
            base_dataset_type_filter = re.compile(r"\w+_metadata")
            input_dataset_types = base_dataset_type_filter
        else:
            input_dataset_types = [globToRegex(expr) for expr in dataset_type_names]
        pipeline_graph = PipelineGraph()
        metadata_refs: dict[str, set[DatasetRef]] = {}
        consolidate_config = ConsolidateResourceUsageConfig()
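        # Each metadata dataset type found below gets its own gather task in
        # the pipeline; the gather tasks' output tables are then wired up as
        # inputs to a single consolidate task.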
        for results in butler.registry.queryDatasets(
            input_dataset_types,
            where=where,
            findFirst=True,
            collections=input_collections,
        ).byParentDatasetType():
            input_metadata_dataset_type = results.parentDatasetType
            refs_for_type = set(results)
            if refs_for_type:
                gather_task_label, gather_dataset_type_name = self._add_gather_task(
                    pipeline_graph, input_metadata_dataset_type
                )
                metadata_refs[gather_task_label] = refs_for_type
                consolidate_config.input_names.append(gather_dataset_type_name)
        pipeline_graph.add_task(
            task_class=ConsolidateResourceUsageTask,
            config=consolidate_config,
            label=ConsolidateResourceUsageTask._DefaultName,
        )
        # Now that we have the pipeline graph, we can delegate to super.
        super().__init__(
            pipeline_graph,
            butler,
            input_collections=input_collections,
            output_run=output_run,
            skip_existing_in=skip_existing_in,
            clobber=clobber,
        )
        # We've already queried for all of our input datasets, so we don't want
        # to do that again in process_subgraph, even though that's where most
        # QG builders do their queries.
        self.gather_inputs: dict[str, list[DatasetKey]] = {}
        for gather_task_label, gather_input_refs in metadata_refs.items():
            gather_inputs_for_task: list[DatasetKey] = []
            for ref in gather_input_refs:
                dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
                self.existing_datasets.inputs[dataset_key] = ref
                gather_inputs_for_task.append(dataset_key)
            self.gather_inputs[gather_task_label] = gather_inputs_for_task

    @classmethod
    def _add_gather_task(
        cls, pipeline_graph: PipelineGraph, input_metadata_dataset_type: DatasetType
    ) -> tuple[str, str]:
        """Add a single configuration of `GatherResourceUsageTask` to a
        pipeline graph.

        Parameters
        ----------
        pipeline_graph : `lsst.pipe.base.PipelineGraph`
            Pipeline graph to modify in-place.
        input_metadata_dataset_type : `lsst.daf.butler.DatasetType`
            Dataset type for the task's input dataset, which is the metadata
            output of the task whose resource usage information is being
            extracted.

        Returns
        -------
        gather_task_label : `str`
            Label of the new task in the pipeline.
        gather_dataset_type_name : `str`
            Name of the task's output table dataset type.
        """
        if (m := re.fullmatch(r"^(\w+)_metadata$", input_metadata_dataset_type.name)) is None:
            return
        elif "gatherResourceUsage" in input_metadata_dataset_type.name:
            return
        else:
            input_task_label = m.group(1)
        gather_task_label = f"{input_task_label}_gatherResourceUsage"
        gather_dataset_type_name = f"{input_task_label}_resource_usage"
        gather_config = GatherResourceUsageConfig()
        gather_config.dimensions = input_metadata_dataset_type.dimensions.names
        gather_config.connections.input_metadata = input_metadata_dataset_type.name
        gather_config.connections.output_table = gather_dataset_type_name
        pipeline_graph.add_task(
            label=gather_task_label,
            task_class=GatherResourceUsageTask,
            config=gather_config,
        )
        return gather_task_label, gather_dataset_type_name

    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        skeleton = QuantumGraphSkeleton(subgraph.tasks.keys())
        consolidate_inputs = []
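        # Each gather task gets a single quantum with an empty data ID that
        # consumes the metadata refs found in __init__; its regular (non-log,
        # non-metadata) outputs are accumulated as inputs for the single
        # consolidate quantum handled in the else branch below.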
        for task_node in subgraph.tasks.values():
            if task_node.task_class is GatherResourceUsageTask:
                quantum_key = skeleton.add_quantum_node(task_node.label, self.empty_data_id)
                skeleton.add_input_edges(quantum_key, self.gather_inputs[task_node.label])
                for write_edge in task_node.iter_all_outputs():
                    output_node = subgraph.dataset_types[write_edge.parent_dataset_type_name]
                    assert (
                        output_node.dimensions == self.universe.empty
                    ), "All outputs should have empty dimensions."
                    gather_output_key = skeleton.add_dataset_node(
                        write_edge.parent_dataset_type_name, self.empty_data_id
                    )
                    skeleton.add_output_edge(quantum_key, gather_output_key)
                    if write_edge.connection_name in task_node.outputs:
                        # Not a special output like metadata or log.
                        consolidate_inputs.append(gather_output_key)
            else:
                assert task_node.task_class is ConsolidateResourceUsageTask
                quantum_key = skeleton.add_quantum_node(task_node.label, self.empty_data_id)
                skeleton.add_input_edges(quantum_key, consolidate_inputs)
                for write_edge in task_node.iter_all_outputs():
                    output_node = subgraph.dataset_types[write_edge.parent_dataset_type_name]
                    assert (
                        output_node.dimensions == self.universe.empty
                    ), "All outputs should have empty dimensions."
                    consolidate_output_key = skeleton.add_dataset_node(
                        write_edge.parent_dataset_type_name, self.empty_data_id
                    )
                    skeleton.add_output_edge(quantum_key, consolidate_output_key)
        # We don't need to do any follow-up searches for output datasets,
        # because the outputs all have empty dimensions and the base
        # QuantumGraphBuilder takes care of those.
        return skeleton

    @classmethod
    def make_argument_parser(cls) -> argparse.ArgumentParser:
        """Make the argument parser for the command-line interface."""
        parser = argparse.ArgumentParser(
            description=(
                "Build a QuantumGraph that gathers and consolidates "
                "resource usage tables from existing metadata datasets."
            ),
        )
        parser.add_argument("repo", type=str, help="Path to data repository or butler configuration.")
        parser.add_argument("filename", type=str, help="Output filename for QuantumGraph.")
        parser.add_argument(
            "collections",
            type=str,
            nargs="+",
            help="Collection(s) to search for input metadata.",
        )
        parser.add_argument(
            "--dataset-types",
            type=str,
            action="extend",
            help="Glob-style patterns for input metadata dataset types.",
        )
        parser.add_argument(
            "--where",
            type=str,
            default="",
            help="Data ID expression used when querying for input metadata datasets.",
        )
        parser.add_argument(
            "--output",
            type=str,
            help=(
                "Name of the output CHAINED collection. If this option is specified and "
                "--output-run is not, then a new RUN collection will be created by appending "
                "a timestamp to the value of this option."
            ),
            default=None,
            metavar="COLL",
        )
        parser.add_argument(
            "--output-run",
            type=str,
            help=(
                "Output RUN collection to write the resulting datasets to. If not provided "
                "then --output must be provided and a new RUN collection will be created "
                "by appending a timestamp to the value passed with --output."
            ),
            default=None,
            metavar="RUN",
        )
        return parser

    @classmethod
    def main(cls) -> None:
        """Run the command-line interface for this quantum-graph builder.

        This function provides the implementation for the
        ``build-gather-resource-usage-qg`` script.
        """
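        # Example invocation (the repository path and collection names here
        # are hypothetical):
        #   build-gather-resource-usage-qg /repo/main resource_usage.qgraph \
        #       u/someone/step1 --output u/someone/resource_usage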
        parser = cls.make_argument_parser()
        args = parser.parse_args()
        # Figure out collection names
        if args.output_run is None:
            if args.output is None:
                raise ValueError("At least one of --output or --output-run options is required.")
            args.output_run = "{}/{}".format(args.output, Instrument.makeCollectionTimestamp())

        butler = Butler(args.repo, collections=args.collections)
        builder = cls(
            butler,
            dataset_type_names=args.dataset_types,
            where=args.where,
            input_collections=args.collections,
            output_run=args.output_run,
        )
        qg: QuantumGraph = builder.build(
            # Metadata includes a subset of attributes defined in CmdLineFwk.
            metadata={
                "input": args.collections,
                "butler_argument": args.repo,
                "output": args.output,
                "output_run": args.output_run,
                "data_query": args.where,
                "time": f"{datetime.datetime.now()}",
            }
        )
        qg.saveUri(args.filename)