Coverage for python/lsst/analysis/tools/tasks/gatherResourceUsage.py: 21%
227 statements
coverage.py v7.4.0, created at 2024-01-23 13:09 +0000

# This file is part of analysis_tools.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = (
    "ConsolidateResourceUsageConfig",
    "ConsolidateResourceUsageConnections",
    "ConsolidateResourceUsageTask",
    "GatherResourceUsageConfig",
    "GatherResourceUsageConnections",
    "GatherResourceUsageTask",
    "ResourceUsageQuantumGraphBuilder",
)

import argparse
import dataclasses
import datetime
import logging
import re
from collections.abc import Iterable, Sequence
from typing import Any

import numpy as np
import pandas as pd
from lsst.daf.butler import Butler, DatasetRef, DatasetType
from lsst.daf.butler.utils import globToRegex
from lsst.pex.config import Field, ListField
from lsst.pipe.base import (
    Instrument,
    PipelineTask,
    PipelineTaskConfig,
    PipelineTaskConnections,
    QuantumGraph,
    Struct,
)
from lsst.pipe.base import connectionTypes as cT
from lsst.pipe.base.pipeline_graph import PipelineGraph
from lsst.pipe.base.quantum_graph_builder import QuantumGraphBuilder
from lsst.pipe.base.quantum_graph_skeleton import DatasetKey, QuantumGraphSkeleton

# It's not great to be importing a private symbol, but this is a temporary
# workaround for the fact that prior to w.2022.10, the units for memory values
# written in task metadata were platform-dependent. Once we no longer care
# about older runs, this import and the code that uses it can be removed.
from lsst.utils.usage import _RUSAGE_MEMORY_MULTIPLIER

_LOG = logging.getLogger(__name__)

class ConsolidateResourceUsageConnections(PipelineTaskConnections, dimensions=()):
    """Connection definitions for `ConsolidateResourceUsageTask`."""

    output_table = cT.Output(
        name="ResourceUsageSummary",
        storageClass="DataFrame",
        dimensions=(),
        doc="Consolidated table of resource usage statistics. One row per task label",
    )

    def __init__(self, *, config):
        super().__init__(config=config)
        for name in self.config.input_names:
            setattr(
                self,
                name,
                cT.Input(
                    name,
                    storageClass="DataFrame",
                    dimensions=(),
                    doc="Resource usage statistics for a task.",
                ),
            )
            self.inputs.add(name)


class ConsolidateResourceUsageConfig(
    PipelineTaskConfig, pipelineConnections=ConsolidateResourceUsageConnections
):
    """Configuration definitions for `ConsolidateResourceUsageTask`."""

    input_names = ListField[str](
        doc="Input resource usage dataset type names",
        default=[],
    )
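
# The block below is an illustrative sketch added for this write-up; it is not
# part of the original module.  It shows how the dynamic input connections of
# ConsolidateResourceUsageConnections follow the configured ``input_names``:
# one DataFrame input is created per listed dataset type name.  The dataset
# type names used here are assumed placeholders.
def _example_consolidate_config_sketch() -> ConsolidateResourceUsageConfig:
    config = ConsolidateResourceUsageConfig()
    # Each name listed here becomes an input connection (and a keyword
    # argument to ConsolidateResourceUsageTask.run).
    config.input_names = ["isr_resource_usage", "calibrate_resource_usage"]
    return config
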

class ConsolidateResourceUsageTask(PipelineTask):
    """A `PipelineTask` that summarizes task resource usage into a single
    table with per-task rows.

    Notes
    -----
    This is an unusual `PipelineTask` in that its input connection has
    dynamic dimensions, and its quanta are generally built via a custom
    quantum-graph builder defined in the same module.
    """

    ConfigClass = ConsolidateResourceUsageConfig
    _DefaultName = "consolidateResourceUsage"

    def run(self, **kwargs: Any) -> Struct:
        quantiles = []
        for input_name, ru_table in kwargs.items():
            if not input_name.endswith("resource_usage"):
                continue
            else:
                df = ru_table.quantile(
                    [0.0, 0.01, 0.05, 0.32, 0.50, 0.68, 0.95, 0.99, 1.0],
                    numeric_only=True,
                ).reset_index()
                df["task"] = input_name.replace("_resource_usage", "")
                df["quanta"] = len(ru_table)
                df["integrated_runtime"] = ru_table["run_time"].sum()

                quantiles.append(
                    df[
                        [
                            "index",
                            "quanta",
                            "task",
                            "memory",
                            "init_time",
                            "run_time",
                            "integrated_runtime",
                        ]
                    ]
                )

        full_quantiles = pd.concat(quantiles)
        full_quantiles["percentile"] = (full_quantiles["index"] * 100).astype(int)
        full_quantiles["percentile_name"] = "p" + full_quantiles["percentile"].astype(str).str.zfill(3)
        full_quantiles["memoryGB"] = full_quantiles["memory"] / 1024 / 1024 / 1024
        full_quantiles["integrated_runtime_hrs"] = full_quantiles["integrated_runtime"] / 3600.0
        memoryGB = pd.pivot_table(
            full_quantiles, values="memoryGB", columns=["percentile_name"], index=["task"]
        ).add_prefix("mem_GB_")
        runtime = pd.pivot_table(
            full_quantiles, values="run_time", columns=["percentile_name"], index=["task"]
        ).add_prefix("runtime_s_")
        memrun = pd.merge(
            memoryGB.reset_index(),
            runtime.reset_index(),
            left_on="task",
            right_on="task",
        )
        memrun = pd.merge(
            full_quantiles[["task", "quanta", "integrated_runtime_hrs"]]
            .drop_duplicates()
            .sort_values("task"),
            memrun,
        )

        return Struct(output_table=memrun)
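
# The block below is an illustrative sketch added for this write-up; it is not
# part of the original module.  It demonstrates, on a toy per-quantum resource
# table, the quantile -> pivot pattern that ConsolidateResourceUsageTask.run
# uses to reduce many per-quantum rows to one summary row per task label.  The
# task label ("isr") and all numeric values are made-up examples.
def _example_consolidate_sketch() -> pd.DataFrame:
    ru_table = pd.DataFrame({"memory": [2.0e9, 3.5e9, 2.8e9], "run_time": [30.0, 45.0, 38.0]})
    df = ru_table.quantile([0.0, 0.5, 1.0], numeric_only=True).reset_index()
    df["task"] = "isr"
    df["percentile_name"] = "p" + (df["index"] * 100).astype(int).astype(str).str.zfill(3)
    df["memoryGB"] = df["memory"] / 1024 / 1024 / 1024
    # Pivoting on the percentile name yields one row per task label with one
    # column per percentile, e.g. mem_GB_p000, mem_GB_p050, mem_GB_p100.
    return pd.pivot_table(
        df, values="memoryGB", columns=["percentile_name"], index=["task"]
    ).add_prefix("mem_GB_")
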

class GatherResourceUsageConnections(
    PipelineTaskConnections, dimensions=(), defaultTemplates={"input_task_label": "PLACEHOLDER"}
):
    """Connection definitions for `GatherResourceUsageTask`."""

    output_table = cT.Output(
        "{input_task_label}_resource_statistics",  # Should always be overridden.
        storageClass="DataFrame",
        dimensions=(),
        doc=(
            "Table that aggregates memory and CPU usage statistics from one "
            "or more tasks. "
            "This will have one row for each data ID, with columns for each "
            "task or method's memory usage and runtime."
        ),
    )
    input_metadata = cT.Input(
        "{input_task_label}_metadata",  # Should always be overridden.
        storageClass="TaskMetadata",
        dimensions=(),  # Actually set in __init__, according to configuration.
        doc="Metadata dataset for another task to gather resource usage from.",
        multiple=True,
        deferLoad=True,
    )

    def __init__(self, *, config):
        super().__init__(config=config)
        if "PLACEHOLDER" in self.output_table.name:
            raise ValueError("Connection configuration for output_table must be overridden.")
        if "PLACEHOLDER" in self.input_metadata.name:
            raise ValueError("Connection configuration for input_metadata must be overridden.")
        # Override the empty dimension set the connection was defined with,
        # using the dimensions the task was configured with.
        self.input_metadata = dataclasses.replace(
            self.input_metadata,
            dimensions=list(self.config.dimensions),
        )

class GatherResourceUsageConfig(PipelineTaskConfig, pipelineConnections=GatherResourceUsageConnections):
    """Configuration definitions for `GatherResourceUsageTask`."""

    dimensions = ListField[str](
        doc=(
            "The quantum dimensions for the input metadata connection, and "
            "the columns (after expansion to include implied dimensions) used "
            "to identify rows in the output table."
        ),
    )
    memory = Field[bool](
        doc=(
            "Whether to extract peak memory usage (maximum resident set size) "
            "for this task. "
            "Note that memory usage cannot be further subdivided because only "
            "a per-process peak is available (and hence if multiple quanta "
            "are run in one process, even per-quantum values may be "
            "misleading)."
        ),
        default=True,
    )
    prep_time = Field[bool](
        doc=(
            "Whether to extract the CPU time duration for the work the "
            "middleware does prior to initializing the task (mostly checking "
            "for input dataset existence)."
        ),
        default=False,
    )
    init_time = Field[bool](
        doc="Whether to extract the CPU time duration for actually constructing the task.",
        default=True,
    )
    run_time = Field[bool](
        doc="Whether to extract the CPU time duration for actually executing the task.",
        default=True,
    )
    method_times = ListField[str](
        doc=(
            "Names of @lsst.utils.timer.timeMethod-decorated methods for "
            "which CPU time durations should also be extracted. Use '.' "
            "separators to refer to subtask methods at arbitrary depth."
        ),
        optional=False,
        default=[],
    )
    input_task_label = Field[str](
        doc=(
            "Label for the top-level task whose metadata is being processed "
            "within its own metadata file, if this differs from the prefix of "
            "connections.input_metadata."
        ),
        default=None,
        optional=True,
    )
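
# The block below is an illustrative sketch added for this write-up; it is not
# part of the original module.  It shows how a single gather-task configuration
# could be filled in by hand, mirroring what
# ResourceUsageQuantumGraphBuilder._add_gather_task does for each metadata
# dataset type it finds.  The dataset type names and dimensions used here are
# assumed placeholders.
def _example_gather_config_sketch() -> GatherResourceUsageConfig:
    config = GatherResourceUsageConfig()
    # Quantum dimensions of the metadata being harvested; these also become
    # the identifying columns of the output table.
    config.dimensions = ["instrument", "visit", "detector"]
    # Point the dynamic connections at a specific task's metadata and at the
    # per-task resource usage table to produce.
    config.connections.input_metadata = "isr_metadata"
    config.connections.output_table = "isr_resource_usage"
    return config
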

class GatherResourceUsageTask(PipelineTask):
    """A `PipelineTask` that gathers resource usage statistics from task
    metadata.

    Notes
    -----
    This is an unusual `PipelineTask` in that its input connection has
    dynamic dimensions.

    Its output table has columns for each of the dimensions of the input
    metadata's data ID, as well as (subject to configuration):

    - ``memory``: the maximum resident set size for the entire quantum
      (in bytes);
    - ``prep_time``: the time spent in the pre-initialization step in
      which the middleware checks which of the quantum's inputs are available;
    - ``init_time``: the time spent in task construction;
    - ``run_time``: the time spent executing the task's runQuantum
      method;
    - ``{method}``: the time spent in a particular task or subtask
      method decorated with `lsst.utils.timer.timeMethod`.

    All time durations are CPU times in seconds, and all columns are 64-bit
    floating point.  Methods or steps that did not run are given a duration of
    zero.

    It is expected that this task will be configured to run multiple times in
    most pipelines, often once for each other task in the pipeline.
    """

    ConfigClass = GatherResourceUsageConfig
    _DefaultName = "gatherResourceUsage"

    def runQuantum(
        self,
        butlerQC,
        inputRefs,
        outputRefs,
    ):
        # Docstring inherited.
        # This override exists just so we can pass the butler registry's
        # DimensionUniverse to `run` in order to standardize the dimensions.
        inputs = butlerQC.get(inputRefs)
        outputs = self.run(butlerQC.dimensions, **inputs)
        butlerQC.put(outputs, outputRefs)

    def run(self, universe, input_metadata):
        """Gather resource usage statistics from per-quantum metadata.

        Parameters
        ----------
        universe : `DimensionUniverse`
            Object managing all dimensions recognized by the butler; used to
            standardize and expand `GatherResourceUsageConfig.dimensions`.
        input_metadata : `list` [ `DeferredDatasetHandle` ]
            List of `lsst.daf.butler.DeferredDatasetHandle` that can be used
            to load all input metadata datasets.

        Returns
        -------
        result : `Struct`
            Structure with a single element:

            - ``output_table``: a `pandas.DataFrame` that aggregates the
              configured resource usage statistics.
        """
        dimensions = universe.conform(self.config.dimensions)
        # Transform input list into a dict keyed by data ID.
        handles_by_data_id = {}
        for handle in input_metadata:
            handles_by_data_id[handle.dataId] = handle
        n_rows = len(handles_by_data_id)
        # Create a dict of empty column arrays that we'll ultimately make into
        # a table.
        columns = {
            d: np.zeros(n_rows, dtype=_dtype_from_field_spec(universe.dimensions[d].primaryKey))
            for d in dimensions.names
        }
        for attr_name in ("memory", "prep_time", "init_time", "run_time"):
            if getattr(self.config, attr_name):
                columns[attr_name] = np.zeros(n_rows, dtype=float)
        for method_name in self.config.method_times:
            columns[method_name] = np.zeros(n_rows, dtype=float)
        # Populate the table, one row at a time.
        warned_about_metadata_version = False
        for index, (data_id, handle) in enumerate(handles_by_data_id.items()):
            # Fill in the data ID columns.
            for k, v in data_id.mapping.items():
                columns[k][index] = v
            # Load the metadata dataset and fill in the columns derived from
            # it.
            metadata = handle.get()
            try:
                quantum_metadata = metadata["quantum"]
            except KeyError:
                self.log.warning(
                    "Metadata dataset %s @ %s has no 'quantum' key.",
                    handle.ref.datasetType.name,
                    handle.dataId,
                )
            else:
                if self.config.memory:
                    columns["memory"][index], warned_about_metadata_version = self._extract_memory(
                        quantum_metadata,
                        handle,
                        warned_about_metadata_version,
                    )
                for key, value in self._extract_quantum_timing(quantum_metadata).items():
                    columns[key][index] = value
                for key, value in self._extract_method_timing(metadata, handle).items():
                    columns[key][index] = value
        return Struct(output_table=pd.DataFrame(columns, copy=False))

    def _extract_memory(self, quantum_metadata, handle, warned_about_metadata_version):
        """Extract maximum memory usage from quantum metadata.

        Parameters
        ----------
        quantum_metadata : `lsst.pipe.base.TaskMetadata`
            The nested metadata associated with the label "quantum" inside a
            PipelineTask's metadata.
        handle : `lsst.daf.butler.DeferredDatasetHandle`
            Butler handle for the metadata dataset; used to identify the
            metadata in diagnostic messages only.
        warned_about_metadata_version : `bool`
            Whether we have already emitted at least one warning about old
            metadata versions.

        Returns
        -------
        memory : `float`
            Maximum memory usage in bytes.
        warned_about_metadata_version : `bool`
            Whether we have now emitted at least one warning about old
            metadata versions.
        """
        # Attempt to work around memory units being platform-dependent for
        # metadata written prior to w.2022.10.
        memory_multiplier = 1
        if quantum_metadata.get("__version__", 0) < 1:
            memory_multiplier = _RUSAGE_MEMORY_MULTIPLIER
            msg = (
                "Metadata dataset %s @ %s is too old; guessing memory units by "
                "assuming the platform has not changed"
            )
            if not warned_about_metadata_version:
                self.log.warning(msg, handle.ref.datasetType.name, handle.dataId)
                self.log.warning(
                    "Warnings about memory units for other inputs will be emitted only at DEBUG level."
                )
                warned_about_metadata_version = True
            else:
                self.log.debug(msg, handle.ref.datasetType.name, handle.dataId)
        return (
            quantum_metadata["endMaxResidentSetSize"] * memory_multiplier,
            warned_about_metadata_version,
        )

    def _extract_quantum_timing(self, quantum_metadata):
        """Extract timing for standard PipelineTask quantum-execution steps
        from metadata.

        Parameters
        ----------
        quantum_metadata : `lsst.pipe.base.TaskMetadata`
            The nested metadata associated with the label "quantum" inside a
            PipelineTask's metadata.

        Returns
        -------
        timing : `dict` [ `str`, `float` ]
            CPU times in seconds, for all stages enabled in configuration.
        """
        end_time = quantum_metadata["endCpuTime"]
        times = [
            quantum_metadata["prepCpuTime"],
            quantum_metadata.get("initCpuTime", end_time),
            quantum_metadata.get("startCpuTime", end_time),
            end_time,
        ]
        return {
            attr_name: end - begin
            for attr_name, begin, end in zip(
                ["prep_time", "init_time", "run_time"],
                times[:-1],
                times[1:],
            )
            if getattr(self.config, attr_name)
        }

    def _extract_method_timing(self, metadata, handle):
        """Extract timing for `lsst.utils.timer.timeMethod`-decorated task and
        subtask methods from metadata.

        Parameters
        ----------
        metadata : `lsst.pipe.base.TaskMetadata`
            The full metadata for a PipelineTask's quantum.
        handle : `lsst.daf.butler.DeferredDatasetHandle`
            Butler handle for the metadata dataset; used to infer the prefix
            used for method names within the metadata.

        Returns
        -------
        timing : `dict` [ `str`, `float` ]
            CPU times in seconds, for all methods enabled in configuration.
        """
        if self.config.input_task_label is not None:
            task_label = self.config.input_task_label
        else:
            task_label = handle.ref.datasetType.name[: -len("_metadata")]
        result = {}
        for method_name in self.config.method_times:
            terms = [task_label] + list(method_name.split("."))
            metadata_method_name = ":".join(terms[:-1]) + "." + terms[-1]
            try:
                method_start_time = metadata[f"{metadata_method_name}StartCpuTime"]
                method_end_time = metadata[f"{metadata_method_name}EndCpuTime"]
            except KeyError:
                # A method missing from the metadata is not a problem; it's
                # reasonable for configuration or even runtime logic to result
                # in a method not being called.  When that happens, we just
                # let the times stay zero.
                pass
            else:
                result[f"{task_label}.{method_name}"] = method_end_time - method_start_time
        return result
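
# The block below is an illustrative sketch added for this write-up; it is not
# part of the original module.  It shows that the per-stage durations extracted
# by GatherResourceUsageTask._extract_quantum_timing are just successive
# differences of the CPU-time checkpoints recorded in the "quantum" section of
# a task's metadata.  The checkpoint values below are made up.
def _example_quantum_timing_sketch() -> dict[str, float]:
    checkpoints = {"prepCpuTime": 10.0, "initCpuTime": 12.0, "startCpuTime": 13.5, "endCpuTime": 73.5}
    times = [
        checkpoints["prepCpuTime"],
        checkpoints["initCpuTime"],
        checkpoints["startCpuTime"],
        checkpoints["endCpuTime"],
    ]
    # Adjacent differences: prep_time = 2.0, init_time = 1.5, run_time = 60.0.
    return {
        name: end - begin
        for name, begin, end in zip(["prep_time", "init_time", "run_time"], times[:-1], times[1:])
    }
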

def _dtype_from_field_spec(field_spec):
    """Return the `np.dtype` that can be used to hold the values of a butler
    dimension field.

    Parameters
    ----------
    field_spec : `lsst.daf.butler.core.ddl.FieldSpec`
        Object describing the field in a SQL-friendly sense.

    Returns
    -------
    dtype : `np.dtype`
        Numpy data type description.
    """
    python_type = field_spec.getPythonType()
    if python_type is str:
        return np.dtype((str, field_spec.length))
    else:
        return np.dtype(python_type)
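
# The block below is an illustrative sketch added for this write-up; it is not
# part of the original module.  It shows the kind of column pre-allocation that
# GatherResourceUsageTask.run performs with _dtype_from_field_spec: string
# dimension keys get fixed-width numpy string dtypes, integer keys get integer
# dtypes, and resource columns are 64-bit floats initialized to zero so that
# stages that never ran report a duration of zero.  The dimension names and
# dtype widths here are assumed placeholders.
def _example_column_allocation_sketch(n_rows: int) -> dict[str, np.ndarray]:
    dimension_dtypes = {
        "instrument": np.dtype((str, 32)),
        "visit": np.dtype(np.int64),
        "detector": np.dtype(np.int64),
    }
    columns = {name: np.zeros(n_rows, dtype=dtype) for name, dtype in dimension_dtypes.items()}
    for name in ("memory", "prep_time", "init_time", "run_time"):
        columns[name] = np.zeros(n_rows, dtype=float)
    return columns
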

class ResourceUsageQuantumGraphBuilder(QuantumGraphBuilder):
    """Custom quantum graph generator and pipeline builder for resource
    usage summary tasks.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        Butler client to query for inputs and dataset types.
    dataset_type_names : `~collections.abc.Iterable` [ `str` ], optional
        Iterable of dataset type names or shell-style glob patterns for the
        metadata datasets to be used as input.  Default is all datasets ending
        with ``_metadata`` (other than the resource-usage summary tasks' own
        metadata outputs, which are always ignored).  A gather-resource task
        with a single quantum is created for each matching metadata dataset.
    where : `str`, optional
        Data ID expression that constrains the input metadata datasets.
    input_collections : `~collections.abc.Sequence` [ `str` ], optional
        Sequence of collections to search for inputs.  If not provided,
        ``butler.collections`` is used and must not be empty.
    output_run : `str`, optional
        Output `~lsst.daf.butler.CollectionType.RUN` collection name.  If not
        provided, ``butler.run`` is used and must not be `None`.
    skip_existing_in : `~collections.abc.Sequence` [ `str` ], optional
        Sequence of collections to search for outputs, allowing quanta whose
        outputs exist to be skipped.
    clobber : `bool`, optional
        Whether *execution* of this quantum graph will permit clobbering.  If
        `False` (default), existing outputs in ``output_run`` are an error
        unless ``skip_existing_in`` will cause those quanta to be skipped.

    Notes
    -----
    The resource usage summary tasks cannot easily be added to a regular
    pipeline, as it's much more natural to have the gather tasks run
    automatically on all *other* tasks.  And we can generate a quantum graph
    for these particular tasks much more efficiently than the general-purpose
    algorithm could.
    """

    def __init__(
        self,
        butler: Butler,
        *,
        dataset_type_names: Iterable[str] | None = None,
        where: str = "",
        input_collections: Sequence[str] | None = None,
        output_run: str | None = None,
        skip_existing_in: Sequence[str] = (),
        clobber: bool = False,
    ):
        # Start by querying for metadata datasets, since we'll need to know
        # which dataset types exist in the input collections in order to
        # build the pipeline.
        input_dataset_types: Any
        if not dataset_type_names:
            base_dataset_type_filter = re.compile(r"\w+_metadata")
            input_dataset_types = base_dataset_type_filter
        else:
            input_dataset_types = [globToRegex(expr) for expr in dataset_type_names]
        pipeline_graph = PipelineGraph()
        metadata_refs: dict[str, set[DatasetRef]] = {}
        consolidate_config = ConsolidateResourceUsageConfig()
        for results in butler.registry.queryDatasets(
            input_dataset_types,
            where=where,
            findFirst=True,
            collections=input_collections,
        ).byParentDatasetType():
            input_metadata_dataset_type = results.parentDatasetType
            refs_for_type = set(results)
            if refs_for_type:
                gather_task_label, gather_dataset_type_name = self._add_gather_task(
                    pipeline_graph, input_metadata_dataset_type
                )
                metadata_refs[gather_task_label] = refs_for_type
                consolidate_config.input_names.append(gather_dataset_type_name)
        pipeline_graph.add_task(
            task_class=ConsolidateResourceUsageTask,
            config=consolidate_config,
            label=ConsolidateResourceUsageTask._DefaultName,
        )
        # Now that we have the pipeline graph, we can delegate to super.
        super().__init__(
            pipeline_graph,
            butler,
            input_collections=input_collections,
            output_run=output_run,
            skip_existing_in=skip_existing_in,
            clobber=clobber,
        )
        # We've already queried for all of our input datasets, so we don't
        # want to do that again in process_subgraph, even though that's where
        # most QG builders do their queries.
        self.gather_inputs: dict[str, list[DatasetKey]] = {}
        for gather_task_label, gather_input_refs in metadata_refs.items():
            gather_inputs_for_task: list[DatasetKey] = []
            for ref in gather_input_refs:
                dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
                self.existing_datasets.inputs[dataset_key] = ref
                gather_inputs_for_task.append(dataset_key)
            self.gather_inputs[gather_task_label] = gather_inputs_for_task

    @classmethod
    def _add_gather_task(
        cls, pipeline_graph: PipelineGraph, input_metadata_dataset_type: DatasetType
    ) -> tuple[str, str]:
        """Add a single configuration of `GatherResourceUsageTask` to a
        pipeline graph.

        Parameters
        ----------
        pipeline_graph : `lsst.pipe.base.PipelineGraph`
            Pipeline graph to modify in-place.
        input_metadata_dataset_type : `lsst.daf.butler.DatasetType`
            Dataset type for the task's input dataset, which is the metadata
            output of the task whose resource usage information is being
            extracted.

        Returns
        -------
        gather_task_label : `str`
            Label of the new task in the pipeline.
        gather_dataset_type_name : `str`
            Name of the task's output table dataset type.
        """
        if (m := re.fullmatch(r"^(\w+)_metadata$", input_metadata_dataset_type.name)) is None:
            return
        elif "gatherResourceUsage" in input_metadata_dataset_type.name:
            return
        else:
            input_task_label = m.group(1)
            gather_task_label = f"{input_task_label}_gatherResourceUsage"
            gather_dataset_type_name = f"{input_task_label}_resource_usage"
            gather_config = GatherResourceUsageConfig()
            gather_config.dimensions = input_metadata_dataset_type.dimensions.names
            gather_config.connections.input_metadata = input_metadata_dataset_type.name
            gather_config.connections.output_table = gather_dataset_type_name
            pipeline_graph.add_task(
                label=gather_task_label,
                task_class=GatherResourceUsageTask,
                config=gather_config,
            )
            return gather_task_label, gather_dataset_type_name

    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        skeleton = QuantumGraphSkeleton(subgraph.tasks.keys())
        consolidate_inputs = []
        for task_node in subgraph.tasks.values():
            if task_node.task_class is GatherResourceUsageTask:
                quantum_key = skeleton.add_quantum_node(task_node.label, self.empty_data_id)
                skeleton.add_input_edges(quantum_key, self.gather_inputs[task_node.label])
                for write_edge in task_node.iter_all_outputs():
                    output_node = subgraph.dataset_types[write_edge.parent_dataset_type_name]
                    assert (
                        output_node.dimensions == self.universe.empty
                    ), "All outputs should have empty dimensions."
                    gather_output_key = skeleton.add_dataset_node(
                        write_edge.parent_dataset_type_name, self.empty_data_id
                    )
                    skeleton.add_output_edge(quantum_key, gather_output_key)
                    if write_edge.connection_name in task_node.outputs:
                        # Not a special output like metadata or log.
                        consolidate_inputs.append(gather_output_key)
            else:
                assert task_node.task_class is ConsolidateResourceUsageTask
                quantum_key = skeleton.add_quantum_node(task_node.label, self.empty_data_id)
                skeleton.add_input_edges(quantum_key, consolidate_inputs)
                for write_edge in task_node.iter_all_outputs():
                    output_node = subgraph.dataset_types[write_edge.parent_dataset_type_name]
                    assert (
                        output_node.dimensions == self.universe.empty
                    ), "All outputs should have empty dimensions."
                    consolidate_output_key = skeleton.add_dataset_node(
                        write_edge.parent_dataset_type_name, self.empty_data_id
                    )
                    skeleton.add_output_edge(quantum_key, consolidate_output_key)
        # We don't need to do any follow-up searches for output datasets,
        # because the outputs all have empty dimensions and the base
        # QuantumGraphBuilder takes care of those.
        return skeleton

    @classmethod
    def make_argument_parser(cls) -> argparse.ArgumentParser:
        """Make the argument parser for the command-line interface."""
        parser = argparse.ArgumentParser(
            description=(
                "Build a QuantumGraph that gathers and consolidates "
                "resource usage tables from existing metadata datasets."
            ),
        )
        parser.add_argument("repo", type=str, help="Path to data repository or butler configuration.")
        parser.add_argument("filename", type=str, help="Output filename for QuantumGraph.")
        parser.add_argument(
            "collections",
            type=str,
            nargs="+",
            help="Collection(s) to search for input metadata.",
        )
        parser.add_argument(
            "--dataset-types",
            type=str,
            action="extend",
            help="Glob-style patterns for input metadata dataset types.",
        )
        parser.add_argument(
            "--where",
            type=str,
            default="",
            help="Data ID expression used when querying for input metadata datasets.",
        )
        parser.add_argument(
            "--output",
            type=str,
            help=(
                "Name of the output CHAINED collection. If this option is specified and "
                "--output-run is not, then a new RUN collection will be created by appending "
                "a timestamp to the value of this option."
            ),
            default=None,
            metavar="COLL",
        )
        parser.add_argument(
            "--output-run",
            type=str,
            help=(
                "Output RUN collection for the resulting resource usage tables. If not provided, "
                "then --output must be provided and a new RUN collection will be created "
                "by appending a timestamp to the value passed with --output."
            ),
            default=None,
            metavar="RUN",
        )
        return parser

    @classmethod
    def main(cls) -> None:
        """Run the command-line interface for this quantum-graph builder.

        This function provides the implementation for the
        ``build-gather-resource-usage-qg`` script.
        """
        parser = cls.make_argument_parser()
        args = parser.parse_args()
        # Figure out collection names.
        if args.output_run is None:
            if args.output is None:
                raise ValueError("At least one of --output or --output-run options is required.")
            args.output_run = "{}/{}".format(args.output, Instrument.makeCollectionTimestamp())

        butler = Butler(args.repo, collections=args.collections)
        builder = cls(
            butler,
            dataset_type_names=args.dataset_types,
            where=args.where,
            input_collections=args.collections,
            output_run=args.output_run,
        )
        qg: QuantumGraph = builder.build(
            # Metadata includes a subset of attributes defined in CmdLineFwk.
            metadata={
                "input": args.collections,
                "butler_argument": args.repo,
                "output": args.output,
                "output_run": args.output_run,
                "data_query": args.where,
                "time": f"{datetime.datetime.now()}",
            }
        )
        qg.saveUri(args.filename)
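
# The block below is an illustrative sketch added for this write-up; it is not
# part of the original module.  It shows how the quantum graph might be built
# programmatically, mirroring what main() does for the
# build-gather-resource-usage-qg script.  The repository path, collection
# names, and output filename are assumed placeholders.
def _example_build_resource_usage_qg_sketch() -> None:
    butler = Butler("/path/to/repo", collections=["u/someone/processing-run"])
    builder = ResourceUsageQuantumGraphBuilder(
        butler,
        input_collections=["u/someone/processing-run"],
        output_run="u/someone/resource-usage-run",
    )
    qg = builder.build(metadata={"output_run": "u/someone/resource-usage-run"})
    qg.saveUri("resource_usage.qgraph")


# A roughly equivalent command-line invocation (again with placeholder paths
# and collection names) would be:
#
#   build-gather-resource-usage-qg /path/to/repo resource_usage.qgraph \
#       u/someone/processing-run --output-run u/someone/resource-usage-run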