Coverage for python/lsst/pipe/base/execution_reports.py: 28%

130 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-03-14 10:49 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "QuantumGraphExecutionReport", 

25 "TaskExecutionReport", 

26 "DatasetTypeExecutionReport", 

27 "lookup_quantum_data_id", 

28) 

29 

30import dataclasses 

31import itertools 

32import logging 

33import uuid 

34from collections.abc import Iterable, Mapping 

35from typing import Any 

36 

37import networkx 

38import yaml 

39from lsst.daf.butler import Butler, DataCoordinate, DatasetRef 

40from lsst.resources import ResourcePathExpression 

41 

42from .graph import QuantumGraph, QuantumNode 

43from .pipeline import PipelineDatasetTypes 

44 

45 

@dataclasses.dataclass
class DatasetTypeExecutionReport:
    """Summary of dataset production for a single dataset type.

    One instance is created for each `~lsst.daf.butler.DatasetType` inside a
    `TaskExecutionReport`, recording how many datasets were produced and why
    the missing ones are absent.
    """

    failed: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets not produced because their quanta failed directly in this
    run (`set`).
    """

    not_produced: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Missing datasets which were not produced by successful quanta.
    """

    blocked: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets not produced due to an upstream failure (`set`).
    """

    n_produced: int = 0
    """Count of datasets produced (`int`).
    """

    def to_summary_dict(self) -> dict[str, Any]:
        r"""Summarize the DatasetTypeExecutionReport in a dictionary.

        Returns
        -------
        summary_dict : `dict`
            A count of the datasets with each outcome: the number
            ``produced``, plus the sizes of the ``failed``,
            ``not_produced``, and ``blocked`` sets.  See the attribute
            descriptions above for the meaning of each outcome.
        """
        # Only counts are reported; the refs themselves remain available on
        # the attributes for callers that need details.
        summary: dict[str, Any] = {"produced": self.n_produced}
        for outcome in ("failed", "not_produced", "blocked"):
            summary[outcome] = len(getattr(self, outcome))
        return summary

89 

90 

@dataclasses.dataclass
class TaskExecutionReport:
    """A report on the status and content of a task in an executed quantum
    graph.

    Use task metadata to identify and inspect failures and report on output
    datasets.

    See Also
    --------
    QuantumGraphExecutionReport : Quantum graph report.
    DatasetTypeExecutionReport : DatasetType report.
    """

    failed: dict[uuid.UUID, DatasetRef] = dataclasses.field(default_factory=dict)
    """A mapping from quantum graph node ID to log dataset reference for
    quanta that failed directly in this run (`dict`).
    """

    n_succeeded: int = 0
    """A count of successful quanta.

    This may include quanta that did not produce any datasets; ie, raised
    `NoWorkFound`.
    """

    blocked: dict[uuid.UUID, DataCoordinate] = dataclasses.field(default_factory=dict)
    """A mapping of data IDs of quanta that were not attempted due to an
    upstream failure (`dict`).
    """

    output_datasets: dict[str, DatasetTypeExecutionReport] = dataclasses.field(default_factory=dict)
    """Missing and produced outputs of each `~lsst.daf.butler.DatasetType`
    (`dict`).
    """

    def inspect_quantum(
        self,
        quantum_node: QuantumNode,
        status_graph: networkx.DiGraph,
        refs: Mapping[str, Mapping[uuid.UUID, DatasetRef]],
        metadata_name: str,
        log_name: str,
    ) -> None:
        """Inspect a quantum of a quantum graph and ascertain the status of
        each associated data product.

        Parameters
        ----------
        quantum_node : `QuantumNode`
            The specific node of the quantum graph to be inspected.
        status_graph : `networkx.DiGraph`
            The quantum graph produced by
            `QuantumGraphExecutionReport.make_reports` which steps through the
            quantum graph of a run and logs the status of each quantum.
        refs : `~collections.abc.Mapping` [ `str`,\
            `~collections.abc.Mapping` [ `uuid.UUID`,\
            `~lsst.daf.butler.DatasetRef` ] ]
            The DatasetRefs of each of the DatasetTypes produced by the task.
            Includes initialization, intermediate and output data products.
        metadata_name : `str`
            The metadata dataset name for the node.
        log_name : `str`
            The name of the log files for the node.

        See Also
        --------
        QuantumGraphExecutionReport.make_reports : Make reports.
        """
        quantum = quantum_node.quantum
        (metadata_ref,) = quantum.outputs[metadata_name]
        (log_ref,) = quantum.outputs[log_name]
        blocked = False
        if metadata_ref.id not in refs[metadata_name]:
            # Metadata was never written, so this quantum did not succeed.
            # Distinguish quanta blocked by an upstream failure from quanta
            # that failed directly.
            if any(
                status_graph.nodes[upstream_quantum_id]["failed"]
                for upstream_dataset_id in status_graph.predecessors(quantum_node.nodeId)
                for upstream_quantum_id in status_graph.predecessors(upstream_dataset_id)
            ):
                assert quantum.dataId is not None
                self.blocked[quantum_node.nodeId] = quantum.dataId
                blocked = True
            else:
                # Note: log_ref may or may not actually exist.
                self.failed[quantum_node.nodeId] = log_ref
            failed = True
        else:
            failed = False
            self.n_succeeded += 1
        # Record the status so downstream quanta can detect blockage.
        status_graph.nodes[quantum_node.nodeId]["failed"] = failed

        # Now, loop over the datasets to make a DatasetTypeExecutionReport.
        for output_ref in itertools.chain.from_iterable(quantum.outputs.values()):
            if output_ref == metadata_ref or output_ref == log_ref:
                continue
            if (dataset_type_report := self.output_datasets.get(output_ref.datasetType.name)) is None:
                dataset_type_report = DatasetTypeExecutionReport()
                self.output_datasets[output_ref.datasetType.name] = dataset_type_report
            if output_ref.id not in refs[output_ref.datasetType.name]:
                if failed:
                    if blocked:
                        dataset_type_report.blocked.add(output_ref)
                    else:
                        dataset_type_report.failed.add(output_ref)
                else:
                    # The quantum succeeded but this output is missing; it
                    # was presumably a deliberate no-op (e.g. NoWorkFound).
                    dataset_type_report.not_produced.add(output_ref)
            else:
                dataset_type_report.n_produced += 1

    def to_summary_dict(
        self, butler: Butler, do_store_logs: bool = True, human_readable: bool = False
    ) -> dict[str, Any]:
        """Summarize the results of the TaskExecutionReport in a dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        human_readable : `bool`
            Store more human-readable information to be printed out to the
            command-line.

        Returns
        -------
        summary_dict : `dict`
            A dictionary containing:

            - outputs: A dictionary summarizing the
              DatasetTypeExecutionReport for each DatasetType associated with
              the task.
            - failed_quanta: A dictionary of quanta which failed and their
              dataIDs by quantum graph node id.
            - n_quanta_blocked: The number of quanta which failed due to
              upstream failures.
            - n_succeeded: The number of quanta which succeeded.

            And additionally, if ``human_readable`` is `True`:

            - errors: A dictionary, keyed by quantum graph node id, of the
              data ID and (if ``do_store_logs``) error messages of each
              failed quantum.  Without ``human_readable``, the same
              information is stored directly in ``failed_quanta``.
        """
        failed_quanta: dict[str, Any] = {}
        errors: dict[str, Any] = {}
        for node_id, log_ref in self.failed.items():
            data_ids = dict(log_ref.dataId.required)
            quantum_info: dict[str, Any] = {"data_id": data_ids}
            if do_store_logs:
                try:
                    log = butler.get(log_ref)
                except LookupError:
                    # The log dataset was never registered.
                    quantum_info["error"] = []
                except FileNotFoundError:
                    # The log dataset is registered, but its file is gone.
                    quantum_info["error"] = None
                else:
                    quantum_info["error"] = [
                        record.message for record in log if record.levelno >= logging.ERROR
                    ]
            if human_readable:
                # Keep the top-level entry terse and gather the full error
                # details separately.  This is done for *every* failed
                # quantum; the previous implementation returned from inside
                # this loop and so reported at most one.
                failed_quanta[str(node_id)] = {"data_id": data_ids}
                errors[str(node_id)] = quantum_info
            else:
                failed_quanta[str(node_id)] = quantum_info
        summary: dict[str, Any] = {
            "outputs": {name: r.to_summary_dict() for name, r in self.output_datasets.items()},
            "failed_quanta": failed_quanta,
            "n_quanta_blocked": len(self.blocked),
            "n_succeeded": self.n_succeeded,
        }
        if human_readable:
            summary["errors"] = errors
        return summary

    def __str__(self) -> str:
        """Return a count of the failed and blocked tasks in the
        TaskExecutionReport.
        """
        return f"failed: {len(self.failed)}\nblocked: {len(self.blocked)}\n"

274 

275 

@dataclasses.dataclass
class QuantumGraphExecutionReport:
    """A report on the execution of a quantum graph.

    Reports the detailed status of each failure: whether tasks were not run,
    data is missing from upstream failures, or specific errors occurred during
    task execution (and reports those errors).  Also counts the produced
    datasets of each DatasetType for each task.  The report can be rendered as
    a dictionary or written to a yaml file.

    Attributes
    ----------
    tasks : `dict`
        A dictionary of TaskExecutionReports by task label.

    See Also
    --------
    TaskExecutionReport : A task report.
    DatasetTypeExecutionReport : A dataset type report.
    """

    tasks: dict[str, TaskExecutionReport] = dataclasses.field(default_factory=dict)
    """A dictionary of TaskExecutionReports by task label (`dict`)."""

    def to_summary_dict(
        self, butler: Butler, do_store_logs: bool = True, human_readable: bool = False
    ) -> dict[str, Any]:
        """Summarize the results of the `QuantumGraphExecutionReport` in a
        dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        human_readable : `bool`
            Store more human-readable information to be printed out to the
            command-line.

        Returns
        -------
        summary_dict : `dict`
            A dictionary containing a summary of a `TaskExecutionReport` for
            each task in the quantum graph.
        """
        summary: dict[str, Any] = {}
        for label, task_report in self.tasks.items():
            summary[label] = task_report.to_summary_dict(
                butler, do_store_logs=do_store_logs, human_readable=human_readable
            )
        return summary

    def write_summary_yaml(self, butler: Butler, filename: str, do_store_logs: bool = True) -> None:
        """Take the dictionary from
        `QuantumGraphExecutionReport.to_summary_dict` and store its contents in
        a yaml file.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        filename : `str`
            The name to be used for the summary yaml file.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        """
        with open(filename, "w") as stream:
            yaml.safe_dump(self.to_summary_dict(butler, do_store_logs=do_store_logs), stream)

    @classmethod
    def make_reports(
        cls,
        butler: Butler,
        graph: QuantumGraph | ResourcePathExpression,
    ) -> QuantumGraphExecutionReport:
        """Make a `QuantumGraphExecutionReport`.

        Step through the quantum graph associated with a run, creating a
        `networkx.DiGraph` called ``status_graph`` to annotate the status of
        each quantum node.  For each task in the quantum graph, use
        `TaskExecutionReport.inspect_quantum` to make a `TaskExecutionReport`
        based on the status of each node.  Return a `TaskExecutionReport` for
        each task in the quantum graph.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.  This should match the Butler
            used for the run associated with the executed quantum graph.
        graph : `QuantumGraph` | `ResourcePathExpression`
            Either the associated quantum graph object or the uri of the
            location of said quantum graph.

        Returns
        -------
        report: `QuantumGraphExecutionReport`
            The `TaskExecutionReport` for each task in the quantum graph.
        """
        refs: dict[str, Any] = {}
        status_graph = networkx.DiGraph()
        qg = graph if isinstance(graph, QuantumGraph) else QuantumGraph.loadUri(graph)
        assert qg.metadata is not None, "Saved QGs always have metadata."
        collection = qg.metadata["output_run"]
        report = cls()
        task_defs = list(qg.iterTaskGraph())
        pipeline_dataset_types = PipelineDatasetTypes.fromPipeline(task_defs, registry=butler.registry)

        # Collect, per dataset type name, the refs actually present in the
        # output run so produced/missing datasets can be distinguished.
        relevant_types = itertools.chain(
            pipeline_dataset_types.initIntermediates,
            pipeline_dataset_types.initOutputs,
            pipeline_dataset_types.intermediates,
            pipeline_dataset_types.outputs,
        )
        for dataset_type in relevant_types:
            component = dataset_type.component()
            if component is None:
                found = butler.registry.queryDatasets(
                    dataset_type.name, collections=collection, findFirst=False
                )
                refs[dataset_type.name] = {ref.id: ref for ref in found}
            else:
                # Work around the fact that component support has been phased
                # out of daf_butler queries but not pipe_base's QGs. This
                # should go away on DM-40441.
                parent_dataset_type = dataset_type.makeCompositeDatasetType()
                found = butler.registry.queryDatasets(
                    parent_dataset_type.name, collections=collection, findFirst=False
                )
                refs[dataset_type.name] = {ref.id: ref.makeComponentRef(component) for ref in found}

        # Build a bipartite status graph linking each quantum node to the
        # dataset ids it consumes and produces.
        for task_def in qg.iterTaskGraph():
            for node in qg.getNodesForTask(task_def):
                status_graph.add_node(node.nodeId)
                for ref in itertools.chain.from_iterable(node.quantum.outputs.values()):
                    status_graph.add_edge(node.nodeId, ref.id)
                for ref in itertools.chain.from_iterable(node.quantum.inputs.values()):
                    status_graph.add_edge(ref.id, node.nodeId)

        # Inspect every quantum of every task, in graph order, so upstream
        # failure status is available when downstream quanta are examined.
        for task_def in qg.iterTaskGraph():
            if task_def.logOutputDatasetName is None:
                raise RuntimeError("QG must have log outputs to use execution reports.")
            task_report = TaskExecutionReport()
            for node in qg.getNodesForTask(task_def):
                task_report.inspect_quantum(
                    node,
                    status_graph,
                    refs,
                    metadata_name=task_def.metadataDatasetName,
                    log_name=task_def.logOutputDatasetName,
                )
            report.tasks[task_def.label] = task_report
        return report

    def __str__(self) -> str:
        lines = [f"{tasklabel}:{report}" for tasklabel, report in self.tasks.items()]
        return "\n".join(lines)

433 

434 

def lookup_quantum_data_id(
    graph_uri: ResourcePathExpression, nodes: Iterable[uuid.UUID]
) -> list[DataCoordinate | None]:
    """Look up a dataId from a quantum graph and a list of quantum graph
    nodeIDs.

    Parameters
    ----------
    graph_uri : `ResourcePathExpression`
        URI of the quantum graph of the run.
    nodes : `~collections.abc.Iterable` [ `uuid.UUID` ]
        Quantum graph nodeIDs to look up.

    Returns
    -------
    data_ids : `list` [ `lsst.daf.butler.DataCoordinate` ]
        A list of human-readable dataIDs which map to the nodeIDs on the
        quantum graph at ``graph_uri``.
    """
    # ``nodes`` is both forwarded to loadUri and iterated below, so it must
    # be re-iterable (e.g. a list, not a one-shot generator).
    quantum_graph = QuantumGraph.loadUri(graph_uri, nodes=nodes)
    data_ids = []
    for node_id in nodes:
        quantum_node = quantum_graph.getQuantumNodeByNodeId(node_id)
        data_ids.append(quantum_node.quantum.dataId)
    return data_ids