Coverage for python/lsst/pipe/base/execution_reports.py: 28%

125 statements  

coverage.py v7.4.4, created at 2024-04-10 03:25 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "QuantumGraphExecutionReport",
    "TaskExecutionReport",
    "DatasetTypeExecutionReport",
    "lookup_quantum_data_id",
)

import dataclasses
import itertools
import logging
import uuid
from collections.abc import Iterable, Mapping
from typing import Any

import networkx
import yaml
from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, Quantum
from lsst.resources import ResourcePathExpression

from .graph import QuantumGraph

@dataclasses.dataclass
class DatasetTypeExecutionReport:
    """A report on the number of produced datasets as well as the status of
    missing datasets, based on task metadata.

    A `DatasetTypeExecutionReport` is created for each
    `~lsst.daf.butler.DatasetType` in a `TaskExecutionReport`.
    """

    failed: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets not produced because their quanta failed directly in this
    run (`set`).
    """

    not_produced: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets that are missing even though their quanta succeeded (`set`).
    """

    blocked: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets not produced due to an upstream failure (`set`).
    """

    n_produced: int = 0
    """Count of datasets produced (`int`).
    """

    def to_summary_dict(self) -> dict[str, Any]:
        """Summarize the DatasetTypeExecutionReport in a dictionary.

        Returns
        -------
        summary_dict : `dict`
            A count of the datasets with each outcome: ``produced``,
            ``failed``, ``not_produced``, and ``blocked``. See the attribute
            descriptions above.
        """
        return {
            "produced": self.n_produced,
            "failed": len(self.failed),
            "not_produced": len(self.not_produced),
            "blocked": len(self.blocked),
        }


@dataclasses.dataclass
class TaskExecutionReport:
    """A report on the status and content of a task in an executed quantum
    graph.

    Uses task metadata to identify and inspect failures and to report on
    output datasets.

    See Also
    --------
    QuantumGraphExecutionReport : Quantum graph report.
    DatasetTypeExecutionReport : DatasetType report.
    """

    failed: dict[uuid.UUID, DatasetRef] = dataclasses.field(default_factory=dict)
    """A mapping from quantum ID to the log dataset reference of each quantum
    that failed directly in this run (`dict`).
    """

    n_succeeded: int = 0
    """A count of successful quanta.

    This may include quanta that did not produce any datasets, i.e., quanta
    that raised `NoWorkFound`.
    """

    blocked: dict[uuid.UUID, DataCoordinate] = dataclasses.field(default_factory=dict)
    """A mapping from quantum ID to data ID for quanta that were not
    attempted due to an upstream failure (`dict`).
    """

    output_datasets: dict[str, DatasetTypeExecutionReport] = dataclasses.field(default_factory=dict)
    """Missing and produced outputs of each `~lsst.daf.butler.DatasetType`
    (`dict`).
    """

    def inspect_quantum(
        self,
        quantum_id: uuid.UUID,
        quantum: Quantum,
        status_graph: networkx.DiGraph,
        refs: Mapping[str, Mapping[uuid.UUID, DatasetRef]],
        metadata_name: str,
        log_name: str,
    ) -> None:
        """Inspect a quantum of a quantum graph and ascertain the status of
        each associated data product.

        Parameters
        ----------
        quantum_id : `uuid.UUID`
            Unique identifier for the quantum to inspect.
        quantum : `Quantum`
            The specific node of the quantum graph to be inspected.
        status_graph : `networkx.DiGraph`
            The status graph produced by
            `QuantumGraphExecutionReport.make_reports`, which mirrors the
            quantum graph of a run and records the status of each quantum.
        refs : `~collections.abc.Mapping` [ `str`,\
                `~collections.abc.Mapping` [ `uuid.UUID`,\
                `~lsst.daf.butler.DatasetRef` ] ]
            The dataset references actually present in the output run,
            indexed by dataset type name and then by dataset ID. Includes
            initialization, intermediate, and output data products.
        metadata_name : `str`
            The metadata dataset name for the node.
        log_name : `str`
            The name of the log files for the node.

        See Also
        --------
        QuantumGraphExecutionReport.make_reports : Make reports.
        """

        (metadata_ref,) = quantum.outputs[metadata_name]
        (log_ref,) = quantum.outputs[log_name]
        blocked = False
        if metadata_ref.id not in refs[metadata_name]:
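            # status_graph is bipartite (quantum nodes and dataset nodes), so
            # a quantum's upstream quanta are the predecessors of its input
            # dataset nodes, i.e., two hops back.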

            if any(
                status_graph.nodes[upstream_quantum_id]["failed"]
                for upstream_dataset_id in status_graph.predecessors(quantum_id)
                for upstream_quantum_id in status_graph.predecessors(upstream_dataset_id)
            ):
                assert quantum.dataId is not None
                self.blocked[quantum_id] = quantum.dataId
                blocked = True
            else:
                self.failed[quantum_id] = log_ref
                # Note: log_ref may or may not actually exist.
                failed = True
        else:
            failed = False
            self.n_succeeded += 1
        status_graph.nodes[quantum_id]["failed"] = failed

        # Now, loop over the datasets to make a DatasetTypeExecutionReport.
        for output_ref in itertools.chain.from_iterable(quantum.outputs.values()):
            if output_ref == metadata_ref or output_ref == log_ref:
                continue
            if (dataset_type_report := self.output_datasets.get(output_ref.datasetType.name)) is None:
                dataset_type_report = DatasetTypeExecutionReport()
                self.output_datasets[output_ref.datasetType.name] = dataset_type_report
            if output_ref.id not in refs[output_ref.datasetType.name]:
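                # The output is missing: attribute it to a direct failure, an
                # upstream-blocked quantum, or a successful quantum that
                # simply produced nothing (e.g., NoWorkFound).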

                if failed:
                    if blocked:
                        dataset_type_report.blocked.add(output_ref)
                    else:
                        dataset_type_report.failed.add(output_ref)
                else:
                    dataset_type_report.not_produced.add(output_ref)
            else:
                dataset_type_report.n_produced += 1


    def to_summary_dict(
        self, butler: Butler, do_store_logs: bool = True, human_readable: bool = False
    ) -> dict[str, Any]:
        """Summarize the results of the TaskExecutionReport in a dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        human_readable : `bool`
            Store more human-readable information to be printed out to the
            command-line.

        Returns
        -------
        summary_dict : `dict`
            A dictionary containing:

            - outputs: A dictionary summarizing the
              DatasetTypeExecutionReport for each DatasetType associated with
              the task.
            - failed_quanta: A dictionary of quanta which failed and their
              data IDs, keyed by quantum graph node ID.
            - n_quanta_blocked: The number of quanta which were not attempted
              due to upstream failures.
            - n_succeeded: The number of quanta which succeeded.

            If ``human_readable`` is `True`, it also contains:

            - errors: A list with one entry per failed quantum, pairing its
              data ID with any error messages extracted from its log (when
              ``do_store_logs`` is `True`). Otherwise, if ``do_store_logs``,
              the messages are stored in ``failed_quanta``, keyed by the
              quantum graph node ID.
        """
        failed_quanta = {}
        errors = []
        for node_id, log_ref in self.failed.items():
            data_ids = dict(log_ref.dataId.required)
            quantum_info: dict[str, Any] = {"data_id": data_ids}
            if do_store_logs:
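                # Distinguish a log dataset that cannot be looked up at all
                # (LookupError: record an empty message list) from one that
                # is registered but whose file is missing (FileNotFoundError:
                # record None).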

                try:
                    log = butler.get(log_ref)
                except LookupError:
                    quantum_info["error"] = []
                except FileNotFoundError:
                    quantum_info["error"] = None
                else:
                    quantum_info["error"] = [
                        record.message for record in log if record.levelno >= logging.ERROR
                    ]
            if human_readable:
                errors.append(quantum_info)
            else:
                failed_quanta[str(node_id)] = quantum_info
        summary: dict[str, Any] = {
            "outputs": {name: r.to_summary_dict() for name, r in self.output_datasets.items()},
            "failed_quanta": failed_quanta,
            "n_quanta_blocked": len(self.blocked),
            "n_succeeded": self.n_succeeded,
        }
        if human_readable:
            summary["errors"] = errors
        return summary


    def __str__(self) -> str:
        """Return a count of the failed and blocked quanta in the
        TaskExecutionReport.
        """
        return f"failed: {len(self.failed)}\nblocked: {len(self.blocked)}\n"


@dataclasses.dataclass
class QuantumGraphExecutionReport:
    """A report on the execution of a quantum graph.

    Reports the detailed status of each failure: whether tasks were not run,
    data was blocked by upstream failures, or specific errors occurred during
    task execution (and, if so, reports those errors). Also counts the
    datasets of each `~lsst.daf.butler.DatasetType` produced for each task.
    The report can be output as a dictionary or a YAML file.

    Attributes
    ----------
    tasks : `dict`
        A dictionary of TaskExecutionReports by task label.

    See Also
    --------
    TaskExecutionReport : A task report.
    DatasetTypeExecutionReport : A dataset type report.
    """

    tasks: dict[str, TaskExecutionReport] = dataclasses.field(default_factory=dict)
    """A dictionary of TaskExecutionReports by task label (`dict`)."""

    def to_summary_dict(
        self, butler: Butler, do_store_logs: bool = True, human_readable: bool = False
    ) -> dict[str, Any]:
        """Summarize the results of the `QuantumGraphExecutionReport` in a
        dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        human_readable : `bool`
            Store more human-readable information to be printed out to the
            command-line.

        Returns
        -------
        summary_dict : `dict`
            A dictionary containing a summary of a `TaskExecutionReport` for
            each task in the quantum graph.
        """
        return {
            task: report.to_summary_dict(butler, do_store_logs=do_store_logs, human_readable=human_readable)
            for task, report in self.tasks.items()
        }


    def write_summary_yaml(self, butler: Butler, filename: str, do_store_logs: bool = True) -> None:
        """Take the dictionary from
        `QuantumGraphExecutionReport.to_summary_dict` and store its contents
        in a YAML file.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        filename : `str`
            The name to be used for the summary YAML file.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        """
        with open(filename, "w") as stream:
            yaml.safe_dump(self.to_summary_dict(butler, do_store_logs=do_store_logs), stream)


    @classmethod
    def make_reports(
        cls,
        butler: Butler,
        graph: QuantumGraph | ResourcePathExpression,
    ) -> QuantumGraphExecutionReport:
        """Make a `QuantumGraphExecutionReport`.

        Step through the quantum graph associated with a run, creating a
        `networkx.DiGraph` called ``status_graph`` to annotate the status of
        each quantum node. For each task in the quantum graph, use
        `TaskExecutionReport.inspect_quantum` to make a `TaskExecutionReport`
        based on the status of each node. Return a `TaskExecutionReport` for
        each task in the quantum graph.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report. This should match the Butler
            used for the run associated with the executed quantum graph.
        graph : `QuantumGraph` | `ResourcePathExpression`
            Either the associated quantum graph object or the URI of the
            location of said quantum graph.

        Returns
        -------
        report : `QuantumGraphExecutionReport`
            The `TaskExecutionReport` for each task in the quantum graph.
        """
        refs: dict[str, Any] = {}
        status_graph = networkx.DiGraph()
        if not isinstance(graph, QuantumGraph):
            qg = QuantumGraph.loadUri(graph)
        else:
            qg = graph
        assert qg.metadata is not None, "Saved QGs always have metadata."
        collection = qg.metadata["output_run"]
        report = cls()
        for dataset_type_node in qg.pipeline_graph.dataset_types.values():
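            # Skip overall inputs: only dataset types produced within this
            # graph are tracked by the report.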

            if qg.pipeline_graph.producer_of(dataset_type_node.name) is None:
                continue
            refs[dataset_type_node.name] = {
                ref.id: ref
                for ref in butler.registry.queryDatasets(
                    dataset_type_node.name, collections=collection, findFirst=False
                )
            }
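        # First pass: build a bipartite status graph linking each quantum
        # node to the dataset nodes it consumes and produces.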

        for task_node in qg.pipeline_graph.tasks.values():
            for quantum_id, quantum in qg.get_task_quanta(task_node.label).items():
                status_graph.add_node(quantum_id)
                for ref in itertools.chain.from_iterable(quantum.outputs.values()):
                    status_graph.add_edge(quantum_id, ref.id)
                for ref in itertools.chain.from_iterable(quantum.inputs.values()):
                    status_graph.add_edge(ref.id, quantum_id)

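        # Second pass: with the full status graph in hand, classify each
        # quantum and its outputs via TaskExecutionReport.inspect_quantum.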

        for task_node in qg.pipeline_graph.tasks.values():
            task_report = TaskExecutionReport()
            if task_node.log_output is None:
                raise RuntimeError("QG must have log outputs to use execution reports.")
            for quantum_id, quantum in qg.get_task_quanta(task_node.label).items():
                task_report.inspect_quantum(
                    quantum_id,
                    quantum,
                    status_graph,
                    refs,
                    metadata_name=task_node.metadata_output.dataset_type_name,
                    log_name=task_node.log_output.dataset_type_name,
                )
            report.tasks[task_node.label] = task_report
        return report


    def __str__(self) -> str:
        return "\n".join(f"{tasklabel}:{report}" for tasklabel, report in self.tasks.items())


def lookup_quantum_data_id(
    graph_uri: ResourcePathExpression, nodes: Iterable[uuid.UUID]
) -> list[DataCoordinate | None]:
    """Look up the data IDs associated with a list of quantum graph node IDs.

    Parameters
    ----------
    graph_uri : `ResourcePathExpression`
        URI of the quantum graph of the run.
    nodes : `~collections.abc.Iterable` [ `uuid.UUID` ]
        Quantum graph node IDs.

    Returns
    -------
    data_ids : `list` [ `lsst.daf.butler.DataCoordinate` | `None` ]
        A list of human-readable data IDs which map to the node IDs in the
        quantum graph at ``graph_uri``.
    """
    qg = QuantumGraph.loadUri(graph_uri, nodes=nodes)
    return [qg.getQuantumNodeByNodeId(node).quantum.dataId for node in nodes]
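
# Example usage (a minimal sketch; the repo path, graph URI, and output
# filename below are hypothetical stand-ins):
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base.execution_reports import QuantumGraphExecutionReport
#
#     butler = Butler("/repo/main")  # hypothetical butler repo
#     report = QuantumGraphExecutionReport.make_reports(butler, "run.qgraph")
#     print(report)  # per-task failed/blocked counts
#     report.write_summary_yaml(butler, "report_summary.yaml", do_store_logs=True)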