Coverage for python/lsst/pipe/base/execution_reports.py: 27%

133 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-18 09:56 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

# Public API of this module (controls ``from ... import *``).
__all__ = (
    "QuantumGraphExecutionReport",
    "TaskExecutionReport",
    "DatasetTypeExecutionReport",
    "lookup_quantum_data_id",
)

29 

30import dataclasses 

31import itertools 

32import logging 

33import uuid 

34from collections.abc import Iterable, Mapping 

35from typing import Any 

36 

37import networkx 

38import yaml 

39from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, Quantum 

40from lsst.resources import ResourcePathExpression 

41 

42from .graph import QuantumGraph 

43 

44 

@dataclasses.dataclass
class DatasetTypeExecutionReport:
    """A report on the number of produced datasets as well as the status of
    missing datasets based on metadata.

    A `DatasetTypeExecutionReport` is created for each
    `~lsst.daf.butler.DatasetType` in a `TaskExecutionReport`.
    """

    failed: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets not produced because their quanta failed directly in this
    run (`set`).
    """

    not_produced: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Missing datasets which were not produced by successful quanta.
    """

    blocked: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets not produced due to an upstream failure (`set`).
    """

    n_produced: int = 0
    """Count of datasets produced (`int`).
    """

    def to_summary_dict(self) -> dict[str, Any]:
        """Summarize the DatasetTypeExecutionReport in a dictionary.

        Returns
        -------
        summary_dict : `dict`
            A count of the datasets with each outcome: the number
            ``produced``, plus the sizes of the ``failed``,
            ``not_produced``, and ``blocked`` sets.  See the attribute
            documentation above for what each outcome means.
        """
        # Start with the produced count, then append the size of each
        # outcome set in a fixed, stable order.
        summary: dict[str, Any] = {"produced": self.n_produced}
        for outcome in ("failed", "not_produced", "blocked"):
            summary[outcome] = len(getattr(self, outcome))
        return summary

88 

89 

@dataclasses.dataclass
class TaskExecutionReport:
    """A report on the status and content of a task in an executed quantum
    graph.

    Use task metadata to identify and inspect failures and report on output
    datasets.

    See Also
    --------
    QuantumGraphExecutionReport : Quantum graph report.
    DatasetTypeExecutionReport : DatasetType report.
    """

    failed: dict[uuid.UUID, DatasetRef] = dataclasses.field(default_factory=dict)
    """A mapping from quantum node ID to the log dataset reference for quanta
    that failed directly in this run (`dict`).
    """

    n_succeeded: int = 0
    """A count of successful quanta.

    This may include quanta that did not produce any datasets; i.e., raised
    `NoWorkFound`.
    """

    blocked: dict[uuid.UUID, DataCoordinate] = dataclasses.field(default_factory=dict)
    """A mapping of data IDs of quanta that were not attempted due to an
    upstream failure (`dict`).
    """

    output_datasets: dict[str, DatasetTypeExecutionReport] = dataclasses.field(default_factory=dict)
    """Missing and produced outputs of each `~lsst.daf.butler.DatasetType`
    (`dict`).
    """

    def inspect_quantum(
        self,
        quantum_id: uuid.UUID,
        quantum: Quantum,
        status_graph: networkx.DiGraph,
        refs: Mapping[str, Mapping[uuid.UUID, DatasetRef]],
        metadata_name: str,
        log_name: str,
    ) -> None:
        """Inspect a quantum of a quantum graph and ascertain the status of
        each associated data product.

        Parameters
        ----------
        quantum_id : `uuid.UUID`
            Unique identifier for the quantum to inspect.
        quantum : `Quantum`
            The specific node of the quantum graph to be inspected.
        status_graph : `networkx.DiGraph`
            The quantum graph produced by
            `QuantumGraphExecutionReport.make_reports` which steps through the
            quantum graph of a run and logs the status of each quantum.
        refs : `~collections.abc.Mapping` [ `str`,\
                `~collections.abc.Mapping` [ `uuid.UUID`,\
                `~lsst.daf.butler.DatasetRef` ] ]
            The DatasetRefs of each of the DatasetTypes produced by the task.
            Includes initialization, intermediate and output data products.
        metadata_name : `str`
            The metadata dataset name for the node.
        log_name : `str`
            The name of the log files for the node.

        See Also
        --------
        QuantumGraphExecutionReport.make_reports : Make reports.
        """
        (metadata_ref,) = quantum.outputs[metadata_name]
        (log_ref,) = quantum.outputs[log_name]
        blocked = False
        # A quantum succeeded iff its metadata dataset was actually written.
        if metadata_ref.id in refs[metadata_name]:
            failed = False
            self.n_succeeded += 1
        else:
            failed = True
            # Walk two levels of predecessors (quantum -> input dataset ->
            # producing quantum) to see whether any upstream quantum failed.
            upstream_failed = any(
                status_graph.nodes[upstream_quantum_id]["failed"]
                for upstream_dataset_id in status_graph.predecessors(quantum_id)
                for upstream_quantum_id in status_graph.predecessors(upstream_dataset_id)
            )
            if upstream_failed:
                assert quantum.dataId is not None
                self.blocked[quantum_id] = quantum.dataId
                blocked = True
            else:
                # note: log_ref may or may not actually exist
                self.failed[quantum_id] = log_ref
        # Record the status so downstream quanta can detect the blockage.
        status_graph.nodes[quantum_id]["failed"] = failed

        # Now, loop over the datasets to make a DatasetTypeExecutionReport.
        for output_ref in itertools.chain.from_iterable(quantum.outputs.values()):
            # Metadata and logs were already used above; skip them here.
            if output_ref == metadata_ref or output_ref == log_ref:
                continue
            type_name = output_ref.datasetType.name
            dataset_type_report = self.output_datasets.setdefault(type_name, DatasetTypeExecutionReport())
            if output_ref.id in refs[type_name]:
                dataset_type_report.n_produced += 1
            elif not failed:
                # Successful quantum but no dataset; e.g. NoWorkFound.
                dataset_type_report.not_produced.add(output_ref)
            elif blocked:
                dataset_type_report.blocked.add(output_ref)
            else:
                dataset_type_report.failed.add(output_ref)

    def to_summary_dict(
        self, butler: Butler, do_store_logs: bool = True, human_readable: bool = False
    ) -> dict[str, Any]:
        """Summarize the results of the TaskExecutionReport in a dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        human_readable : `bool`
            Store more human-readable information to be printed out to the
            command-line.

        Returns
        -------
        summary_dict : `dict`
            A dictionary containing:

            - outputs: A dictionary summarizing the
              DatasetTypeExecutionReport for each DatasetType associated with
              the task
            - failed_quanta: A dictionary of quanta which failed and their
              dataIDs by quantum graph node id
            - n_quanta_blocked: The number of quanta which failed due to
              upstream failures.
            - n_succeeded: The number of quanta which succeeded.

            And possibly, if human-readable is passed:

            - errors: A dictionary of data ids associated with each error
              message. If `human-readable` and `do_store_logs`, this is stored
              here. Otherwise, if `do_store_logs`, it is stored in
              `failed_quanta` keyed by the quantum graph node id.
        """
        quanta_by_node: dict[str, Any] = {}
        readable_data_ids: list[dict[str, Any]] = []
        error_infos: list[dict[str, Any]] = []
        for node_id, log_ref in self.failed.items():
            data_id = dict(log_ref.dataId.required)
            quantum_info: dict[str, Any] = {"data_id": data_id}
            if do_store_logs:
                try:
                    log = butler.get(log_ref)
                except LookupError:
                    # No log dataset is known to the butler at all.
                    quantum_info["error"] = []
                except FileNotFoundError:
                    # The log dataset is registered but its artifact is gone.
                    quantum_info["error"] = None
                else:
                    quantum_info["error"] = [
                        record.message for record in log if record.levelno >= logging.ERROR
                    ]
            if not human_readable:
                quanta_by_node[str(node_id)] = quantum_info
            else:
                readable_data_ids.append(data_id)
                if do_store_logs:
                    error_infos.append(quantum_info)
        result: dict[str, Any] = {
            "outputs": {name: report.to_summary_dict() for name, report in self.output_datasets.items()},
            "n_quanta_blocked": len(self.blocked),
            "n_succeeded": self.n_succeeded,
        }
        if human_readable:
            result["failed_quanta"] = readable_data_ids
            result["errors"] = error_infos
        else:
            result["failed_quanta"] = quanta_by_node
        return result

    def __str__(self) -> str:
        """Return a count of the failed and blocked quanta in the
        TaskExecutionReport.
        """
        return f"failed: {len(self.failed)}\nblocked: {len(self.blocked)}\n"

278 

279 

@dataclasses.dataclass
class QuantumGraphExecutionReport:
    """A report on the execution of a quantum graph.

    Report the detailed status of each failure; whether tasks were not run,
    data is missing from upstream failures, or specific errors occurred during
    task execution (and report the errors). Contains a count of expected,
    produced DatasetTypes for each task. This report can be output as a
    dictionary or a yaml file.

    Attributes
    ----------
    tasks : `dict`
        A dictionary of TaskExecutionReports by task label.

    See Also
    --------
    TaskExecutionReport : A task report.
    DatasetTypeExecutionReport : A dataset type report.
    """

    tasks: dict[str, TaskExecutionReport] = dataclasses.field(default_factory=dict)
    """A dictionary of TaskExecutionReports by task label (`dict`)."""

    def to_summary_dict(
        self, butler: Butler, do_store_logs: bool = True, human_readable: bool = False
    ) -> dict[str, Any]:
        """Summarize the results of the `QuantumGraphExecutionReport` in a
        dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        human_readable : `bool`
            Store more human-readable information to be printed out to the
            command-line.

        Returns
        -------
        summary_dict : `dict`
            A dictionary containing a summary of a `TaskExecutionReport` for
            each task in the quantum graph.
        """
        return {
            task: report.to_summary_dict(butler, do_store_logs=do_store_logs, human_readable=human_readable)
            for task, report in self.tasks.items()
        }

    def write_summary_yaml(self, butler: Butler, filename: str, do_store_logs: bool = True) -> None:
        """Take the dictionary from
        `QuantumGraphExecutionReport.to_summary_dict` and store its contents in
        a yaml file.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        filename : `str`
            The name to be used for the summary yaml file.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        """
        with open(filename, "w") as stream:
            yaml.safe_dump(self.to_summary_dict(butler, do_store_logs=do_store_logs), stream)

    @classmethod
    def make_reports(
        cls,
        butler: Butler,
        graph: QuantumGraph | ResourcePathExpression,
    ) -> QuantumGraphExecutionReport:
        """Make a `QuantumGraphExecutionReport`.

        Step through the quantum graph associated with a run, creating a
        `networkx.DiGraph` called status_graph to annotate the status of each
        quantum node. For each task in the quantum graph, use
        `TaskExecutionReport.inspect_quantum` to make a `TaskExecutionReport`
        based on the status of each node. Return a `TaskExecutionReport` for
        each task in the quantum graph.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report. This should match the Butler used
            for the run associated with the executed quantum graph.
        graph : `QuantumGraph` | `ResourcePathExpression`
            Either the associated quantum graph object or the uri of the
            location of said quantum graph.

        Returns
        -------
        report : `QuantumGraphExecutionReport`
            The `TaskExecutionReport` for each task in the quantum graph.
        """
        # PEP 526 annotation (was a legacy ``# type:`` comment).
        refs: dict[str, Any] = {}
        status_graph = networkx.DiGraph()
        if not isinstance(graph, QuantumGraph):
            qg = QuantumGraph.loadUri(graph)
        else:
            qg = graph
        assert qg.metadata is not None, "Saved QGs always have metadata."
        collection = qg.metadata["output_run"]
        report = cls()
        # Collect the datasets actually present in the output run, keyed by
        # dataset type name and then dataset ID, for every dataset type that
        # is produced (not merely consumed) by the graph.
        for dataset_type_node in qg.pipeline_graph.dataset_types.values():
            if qg.pipeline_graph.producer_of(dataset_type_node.name) is None:
                continue
            refs[dataset_type_node.name] = {
                ref.id: ref
                for ref in butler.registry.queryDatasets(
                    dataset_type_node.name, collections=collection, findFirst=False
                )
            }
        # Build a bipartite quantum/dataset digraph first, so that upstream
        # failures can be traced through intermediate datasets when quanta
        # are inspected below.
        for task_node in qg.pipeline_graph.tasks.values():
            for quantum_id, quantum in qg.get_task_quanta(task_node.label).items():
                status_graph.add_node(quantum_id)
                for ref in itertools.chain.from_iterable(quantum.outputs.values()):
                    status_graph.add_edge(quantum_id, ref.id)
                for ref in itertools.chain.from_iterable(quantum.inputs.values()):
                    status_graph.add_edge(ref.id, quantum_id)

        # Inspect quanta task-by-task; pipeline_graph.tasks iterates in
        # topological order, so upstream statuses are set before they are
        # read by downstream quanta.
        for task_node in qg.pipeline_graph.tasks.values():
            task_report = TaskExecutionReport()
            if task_node.log_output is None:
                raise RuntimeError("QG must have log outputs to use execution reports.")
            for quantum_id, quantum in qg.get_task_quanta(task_node.label).items():
                task_report.inspect_quantum(
                    quantum_id,
                    quantum,
                    status_graph,
                    refs,
                    metadata_name=task_node.metadata_output.dataset_type_name,
                    log_name=task_node.log_output.dataset_type_name,
                )
            report.tasks[task_node.label] = task_report
        return report

    def __str__(self) -> str:
        """Return a one-line ``label:summary`` string per task."""
        return "\n".join(f"{tasklabel}:{report}" for tasklabel, report in self.tasks.items())

421 

422 

def lookup_quantum_data_id(
    graph_uri: ResourcePathExpression, nodes: Iterable[uuid.UUID]
) -> list[DataCoordinate | None]:
    """Look up a dataId from a quantum graph and a list of quantum graph
    nodeIDs.

    Parameters
    ----------
    graph_uri : `ResourcePathExpression`
        URI of the quantum graph of the run.
    nodes : `~collections.abc.Iterable` [ `uuid.UUID` ]
        Quantum graph nodeID.

    Returns
    -------
    data_ids : `list` [ `lsst.daf.butler.DataCoordinate` ]
        A list of human-readable dataIDs which map to the nodeIDs on the
        quantum graph at graph_uri.
    """
    # Only the requested nodes are loaded from the graph file.
    qg = QuantumGraph.loadUri(graph_uri, nodes=nodes)
    data_ids: list[DataCoordinate | None] = []
    for node in nodes:
        data_ids.append(qg.getQuantumNodeByNodeId(node).quantum.dataId)
    return data_ids