Coverage for python/lsst/pipe/base/execution_reports.py: 30%

124 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-06 10:56 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ( 

24 "QuantumGraphExecutionReport", 

25 "TaskExecutionReport", 

26 "DatasetTypeExecutionReport", 

27 "lookup_quantum_data_id", 

28) 

29 

30import dataclasses 

31import itertools 

32import logging 

33import uuid 

34from collections.abc import Iterable, Mapping 

35from typing import Any 

36 

37import networkx 

38import yaml 

39from lsst.daf.butler import Butler, DataCoordinate, DatasetRef 

40from lsst.resources import ResourcePathExpression 

41 

42from .graph import QuantumGraph, QuantumNode 

43from .pipeline import PipelineDatasetTypes 

44 

45 

@dataclasses.dataclass
class DatasetTypeExecutionReport:
    """Counts and classifications of the outputs of a single dataset type.

    One of these is created for each `DatasetType` in a
    `TaskExecutionReport`; it tallies produced datasets and sorts missing
    ones into three categories based on quantum status.
    """

    missing_failed: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets whose producing quanta failed directly in this run (`set`).
    """

    missing_not_produced: dict[DatasetRef, bool] = dataclasses.field(default_factory=dict)
    """Missing datasets which were not produced due either missing inputs or
    a failure in finding inputs (`dict`).
    bool: were predicted inputs produced?
    """

    missing_upstream_failed: set[DatasetRef] = dataclasses.field(default_factory=set)
    """Datasets not produced because of a failure further upstream (`set`).
    """

    n_produced: int = 0
    """Number of datasets successfully produced (`int`).
    """

    def to_summary_dict(self) -> dict[str, Any]:
        """Pack this report into a dictionary of counts.

        Returns
        -------
        summary_dict : `dict`
            Counts of the datasets with each outcome: ``produced``,
            ``missing_failed``, ``missing_not_produced``, and
            ``missing_upstream_failed``.  See the corresponding attributes
            for their meanings.
        """
        return dict(
            produced=self.n_produced,
            missing_failed=len(self.missing_failed),
            missing_not_produced=len(self.missing_not_produced),
            missing_upstream_failed=len(self.missing_upstream_failed),
        )

    def handle_missing_dataset(
        self, output_ref: DatasetRef, failed: bool, status_graph: networkx.DiGraph
    ) -> None:
        """Classify a missing dataset into one of the failure categories.

        Parameters
        ----------
        output_ref : `~lsst.daf.butler.DatasetRef`
            Reference to the dataset that was not produced.
        failed : `bool`
            Whether the quantum associated with the missing dataset failed.
        status_graph : `networkx.DiGraph`
            Bipartite quantum/dataset graph annotated with the status of
            each quantum by `TaskExecutionReport.inspect_quantum`.
        """
        if failed:
            # Blame an upstream failure when any predecessor quantum of
            # this dataset node is marked failed in the status graph;
            # otherwise count it as a direct failure.
            blamed_upstream = any(
                status_graph.nodes[producer_id]["failed"]
                for producer_id in status_graph.predecessors(output_ref.id)
            )
            if blamed_upstream:
                self.missing_upstream_failed.add(output_ref)
            else:
                self.missing_failed.add(output_ref)
        else:
            status_graph.nodes[output_ref.id]["not_produced"] = True
            # Record whether any predicted input of a producing quantum was
            # itself flagged as not produced.
            inputs_missing = False
            for producer_id in status_graph.predecessors(output_ref.id):
                for input_dataset_id in status_graph.predecessors(producer_id):
                    if status_graph.nodes[input_dataset_id].get("not_produced", False):
                        inputs_missing = True
                        break
                if inputs_missing:
                    break
            self.missing_not_produced[output_ref] = inputs_missing

    def handle_produced_dataset(self, output_ref: DatasetRef, status_graph: networkx.DiGraph) -> None:
        """Record a dataset that was successfully produced.

        Parameters
        ----------
        output_ref : `~lsst.daf.butler.DatasetRef`
            Reference to the produced dataset.
        status_graph : `networkx.DiGraph`
            Bipartite quantum/dataset graph annotated with the status of
            each quantum by `QuantumGraphExecutionReport.make_reports`.

        See Also
        --------
        TaskExecutionReport.inspect_quantum
        """
        self.n_produced += 1
        status_graph.nodes[output_ref.id]["not_produced"] = False

141 

142 

@dataclasses.dataclass
class TaskExecutionReport:
    """Execution status of a single task across an executed quantum graph.

    Task metadata datasets are used to identify failures, and per-dataset-type
    reports on output datasets are accumulated as quanta are inspected.

    See Also
    --------
    QuantumGraphExecutionReport
    DatasetTypeExecutionReport
    """

    failed: dict[uuid.UUID, DatasetRef] = dataclasses.field(default_factory=dict)
    """Mapping from quantum graph node ID to the log dataset reference of
    quanta that failed directly in this run (`dict`).
    """

    failed_upstream: dict[uuid.UUID, DataCoordinate] = dataclasses.field(default_factory=dict)
    """Mapping from quantum graph node ID to data ID for quanta that were
    not attempted because of an upstream failure (`dict`).
    """

    output_datasets: dict[str, DatasetTypeExecutionReport] = dataclasses.field(default_factory=dict)
    """Missing and produced outputs of each `DatasetType` (`dict`).
    """

    def inspect_quantum(
        self,
        quantum_node: QuantumNode,
        status_graph: networkx.DiGraph,
        refs: Mapping[str, Mapping[uuid.UUID, DatasetRef]],
        metadata_name: str,
        log_name: str,
    ) -> None:
        """Determine the status of one quantum and of each of its outputs.

        Parameters
        ----------
        quantum_node : `QuantumNode`
            The node of the quantum graph to inspect.
        status_graph : `networkx.DiGraph`
            Bipartite quantum/dataset graph built by
            `QuantumGraphExecutionReport.make_reports`; this method records
            the failure status of the inspected quantum on it.
        refs : `~collections.abc.Mapping` [ `str`,\
                `~collections.abc.Mapping` [ `uuid.UUID`,\
                `~lsst.daf.butler.DatasetRef` ] ]
            The dataset references actually present in the output run,
            keyed by dataset type name and then dataset ID.  Includes
            initialization, intermediate, and output data products.
        metadata_name : `str`
            The metadata dataset name for the node.
        log_name : `str`
            The name of the log dataset for the node.

        See Also
        --------
        DatasetTypeExecutionReport.handle_missing_dataset
        DatasetTypeExecutionReport.handle_produced_dataset
        QuantumGraphExecutionReport.make_reports
        """
        quantum = quantum_node.quantum
        node_id = quantum_node.nodeId
        (metadata_ref,) = quantum.outputs[metadata_name]
        (log_ref,) = quantum.outputs[log_name]
        # The metadata dataset is the success marker: if it was not written
        # to the run, the quantum did not finish.
        failed = metadata_ref.id not in refs[metadata_name]
        if failed:
            blocked = any(
                status_graph.nodes[upstream_quantum_id]["failed"]
                for upstream_dataset_id in status_graph.predecessors(node_id)
                for upstream_quantum_id in status_graph.predecessors(upstream_dataset_id)
            )
            if blocked:
                assert quantum.dataId is not None
                self.failed_upstream[node_id] = quantum.dataId
            else:
                # note: log_ref may or may not actually exist
                self.failed[node_id] = log_ref
        status_graph.nodes[node_id]["failed"] = failed
        for output_ref in itertools.chain.from_iterable(quantum.outputs.values()):
            dataset_type_report = self.output_datasets.setdefault(
                output_ref.datasetType.name, DatasetTypeExecutionReport()
            )
            if output_ref.id in refs[output_ref.datasetType.name]:
                dataset_type_report.handle_produced_dataset(output_ref, status_graph)
            else:
                dataset_type_report.handle_missing_dataset(output_ref, failed, status_graph)

    def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[str, Any]:
        """Summarize the results of the TaskExecutionReport in a dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the error messages from the logs in the summary
            dictionary.

        Returns
        -------
        summary_dict : `dict`
            A dictionary containing:

            - outputs: a summary of the `DatasetTypeExecutionReport` for
              each dataset type produced by the task.
            - failed_quanta: per failed quantum (keyed by stringified node
              ID), its data ID and - when logs are stored - an ``error``
              entry holding the error-level log messages, `None` if the log
              file was missing (`FileNotFoundError`), or an empty list if
              the log dataset could not be looked up (`LookupError`).
            - n_quanta_blocked: the number of quanta not attempted due to
              upstream failures.
        """
        summary: dict[str, Any] = {
            "outputs": {},
            "failed_quanta": {},
            "n_quanta_blocked": len(self.failed_upstream),
        }
        for name, dataset_report in self.output_datasets.items():
            summary["outputs"][name] = dataset_report.to_summary_dict()
        for node_id, log_ref in self.failed.items():
            entry: dict[str, Any] = {"data_id": dict(log_ref.dataId.required)}
            if do_store_logs:
                try:
                    log = butler.get(log_ref)
                except LookupError:
                    entry["error"] = []
                except FileNotFoundError:
                    entry["error"] = None
                else:
                    entry["error"] = [
                        record.message for record in log if record.levelno >= logging.ERROR
                    ]
            summary["failed_quanta"][str(node_id)] = entry
        return summary

    def __str__(self) -> str:
        """Return counts of the failed and failed-upstream quanta in the
        TaskExecutionReport.
        """
        n_failed = len(self.failed)
        n_blocked = len(self.failed_upstream)
        return f"failed: {n_failed}\nfailed upstream: {n_blocked}\n"

283 

284 

@dataclasses.dataclass
class QuantumGraphExecutionReport:
    """A report on the execution of a quantum graph.

    Reports the detailed status of each failure: whether tasks were not run,
    data is missing from upstream failures, or specific errors occurred
    during task execution (and reports the errors).  Contains a count of
    expected and produced datasets for each task.  The report can be output
    as a dictionary or a yaml file.

    Parameters
    ----------
    tasks : `dict`
        A dictionary of `TaskExecutionReport`\\ s by task label.

    See Also
    --------
    TaskExecutionReport
    DatasetTypeExecutionReport
    """

    tasks: dict[str, TaskExecutionReport] = dataclasses.field(default_factory=dict)
    """A dictionary of `TaskExecutionReport`\\ s by task label (`dict`).
    """

    def to_summary_dict(self, butler: Butler, do_store_logs: bool = True) -> dict[str, Any]:
        """Summarize the results of the `QuantumGraphExecutionReport` in a
        dictionary.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.

        Returns
        -------
        summary_dict : `dict`
            A summary of the `TaskExecutionReport` for each task in the
            quantum graph, keyed by task label.
        """
        summary = {}
        for label, task_report in self.tasks.items():
            summary[label] = task_report.to_summary_dict(butler, do_store_logs=do_store_logs)
        return summary

    def write_summary_yaml(self, butler: Butler, filename: str, do_store_logs: bool = True) -> None:
        """Store the contents of
        `QuantumGraphExecutionReport.to_summary_dict` in a yaml file.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.
        filename : `str`
            The name to be used for the summary yaml file.
        do_store_logs : `bool`
            Store the logs in the summary dictionary.
        """
        summary = self.to_summary_dict(butler, do_store_logs=do_store_logs)
        with open(filename, "w") as stream:
            yaml.safe_dump(summary, stream)

    @classmethod
    def make_reports(
        cls,
        butler: Butler,
        graph: QuantumGraph | ResourcePathExpression,
    ) -> QuantumGraphExecutionReport:
        """Make a `QuantumGraphExecutionReport`.

        Step through the quantum graph associated with a run, building a
        bipartite `networkx.DiGraph` (``status_graph``) of quanta and
        datasets on which the status of each quantum is annotated.  For
        each task, use `TaskExecutionReport.inspect_quantum` to classify
        every quantum and its outputs.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            The Butler used for this report.  This should match the Butler
            used for the run associated with the executed quantum graph.
        graph : `QuantumGraph` | `ResourcePathExpression`
            Either the associated quantum graph object or the URI of the
            location of said quantum graph.

        Returns
        -------
        report : `QuantumGraphExecutionReport`
            The `TaskExecutionReport` for each task in the quantum graph.
        """
        qg = graph if isinstance(graph, QuantumGraph) else QuantumGraph.loadUri(graph)
        assert qg.metadata is not None, "Saved QGs always have metadata."
        output_run = qg.metadata["output_run"]
        # Collect the datasets actually present in the output run, keyed
        # by dataset type name and then dataset ID.
        pipeline_dataset_types = PipelineDatasetTypes.fromPipeline(
            list(qg.iterTaskGraph()), registry=butler.registry
        )
        refs: dict[str, Any] = {}
        for dataset_type in itertools.chain(
            pipeline_dataset_types.initIntermediates,
            pipeline_dataset_types.initOutputs,
            pipeline_dataset_types.intermediates,
            pipeline_dataset_types.outputs,
        ):
            found = butler.registry.queryDatasets(
                dataset_type.name, collections=output_run, findFirst=False
            )
            refs[dataset_type.name] = {ref.id: ref for ref in found}
        # Build the bipartite quantum/dataset graph: quantum -> outputs,
        # inputs -> quantum.  It must be complete before inspection starts.
        status_graph = networkx.DiGraph()
        for task_def in qg.iterTaskGraph():
            for node in qg.getNodesForTask(task_def):
                status_graph.add_node(node.nodeId)
                for out_ref in itertools.chain.from_iterable(node.quantum.outputs.values()):
                    status_graph.add_edge(node.nodeId, out_ref.id)
                for in_ref in itertools.chain.from_iterable(node.quantum.inputs.values()):
                    status_graph.add_edge(in_ref.id, node.nodeId)
        report = cls()
        for task_def in qg.iterTaskGraph():
            if task_def.logOutputDatasetName is None:
                raise RuntimeError("QG must have log outputs to use execution reports.")
            task_report = TaskExecutionReport()
            for node in qg.getNodesForTask(task_def):
                task_report.inspect_quantum(
                    node,
                    status_graph,
                    refs,
                    metadata_name=task_def.metadataDatasetName,
                    log_name=task_def.logOutputDatasetName,
                )
            report.tasks[task_def.label] = task_report
        return report

    def __str__(self) -> str:
        lines = [f"{tasklabel}:{report}" for tasklabel, report in self.tasks.items()]
        return "\n".join(lines)

426 

427 

def lookup_quantum_data_id(
    graph_uri: ResourcePathExpression, nodes: Iterable[uuid.UUID]
) -> list[DataCoordinate | None]:
    """Look up the data IDs for a set of quantum graph node IDs.

    Parameters
    ----------
    graph_uri : `ResourcePathExpression`
        URI of the quantum graph of the run.
    nodes : `~collections.abc.Iterable` [ `uuid.UUID` ]
        Quantum graph node IDs to look up.

    Returns
    -------
    data_ids : `list` [ `lsst.daf.butler.DataCoordinate` ]
        Human-readable data IDs corresponding to ``nodes`` in the quantum
        graph at ``graph_uri``, in the same order.
    """
    # Only the requested nodes are loaded from the graph.
    qg = QuantumGraph.loadUri(graph_uri, nodes=nodes)
    data_ids = []
    for node_id in nodes:
        data_ids.append(qg.getQuantumNodeByNodeId(node_id).quantum.dataId)
    return data_ids