Coverage for python/lsst/ctrl/bps/bps_reports.py: 17%

152 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-24 11:02 +0000

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Classes and functions used in reporting run status. 

29""" 

30 

31__all__ = ["BaseRunReport", "DetailedRunReport", "SummaryRunReport", "ExitCodesReport"] 

32 

33import abc 

34import logging 

35 

36from astropy.table import Table 

37 

38from .wms_service import WmsStates 

39 

40_LOG = logging.getLogger(__name__) 

41 

42 

class BaseRunReport(abc.ABC):
    """The base class representing a run report.

    Parameters
    ----------
    fields : `list` [ `tuple` [ `str`, `str`]]
        The list of column specification, fields, to include in the report.
        Each field has a name and a type.
    """

    def __init__(self, fields):
        # The report is backed by a table whose columns are set by ``fields``.
        self._table = Table(dtype=fields)
        self._msg = None

    def __eq__(self, other):
        # Reports are equal when every row of the underlying tables compares
        # equal; anything that is not a report is never equal.
        if isinstance(other, BaseRunReport):
            return all(self._table == other._table)
        return False

    def __len__(self):
        """Return the number of entries (table rows) in the report."""
        return len(self._table)

    def __str__(self):
        return "\n".join(self._table.pformat_all())

    @property
    def message(self):
        """Extra information a method needs to pass to its caller (`str`)."""
        return self._msg

    def clear(self):
        """Remove all entries from the report."""
        self._msg = None
        self._table.remove_rows(slice(len(self)))

    def sort(self, columns, ascending=True):
        """Sort the report entries according to one or more keys.

        Parameters
        ----------
        columns : `str` | `list` [ `str` ]
            The column(s) to order the report by.
        ascending : `bool`, optional
            Sort report entries in ascending order, default.

        Raises
        ------
        AttributeError
            Raised if supplied with non-existent column(s).
        """
        # Normalize to a list so that a single name and a collection of names
        # are handled the same way (and one-shot iterables are read once).
        columns = [columns] if isinstance(columns, str) else list(columns)
        unknown_keys = set(columns) - set(self._table.colnames)
        if unknown_keys:
            # Sets have no stable iteration order; sort the names so the
            # error message is reproducible, and stringify them so that
            # a non-string key is reported instead of breaking the join.
            missing = ", ".join(sorted(str(key) for key in unknown_keys))
            raise AttributeError(f"cannot sort the report entries: column(s) {missing} not found")
        self._table.sort(keys=columns, reverse=not ascending)

    @classmethod
    def from_table(cls, table):
        """Create a report from a table.

        Parameters
        ----------
        table : `astropy.table.Table`
            Information about a run in a tabular form.

        Returns
        -------
        inst : `lsst.ctrl.bps.bps_reports.BaseRunReport`
            A report created based on the information in the provided table.
        """
        inst = cls(table.dtype.descr)
        # Work on a copy so the report does not share the caller's table.
        inst._table = table.copy()
        return inst

    @abc.abstractmethod
    def add(self, run_report, use_global_id=False):
        """Add a single run info to the report.

        Parameters
        ----------
        run_report : `lsst.ctrl.bps.WmsRunReport`
            Information for single run.
        use_global_id : `bool`, optional
            If set, use global run id. Defaults to False which means that
            the local id will be used instead.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor).
        """

137 

138 

class SummaryRunReport(BaseRunReport):
    """A summary run report."""

    def add(self, run_report, use_global_id=False):
        # Docstring inherited from the base class.

        # Flag any running workflow that might need human attention.  The
        # checks are ordered: failed jobs take precedence over deleted ones,
        # which take precedence over held ones.
        run_flag = " "
        if run_report.state == WmsStates.RUNNING:
            attention_flags = (
                (WmsStates.FAILED, "F"),
                (WmsStates.DELETED, "D"),
                (WmsStates.HELD, "H"),
            )
            for state, flag in attention_flags:
                if run_report.job_state_counts.get(state, 0):
                    run_flag = flag
                    break

        # Estimate success rate.
        percent_succeeded = "UNK"
        _LOG.debug("total_number_jobs = %s", run_report.total_number_jobs)
        _LOG.debug("run_report.job_state_counts = %s", run_report.job_state_counts)
        if run_report.total_number_jobs:
            succeeded = run_report.job_state_counts.get(WmsStates.SUCCEEDED, 0)
            _LOG.debug("succeeded = %s", succeeded)
            # Use integer arithmetic: going through a float and truncating
            # under-reports some values (e.g. 29 / 100 * 100 evaluates to
            # 28.999999999999996, which truncates to 28).
            percent_succeeded = f"{succeeded * 100 // run_report.total_number_jobs}"

        row = (
            run_flag,
            run_report.state.name,
            percent_succeeded,
            run_report.global_wms_id if use_global_id else run_report.wms_id,
            run_report.operator,
            run_report.project,
            run_report.campaign,
            run_report.payload,
            run_report.run,
        )
        self._table.add_row(row)

176 

177 

class DetailedRunReport(BaseRunReport):
    """A detailed run report."""

    def add(self, run_report, use_global_id=False):
        # Docstring inherited from the base class.

        # If run summary exists, use it to get the reference job counts.
        # It is parsed as ";"-separated "label:count" pairs.
        by_label_expected = {}
        if run_report.run_summary:
            for part in run_report.run_summary.split(";"):
                label, count = part.split(":")
                by_label_expected[label] = int(count)

        # The first row holds the per-state totals for the whole run followed
        # by the expected number of jobs.  A state with no recorded count is
        # reported as 0.
        total = ["TOTAL"]
        total.extend(run_report.job_state_counts.get(state, 0) for state in WmsStates)
        total.append(sum(by_label_expected.values()) if by_label_expected else run_report.total_number_jobs)
        self._table.add_row(total)

        # Use the provided job summary. If it doesn't exist, compile it from
        # information about individual jobs.
        if run_report.job_summary:
            job_summary = run_report.job_summary
        elif run_report.jobs:
            job_summary = compile_job_summary(run_report.jobs)
        else:
            id_ = run_report.global_wms_id if use_global_id else run_report.wms_id
            self._msg = f"WARNING: Job summary for run '{id_}' not available, report maybe incomplete."
            return

        # Prefer the label order of the run summary; without it the best
        # that can be done is an alphabetical listing.
        if by_label_expected:
            job_order = list(by_label_expected)
        else:
            job_order = sorted(job_summary)
            self._msg = "WARNING: Could not determine order of pipeline, instead sorted alphabetically."

        for label in job_order:
            try:
                reported = job_summary[label]
            except KeyError:
                # Nothing is known about this label: mark every state with -1.
                counts = dict.fromkeys(WmsStates, -1)
            else:
                # Work on a copy so the adjustment below does not modify
                # the job summary owned by the caller's run report.
                counts = dict(reported)
                if label in by_label_expected:
                    # Jobs expected but not accounted for in any state are
                    # attributed to the UNREADY state.
                    shortfall = by_label_expected[label] - sum(counts.values())
                    if shortfall:
                        counts[WmsStates.UNREADY] = counts.get(WmsStates.UNREADY, 0) + shortfall

            run = [label]
            run.extend(counts.get(state, 0) for state in WmsStates)
            run.append(by_label_expected[label] if by_label_expected else -1)
            self._table.add_row(run)

    def __str__(self):
        # Left-align the first column and right-align all the others.
        alignments = ["<"] + [">"] * (len(self._table.colnames) - 1)
        lines = list(self._table.pformat_all(align=alignments))
        # Repeat the header separator (second line) after the first data row
        # to set the "TOTAL" row apart from the per-label rows.
        lines.insert(3, lines[1])
        return "\n".join(lines)

233 

234 

class ExitCodesReport(BaseRunReport):
    """An extension of run report to give information about
    error handling from the wms service.
    """

    def add(self, run_report, use_global_id=False):
        # Docstring inherited from the base class.

        # Without a run summary there are no job labels to report on.
        if not run_report.run_summary:
            id_ = run_report.global_wms_id if use_global_id else run_report.wms_id
            self._msg = f"WARNING: Job summary for run '{id_}' not available, report maybe incomplete."
            return

        # The run summary is parsed as ";"-separated "label:count" pairs;
        # only the labels (and their order) are needed here.
        labels = []
        for part in run_report.run_summary.split(";"):
            label, _ = part.split(":")
            labels.append(label)

        # A missing summary, or a label without an entry in it, is treated
        # the same way as a label with no exit codes.
        exit_code_summary = run_report.exit_code_summary or {}
        for label in labels:
            exit_codes = exit_code_summary.get(label) or []

            # payload errors always return 1 on failure
            pipe_error_count = sum(1 for code in exit_codes if code == 1)

            # Any other non-zero code is counted as an infrastructure error.
            infra_codes = [code for code in exit_codes if code != 0 and code != 1]
            infra_error_count = len(infra_codes)
            infra_error_codes = self._format_codes(infra_codes) if infra_codes else "None"

            self._table.add_row([label, pipe_error_count, infra_error_count, infra_error_codes])

    @staticmethod
    def _format_codes(codes):
        """Render distinct exit codes as a comma-separated string.

        Parameters
        ----------
        codes : `list`
            Exit codes, possibly with repetitions.

        Returns
        -------
        formatted : `str`
            Distinct codes joined with ", ".
        """
        distinct = set(codes)
        try:
            # Order by value so that, e.g., 2 comes before 137.
            ordered = sorted(distinct)
        except TypeError:
            # Codes of mixed, mutually unorderable types: fall back to
            # ordering by their text representation.
            ordered = sorted(distinct, key=str)
        return ", ".join(str(code) for code in ordered)

    def __str__(self):
        # Left-align the first column and right-align all the others.
        alignments = ["<"] + [">"] * (len(self._table.colnames) - 1)
        return "\n".join(self._table.pformat_all(align=alignments))

280 

281 

def compile_job_summary(jobs):
    """Build a per-label tally of job states from individual job reports.

    Parameters
    ----------
    jobs : `list`
        Reports for individual jobs.  Each is grouped by its ``label`` and
        then by its ``state`` (presumably `lsst.ctrl.bps.WmsJobReport`
        objects, as the grouping helpers document).

    Returns
    -------
    job_summary : `dict` [`str`, `dict` [`lsst.ctrl.bps.WmsStates`, `int`]]
        For each job label, a mapping of an execution state to the number
        of jobs with that label that are in that state.
    """
    summary = {}
    for label, labeled_jobs in group_jobs_by_label(jobs).items():
        per_state = group_jobs_by_state(labeled_jobs)
        _LOG.debug("by_label_state = %s", per_state)
        # Only the size of each group matters for the summary.
        summary[label] = {state: len(members) for state, members in per_state.items()}
    return summary

305 

306 

def group_jobs_by_state(jobs):
    """Partition jobs according to their state.

    Parameters
    ----------
    jobs : `list` [`lsst.ctrl.bps.WmsJobReport`]
        Jobs to partition; each must expose a ``state`` attribute.

    Returns
    -------
    by_state : `dict`
        Mapping of every member of ``WmsStates`` to the list of jobs in
        that state.  States with no jobs map to an empty list.
    """
    _LOG.debug("group_jobs_by_state: jobs=%s", jobs)
    # Start with an empty group for every known state so that the result
    # always has the complete set of keys.
    grouped = {}
    for state in WmsStates:
        grouped[state] = []
    for job in jobs:
        grouped[job.state].append(job)
    return grouped

325 

326 

def group_jobs_by_label(jobs):
    """Partition jobs according to their label.

    Parameters
    ----------
    jobs : `list` [`lsst.ctrl.bps.WmsJobReport`]
        Jobs to partition; each must expose a ``label`` attribute.

    Returns
    -------
    by_label : `dict` [`str`, `list` [`lsst.ctrl.bps.WmsJobReport`]]
        Mapping of a job label to the list of jobs carrying that label.
        Labels appear in the order they are first encountered and jobs keep
        their input order within each group.
    """
    groups = {}
    for job in jobs:
        label = job.label
        if label not in groups:
            groups[label] = []
        groups[label].append(job)
    return groups