Coverage for python/lsst/ctrl/bps/bps_reports.py: 17%

146 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-28 10:23 +0000

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <https://www.gnu.org/licenses/>. 

27 

28"""Classes and functions used in reporting run status. 

29""" 

30 

31__all__ = ["BaseRunReport", "DetailedRunReport", "SummaryRunReport", "ExitCodesReport"] 

32 

33import abc 

34import logging 

35 

36from astropy.table import Table 

37 

38from .wms_service import WmsStates 

39 

40_LOG = logging.getLogger(__name__) 

41 

42 

class BaseRunReport(abc.ABC):
    """The base class representing a run report.

    Parameters
    ----------
    fields : `list` [ `tuple` [ `str`, `str`]]
        The list of column specification, fields, to include in the report.
        Each field has a name and a type.
    """

    def __init__(self, fields):
        # Report entries are kept as rows of a table whose columns are
        # defined by ``fields``.
        self._table = Table(dtype=fields)
        self._msg = None

    def __eq__(self, other):
        """Check if two reports hold the same entries.

        Any object which is not a `BaseRunReport` compares unequal.
        """
        if isinstance(other, BaseRunReport):
            return all(self._table == other._table)
        return False

    def __len__(self):
        """Return the number of runs in the report."""
        return len(self._table)

    def __str__(self):
        """Return the report formatted as a multi-line string."""
        return "\n".join(self._table.pformat_all())

    @property
    def message(self):
        """Extra information a method needs to pass to its caller (`str`)."""
        return self._msg

    def clear(self):
        """Remove all entries from the report."""
        self._msg = None
        self._table.remove_rows(slice(len(self)))

    def sort(self, columns, ascending=True):
        """Sort the report entries according to one or more keys.

        Parameters
        ----------
        columns : `str` | `list` [ `str` ]
            The column(s) to order the report by.
        ascending : `bool`, optional
            Sort report entries in ascending order, default.

        Raises
        ------
        AttributeError
            Raised if supplied with non-existent column(s).
        """
        if isinstance(columns, str):
            columns = [columns]
        unknown_keys = set(columns) - set(self._table.colnames)
        if unknown_keys:
            # Set iteration order is arbitrary; sort the names so the error
            # message is reproducible from run to run.
            names = ", ".join(sorted(unknown_keys))
            raise AttributeError(f"cannot sort the report entries: column(s) {names} not found")
        self._table.sort(keys=columns, reverse=not ascending)

    @classmethod
    def from_table(cls, table):
        """Create a report from a table.

        Parameters
        ----------
        table : `astropy.table.Table`
            Information about a run in a tabular form.

        Returns
        -------
        inst : `lsst.ctrl.bps.bps_reports.BaseRunReport`
            A report created based on the information in the provided table.
        """
        inst = cls(table.dtype.descr)
        # Store a copy so the report does not share the caller's table
        # object.
        inst._table = table.copy()
        return inst

    @abc.abstractmethod
    def add(self, run_report, use_global_id=False):
        """Add a single run info to the report.

        Parameters
        ----------
        run_report : `lsst.ctrl.bps.WmsRunReport`
            Information for single run.
        use_global_id : `bool`, optional
            If set, use global run id. Defaults to False which means that
            the local id will be used instead.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor).
        """

137 

138 

class SummaryRunReport(BaseRunReport):
    """A summary run report.

    Each call to `add` appends one row describing a single run: an attention
    flag, the run state, the percentage of succeeded jobs, the run id, and
    the operator, project, campaign, payload, and run name.
    """

    def add(self, run_report, use_global_id=False):
        # Docstring inherited from the base class.

        # Flag any running workflow that might need human attention.
        # Only the first matching condition is reported, in the order
        # failed, deleted, held.
        run_flag = " "
        if run_report.state == WmsStates.RUNNING:
            if run_report.job_state_counts.get(WmsStates.FAILED, 0):
                run_flag = "F"
            elif run_report.job_state_counts.get(WmsStates.DELETED, 0):
                run_flag = "D"
            elif run_report.job_state_counts.get(WmsStates.HELD, 0):
                run_flag = "H"

        # Estimate success rate.
        percent_succeeded = "UNK"
        _LOG.debug("total_number_jobs = %s", run_report.total_number_jobs)
        _LOG.debug("run_report.job_state_counts = %s", run_report.job_state_counts)
        if run_report.total_number_jobs:
            succeeded = run_report.job_state_counts.get(WmsStates.SUCCEEDED, 0)
            _LOG.debug("succeeded = %s", succeeded)
            # Multiply first and use floor division so the percentage is
            # computed exactly; going through a float ratio can land just
            # below the true value (e.g. 29 / 100 * 100 evaluates to
            # 28.999999999999996) and then get truncated to the wrong
            # integer.
            percent_succeeded = f"{int((100 * succeeded) // run_report.total_number_jobs)}"

        row = (
            run_flag,
            run_report.state.name,
            percent_succeeded,
            run_report.global_wms_id if use_global_id else run_report.wms_id,
            run_report.operator,
            run_report.project,
            run_report.campaign,
            run_report.payload,
            run_report.run,
        )
        self._table.add_row(row)

176 

177 

class DetailedRunReport(BaseRunReport):
    """A detailed run report.

    The report starts with a ``TOTAL`` row holding the job counts per state
    for the entire run, followed by one row per job label.
    """

    def add(self, run_report, use_global_id=False):
        # Docstring inherited from the base class.

        # If run summary exists, use it to get the reference job counts.
        # The summary is parsed as ';'-separated 'label:count' pairs.
        by_label_expected = {}
        if run_report.run_summary:
            for part in run_report.run_summary.split(";"):
                label, count = part.split(":")
                by_label_expected[label] = int(count)

        total = ["TOTAL"]
        total.extend([run_report.job_state_counts[state] for state in WmsStates])
        total.append(sum(by_label_expected.values()) if by_label_expected else run_report.total_number_jobs)
        self._table.add_row(total)

        # Use the provided job summary. If it doesn't exist, compile it from
        # information about individual jobs.
        if run_report.job_summary:
            job_summary = run_report.job_summary
        elif run_report.jobs:
            job_summary = compile_job_summary(run_report.jobs)
        else:
            id_ = run_report.global_wms_id if use_global_id else run_report.wms_id
            self._msg = f"WARNING: Job summary for run '{id_}' not available, report maybe incomplete."
            return

        if by_label_expected:
            job_order = list(by_label_expected)
        else:
            job_order = sorted(job_summary)
            self._msg = "WARNING: Could not determine order of pipeline, instead sorted alphabetically."
        for label in job_order:
            try:
                # Work on a copy: the counts may be adjusted below and the
                # job summary belonging to the caller's run report must not
                # be modified as a side effect of building the report.
                counts = dict(job_summary[label])
            except KeyError:
                # No information about jobs with this label; mark every
                # state count with -1.
                counts = dict.fromkeys(WmsStates, -1)
            else:
                if label in by_label_expected:
                    # Jobs expected by the run summary but not accounted for
                    # in the job summary are reported as unready.
                    already_counted = sum(counts.values())
                    if already_counted != by_label_expected[label]:
                        counts[WmsStates.UNREADY] += by_label_expected[label] - already_counted

            run = [label]
            run.extend([counts[state] for state in WmsStates])
            run.append(by_label_expected[label] if by_label_expected else -1)
            self._table.add_row(run)

    def __str__(self):
        # Left-align the first column, right-align all the others.
        alignments = ["<"] + [">"] * (len(self._table.colnames) - 1)
        lines = list(self._table.pformat_all(align=alignments))
        # Repeat the second formatted line (presumably the header separator)
        # after the third one, i.e. below the first data row, which is the
        # TOTAL row when the report was filled by ``add``.
        lines.insert(3, lines[1])
        return "\n".join(lines)

233 

234 

class ExitCodesReport(BaseRunReport):
    """An extension of run report to give information about
    error handling from the wms service.

    For each job label, the report lists the number of payload errors and
    the distinct payload exit codes, followed by the number of remaining
    (infrastructure) errors and their distinct exit codes.
    """

    def add(self, run_report, use_global_id=False):
        # Docstring inherited from the base class.

        # Use label ordering from the run summary as it should reflect
        # the ordering of the pipetasks in the pipeline.
        if not run_report.run_summary:
            id_ = run_report.global_wms_id if use_global_id else run_report.wms_id
            self._msg = f"WARNING: Job summary for run '{id_}' not available, report maybe incomplete."
            return
        labels = []
        for part in run_report.run_summary.split(";"):
            label, _ = part.split(":")
            labels.append(label)

        # Payload (e.g. pipetask) error codes:
        # * 1: general failure,
        # * 2: command line error (e.g. unknown command and/or option).
        pyld_error_codes = {1, 2}

        exit_code_summary = run_report.exit_code_summary
        for label in labels:
            exit_codes = exit_code_summary[label]

            # Any exit code which is not a payload error code is counted as
            # an infrastructure error.
            pyld_errors = [code for code in exit_codes if code in pyld_error_codes]
            infra_errors = [code for code in exit_codes if code not in pyld_error_codes]

            run = [
                label,
                len(pyld_errors),
                self._summarize_codes(pyld_errors),
                len(infra_errors),
                self._summarize_codes(infra_errors),
            ]
            self._table.add_row(run)

    @staticmethod
    def _summarize_codes(codes):
        """Format distinct exit codes as a comma-separated string.

        Parameters
        ----------
        codes : `list`
            Exit codes to summarize; duplicates are allowed. The codes are
            assumed to be mutually comparable (presumably integers).

        Returns
        -------
        summary : `str`
            Distinct codes in ascending order separated by ", " or "None"
            if there are no codes.
        """
        if not codes:
            return "None"
        # Order the codes before converting them to strings; sorting their
        # string forms would put e.g. 137 ahead of 9.
        return ", ".join(str(code) for code in sorted(set(codes)))

    def __str__(self):
        # Left-align the first column, right-align all the others.
        alignments = ["<"] + [">"] * (len(self._table.colnames) - 1)
        lines = list(self._table.pformat_all(align=alignments))
        return "\n".join(lines)

283 

284 

def compile_job_summary(jobs):
    """Build a per-label summary of job states from individual job reports.

    Parameters
    ----------
    jobs : `list` [`lsst.ctrl.bps.WmsJobReport`]
        List of job reports.

    Returns
    -------
    job_summary : `dict` [`str`, `dict` [`lsst.ctrl.bps.WmsStates`, `int`]]
        For each job label found in ``jobs``, the mapping between an
        execution status and the number of jobs with that label having
        the status.
    """
    summary = {}
    for label, members in group_jobs_by_label(jobs).items():
        grouped = group_jobs_by_state(members)
        _LOG.debug("by_label_state = %s", grouped)
        # Only the size of each state group is of interest here.
        summary[label] = {state: len(group) for state, group in grouped.items()}
    return summary

308 

309 

def group_jobs_by_state(jobs):
    """Split the jobs into groups according to their state.

    Parameters
    ----------
    jobs : `list` [`lsst.ctrl.bps.WmsJobReport`]
        Jobs to divide into groups based on state.

    Returns
    -------
    by_state : `dict`
        Mapping of job state to a list of jobs. Every member of `WmsStates`
        is present as a key; states without any jobs map to an empty list.
    """
    _LOG.debug("group_jobs_by_state: jobs=%s", jobs)
    # Start with an empty group for each known state so the result always
    # covers all of them.
    grouped = {}
    for state in WmsStates:
        grouped[state] = []
    for report in jobs:
        grouped[report.state].append(report)
    return grouped

328 

329 

def group_jobs_by_label(jobs):
    """Split the jobs into groups according to their label.

    Parameters
    ----------
    jobs : `list` [`lsst.ctrl.bps.WmsJobReport`]
        Jobs to divide into groups based on label.

    Returns
    -------
    by_label : `dict` [`str`, `list` [`lsst.ctrl.bps.WmsJobReport`]]
        Mapping of job label to a list of jobs having that label. Labels
        appear in the order they are first encountered and jobs keep their
        relative order within a group.
    """
    grouped = {}
    for report in jobs:
        grouped.setdefault(report.label, []).append(report)
    return grouped