# This file is part of summit_extras.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import logging

import lsst.daf.butler as dafButler

__all__ = ["LogBrowser"]

_LOG = logging.getLogger(__name__)


class LogBrowser:
    """A convenience class for helping identify different failure modes within
    a processing collection.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler. Must contain the collection to be examined.
    taskName : `str`
        The name of the task, e.g. ``isr``, ``characterizeImage``, etc.
    collection : `str`
        The processing collection to use.
    where : `str`, optional
        A dataId search string formatted appropriately (i.e. similarly to a
        SQL WHERE clause) for a where clause in butler.registry.queryDatasets.
        E.g.
        where = ("instrument='{}' AND skymap='{}' AND "
                 "visit IN (0..100)".format("LATISS", "latiss_v1"))
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values to be injected into the ``where``
        expression, keyed by the identifiers they replace (note that the name
        of the bind key cannot be the same as any butler dimension name).
        E.g.
        where = "exposure IN (exposures)"
        bind = {"exposures": exposure_list}

    Notes
    -----
    Many tasks throw errors with values in them, meaning the ``doFailZoology``
    function doesn't collapse them down to a single failure case as one would
    like. If this is the case, take the first part of the message that is
    common among the ones you would like to be classed together, and add it to
    the class property ``SPECIAL_ZOO_CASES`` to declare a new type of error
    animal.

    Example usage:
        logBrowser = LogBrowser(butler, taskName, collection, where=where,
                                bind=bind)
        fail = 'TaskError: Fatal astrometry failure detected: mean on-sky distance'
        logBrowser.SPECIAL_ZOO_CASES.append(fail)
        logBrowser.doFailZoology()
    """

    IGNORE_LOGS_FROM = [
        # butler.datastores is verbose by default and not interesting to most
        "lsst.daf.butler.datastores",
    ]
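    # Message fragments used by doFailZoology() to group failure messages that
    # differ only in embedded values; see the Notes section of the class
    # docstring for how to extend this list.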
    SPECIAL_ZOO_CASES = [
        "with gufunc signature (n?,k),(k,m?)->(n?,m?)",
    ]

    def __init__(
        self,
        butler: dafButler.Butler,
        taskName: str,
        collection: str,
        where: str = "",
        bind: dict | None = None,
    ):
        self.taskName = taskName
        self.collection = collection
        self.where = where
        self.bind = bind

        self.log = _LOG.getChild("logBrowser")
        self.butler = butler

        if self.bind is not None:
            for key in self.bind.keys():
                if key not in self.where:
                    self.log.warning(
                        f"Key '{key}' in bind is not in the where string provided: "
                        f"'{self.where}', so no binding will take effect."
                    )

        self.dataRefs = self._getDataRefs()
        self.logs = self._loadLogs(self.dataRefs)

    def _getDataRefs(self) -> list[dafButler.DatasetRef]:
        """Query the registry for this task's log dataRefs.

        Returns
        -------
        dataRefs : `list` [`lsst.daf.butler.DatasetRef`]
            Sorted, deduplicated list of ``{taskName}_log`` dataRefs in
            the configured collection that match the ``where``/``bind``
            filter.
        """
        queryResults = self.butler.registry.queryDatasets(
            f"{self.taskName}_log",
            collections=self.collection,
            findFirst=True,
            where=self.where,
            bind=self.bind,
        )
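        # deduplicate the query results before sorting, so that the dataRefs
        # come back in a stable, repeatable order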
        results = list(set(queryResults))
        self.log.info(f"Found {len(results)} datasets in collection for task {self.taskName}")
        return sorted(results)

    def _loadLogs(self, dataRefs: list) -> dict[dafButler.DatasetRef, dafButler.ButlerLogRecords]:
        """Fetch the log for each dataRef from the butler.

        Parameters
        ----------
        dataRefs : `list` [`lsst.daf.butler.DatasetRef`]
            The log dataRefs to load.

        Returns
        -------
        logs : `dict` [`lsst.daf.butler.DatasetRef`, \
            `lsst.daf.butler.ButlerLogRecords`]
            Dict of loaded logs keyed by their dataRef.
        """
        logs = {}
        for i, dataRef in enumerate(dataRefs):
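            # loading many logs one-by-one can be slow, so report progress
            # every 100 datasets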
            if (i + 1) % 100 == 0:
                self.log.info(f"Loaded {i + 1} logs...")
            log = self.butler.get(dataRef)
            logs[dataRef] = log
        return logs

    def getPassingDataIds(self) -> list[dafButler.DataCoordinate]:
        """Return the dataIds for all successful task runs.

        Returns
        -------
        dataIds : `list` [`lsst.daf.butler.DataCoordinate`]
            DataIds whose final log line does not contain ``"failed"``.
        """
        fails = self._getFailDataRefs()
        passes = [r.dataId for r in self.dataRefs if r not in fails]
        return passes

    def getFailingDataIds(self) -> list[dafButler.DataCoordinate]:
        """Return the dataIds for all failed task runs.

        Returns
        -------
        dataIds : `list` [`lsst.daf.butler.DataCoordinate`]
            DataIds whose final log line contains ``"failed"``.
        """
        fails = self._getFailDataRefs()
        return [r.dataId for r in fails]

    def printPasses(self) -> None:
        """Print out all the passing dataIds."""
        passes = self.getPassingDataIds()
        for dataId in passes:
            print(dataId)

    def printFails(self) -> None:
        """Print out all the failing dataIds."""
        fails = self.getFailingDataIds()
        for dataId in fails:
            print(dataId)

    def countFails(self) -> None:
        """Print a count of all the failing dataIds."""
        print(f"{len(self._getFailDataRefs())} failing cases found")

    def countPasses(self) -> None:
        """Print a count of all the passing dataIds."""
        print(f"{len(self.getPassingDataIds())} passing cases found")

    def _getFailDataRefs(self) -> list[dafButler.DatasetRef]:
        """Get a list of all the failing dataRefs.

        Note that these are dataset references to the logs: they are not the
        failures themselves, but the logs containing the failure messages, so
        the items of interest are their dataIds. This is why
        ``_getFailDataRefs()`` is a private method while ``getFailingDataIds``
        is the public API.

        Returns
        -------
        dataRefs : `list` [`lsst.daf.butler.DatasetRef`]
            A list of all the failing log dataRefs.
        """
        fails = []
        for dataRef, log in self.logs.items():
            # dereferencing a log with [] gives the individual lines in it,
            # each containing a level, message, etc.
            # the final task failure message always comes in the last line
            # of the log and contains the string 'failed' as this is the
            # pipeline executor reporting on success/fail and the time and id.
            if len(log) == 0:
                continue
            if log[-1].message.find("failed") != -1:
                fails.append(dataRef)
        return fails

    def _printLineIf(self, logLine: dafButler.logging.ButlerLogRecord) -> None:
        """Print the line if the name of the logger isn't in IGNORE_LOGS_FROM.

        Parameters
        ----------
        logLine : `lsst.daf.butler.logging.ButlerLogRecord`
            The log line to print the message from.
        """
        skip = False
        for skipTask in self.IGNORE_LOGS_FROM:
            if logLine.name.find(skipTask) != -1:
                skip = True
                break
        if not skip:
            self._printFormattedLine(logLine)

    @staticmethod
    def _printFormattedLine(logLine: dafButler.logging.ButlerLogRecord) -> None:
        """Print the line, formatted as it would be for a normal task.

        Parameters
        ----------
        logLine : `lsst.daf.butler.logging.ButlerLogRecord`
            The log line to print the message from.
        """
        print(f"{logLine.levelname} {logLine.name}: {logLine.message}")

    def printFailLogs(self, full: bool = False) -> None:
        """Print the logs of all failing task instances.

        Parameters
        ----------
        full : `bool`, optional
            Prints the full log if True, otherwise just prints the last line
            containing the exception message. This defaults to False because
            logs can be very long when printed in full, and printing them all
            in full can run to many thousands of lines.
        """
        fails = self._getFailDataRefs()
        for dataRef in fails:
            print(f"\n{dataRef.dataId}:")
            log = self.logs[dataRef]
            if full:  # print the whole thing
                for line in log:
                    self._printLineIf(line)
            else:
                # print the last line from the Exception onwards if found,
                # falling back to printing the whole message just in case.
                msg = log[-1].message
                head, sep, tail = msg.partition("Exception ")
                if sep:
                    print(tail)
                else:
                    print(msg)

    def doFailZoology(self, giveExampleId: bool = False) -> None:
        """Print all the different types of error, with a count of how many of
        each type occurred.

        Parameters
        ----------
        giveExampleId : `bool`, optional
            If True, for each type of error seen, print an example dataId. This
            can be useful if you want to rerun a single image from the command
            line to debug a particular type of failure mode.
        """
        zoo = {}
        examples = {}
        fails = self._getFailDataRefs()
        for dataRef in fails:
            log = self.logs[dataRef]
            msg = log[-1].message  # log[-1].message is the text of the last line of the log
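            # the final log line is expected to contain the exception text
            # after the word "Exception"; the text after it is used as the
            # grouping key for the zoology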
            head, sep, tail = msg.partition("Exception ")
            if not sep:
                self.log.warning(f"Surprise parsing log for {dataRef.dataId}")
                continue
            else:
                error = tail
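            # collapse any error matching a special case down to that common
            # prefix, so that messages differing only in embedded values are
            # counted together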
            for error_string in self.SPECIAL_ZOO_CASES:
                if error.find(error_string) != -1:
                    error = error.split(error_string)[0] + error_string + "..."
            if error not in zoo:
                zoo[error] = 1
                if giveExampleId:
                    examples[error] = dataRef.dataId
            else:
                zoo[error] += 1

        pad = 0  # don't pad when giving examples, it looks weird
        if not giveExampleId:
            if zoo.values():
                maxCount = max([v for v in zoo.values()])
                pad = len(str(maxCount))  # number of digits in the largest count

        for error in sorted(zoo.keys()):
            count = zoo[error]
            print(f"{count:{pad}} instance{'s' if count > 1 else ' '} of {error}")
            if giveExampleId:
                print(f"example dataId: {examples[error]}\n")

    def printSingleLog(self, dataId: dict | dafButler.DataCoordinate, full: bool = True) -> None:
        """Convenience function for printing a single log by its dataId.

        Useful because `doFailZoology()` gives you example dataIds, but
        printing all the logs and searching for that id is not practical.

        Parameters
        ----------
        dataId : `dict` or `lsst.daf.butler.DataCoordinate`
            The dataId.
        full : `bool`, optional
            Print the log in full, or just the exception?
        """
        dRefs = [d for d in self.dataRefs if d.dataId == dataId]
        if len(dRefs) != 1:
            raise ValueError(f"Found {len(dRefs)} dataRefs for {dataId}, expected exactly 1.")
        dataRef = dRefs[0]

        print(f"\n{dataRef.dataId}:")
        log = self.logs[dataRef]
        if full:
            for line in log:
                self._printLineIf(line)
        else:
            msg = log[-1].message  # log[-1].message is the text of the last line of the log
            head, sep, tail = msg.partition("Exception ")
            if sep:
                print(tail)
            else:
                print(msg)
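

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library API. The repo path, task
    # name and collection below are placeholders and must be replaced with
    # values that exist in your butler repository.
    butler = dafButler.Butler("/path/to/repo")
    logBrowser = LogBrowser(butler, "calibrate", "u/example/processingCollection")
    logBrowser.countPasses()
    logBrowser.countFails()
    logBrowser.doFailZoology(giveExampleId=True)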