Coverage for python / lsst / summit / extras / logUtils.py: 13%

128 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-22 09:18 +0000

1# This file is part of summit_extras. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22import logging 

23import math 

24 

25import lsst.daf.butler as dafButler 

26 

27__all__ = ["LogBrowser"] 

28 

29_LOG = logging.getLogger(__name__) 

30 

31 

class LogBrowser:
    """A convenience class for helping identify different failure modes within
    a processing collection.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler. Must contain the collection to be examined.
    taskName : `str`
        The name of the task, e.g. ``isr``, ``characterizeImage``, etc.
    collection : `str`
        The processing collection to use.
    where : `str`, optional
        A dataId search string formatted appropriately (i.e. similarly to a
        SQL WHERE clause) for a where clause in
        ``butler.registry.queryDatasets``. E.g.

            where = ("instrument='{}' AND skymap='{}' AND "
                     "visit IN (0..100)").format("LATISS", "latiss_v1")
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values to be injected into the ``where``
        expression, keyed by the identifiers they replace (note that the name
        of the bind key cannot be the same as any butler dimension name).
        E.g.

            where = "exposure IN (exposures)"
            bind = {"exposures": exposure_list}

    Notes
    -----
    Many tasks throw errors with values in them, meaning the ``doFailZoology``
    function doesn't collapse them down to a single failure case as one would
    like. If this is the case, take the first part of the message that is
    common among the ones you would like to be classed together, and add it to
    the class property ``SPECIAL_ZOO_CASES`` to declare a new type of error
    animal.

    Example usage:

        logBrowser = LogBrowser(butler, taskName, collection, where=where,
                                bind=bind)
        fail = 'TaskError: Fatal astrometry failure detected: mean on-sky distance'
        logBrowser.SPECIAL_ZOO_CASES.append(fail)
        logBrowser.doFailZoology()
    """

    # Loggers whose output is suppressed when printing full logs.
    IGNORE_LOGS_FROM = [
        # butler.datastores is verbose by default and not interesting to most
        "lsst.daf.butler.datastores",
    ]
    # Error-message fragments after which messages are truncated so that
    # variants of the same failure collapse into one zoology entry.
    SPECIAL_ZOO_CASES = [
        "with gufunc signature (n?,k),(k,m?)->(n?,m?)",
    ]

    def __init__(
        self,
        butler: dafButler.Butler,
        taskName: str,
        collection: str,
        where: str = "",
        bind: dict | None = None,
    ):
        self.taskName = taskName
        self.collection = collection
        self.where = where
        self.bind = bind

        self.log = _LOG.getChild("logBrowser")
        self.butler = butler

        if self.bind is not None:
            # A bind key that never appears in the where string is almost
            # certainly a user mistake, so flag it loudly up front.
            for key in self.bind:
                if key not in self.where:
                    # Logger.warn() is deprecated; use warning().
                    self.log.warning(
                        f"Key '{key}' in bind is not in the where string provided: "
                        f"'{self.where}', so no binding will take effect."
                    )

        self.dataRefs = self._getDataRefs()
        self.logs = self._loadLogs(self.dataRefs)

    def _getDataRefs(self) -> list[dafButler.DatasetRef]:
        """Get the dataRefs for the specified task and collection.

        Returns
        -------
        dataRefs : `list` [`lsst.daf.butler.DatasetRef`]
        """
        results = self.butler.registry.queryDatasets(
            f"{self.taskName}_log",
            collections=self.collection,
            findFirst=True,
            where=self.where,
            bind=self.bind,
        )
        # Deduplicate, then sort for a stable, reproducible ordering.
        results = list(set(results))
        self.log.info(f"Found {len(results)} datasets in collection for task {self.taskName}")
        return sorted(results)

    def _loadLogs(self, dataRefs: list) -> dict[dafButler.DatasetRef, dafButler.ButlerLogRecords]:
        """Load all the logs for the dataRefs.

        Parameters
        ----------
        dataRefs : `list` [`lsst.daf.butler.DatasetRef`]
            The dataRefs of the logs to load.

        Returns
        -------
        logs : `dict` {`lsst.daf.butler.DatasetRef`:
                       `lsst.daf.butler.ButlerLogRecords`}
            A dict of all the logs, keyed by their dataRef.
        """
        logs = {}
        for i, dataRef in enumerate(dataRefs):
            # Loading can be slow for big collections; give progress updates.
            if (i + 1) % 100 == 0:
                self.log.info(f"Loaded {i + 1} logs...")
            logs[dataRef] = self.butler.get(dataRef)
        return logs

    def getPassingDataIds(self) -> list[dafButler.DataCoordinate]:
        """Get the dataIds for all passes within the collection for the task.

        Returns
        -------
        dataIds : `list` [`lsst.daf.butler.dimensions.DataCoordinate`]
        """
        fails = self._getFailDataRefs()
        return [r.dataId for r in self.dataRefs if r not in fails]

    def getFailingDataIds(self) -> list[dafButler.DataCoordinate]:
        """Get the dataIds for all fails within the collection for the task.

        Returns
        -------
        dataIds : `list` [`lsst.daf.butler.dimensions.DataCoordinate`]
        """
        fails = self._getFailDataRefs()
        return [r.dataId for r in fails]

    def printPasses(self) -> None:
        """Print out all the passing dataIds."""
        for dataId in self.getPassingDataIds():
            print(dataId)

    def printFails(self) -> None:
        """Print out all the failing dataIds."""
        for dataId in self.getFailingDataIds():
            print(dataId)

    def countFails(self) -> None:
        """Print a count of all the failing dataIds."""
        print(f"{len(self._getFailDataRefs())} failing cases found")

    def countPasses(self) -> None:
        """Print a count of all the passing dataIds."""
        print(f"{len(self.getPassingDataIds())} passing cases found")

    def _getFailDataRefs(self) -> list[dafButler.DatasetRef]:
        """Get a list of all the failing dataRefs.

        Note that these are dataset references to the logs, and as such are
        not fails themselves, but logs containing the fail messages, and as
        such the item of interest for the failures are their dataIds. This is
        why ``_getFailDataRefs()`` is a private method, but getFailingDataIds
        is the public API.

        Returns
        -------
        logs : `list` [`lsst.daf.butler.DatasetRef`]
            A list of all the failing dataRefs.
        """
        fails = []
        for dataRef, log in self.logs.items():
            # Dereferencing a log with [] gives the individual lines in it,
            # each containing a level, message, etc.
            # The final task failure message always comes in the last line
            # of the log and contains the string 'failed' as this is the
            # pipeline executor reporting on success/fail and the time and id.
            if "failed" in log[-1].message:
                fails.append(dataRef)
        return fails

    def _printLineIf(self, logLine: dafButler.logging.ButlerLogRecord) -> None:
        """Print the line if the name of the logger isn't in IGNORE_LOGS_FROM.

        Parameters
        ----------
        logLine : `lsst.daf.butler.logging.ButlerLogRecord`
            The log line to print the message from.
        """
        # Skip any line emitted by (or below) an ignored logger.
        skip = any(skipTask in logLine.name for skipTask in self.IGNORE_LOGS_FROM)
        if not skip:
            self._printFormattedLine(logLine)

    @staticmethod
    def _printFormattedLine(logLine: dafButler.logging.ButlerLogRecord) -> None:
        """Print the line, formatted as it would be for a normal task.

        Parameters
        ----------
        logLine : `lsst.daf.butler.logging.ButlerLogRecord`
            The log line to print the message from.
        """
        print(f"{logLine.levelname} {logLine.name}: {logLine.message}")

    def printFailLogs(self, full: bool = False) -> None:
        """Print the logs of all failing task instances.

        Parameters
        ----------
        full : `bool`, optional
            Prints the full log if true, otherwise just prints the last line
            containing the exception message. This defaults to False because
            logs can be very long when printed in full, and printing all in
            full can be many many thousands of lines.
        """
        fails = self._getFailDataRefs()
        for dataRef in fails:
            print(f"\n{dataRef.dataId}:")
            log = self.logs[dataRef]
            if full:  # print the whole thing
                for line in log:
                    # Bug fix: was ``self._printLineIf.print(line)``, which
                    # raises AttributeError; call the method directly.
                    self._printLineIf(line)
            else:
                # Print the last line from the Exception onwards if found,
                # failing over to printing the whole thing just in case.
                msg = log[-1].message
                parts = msg.split("Exception ")
                if len(parts) == 2:
                    print(parts[1])
                else:
                    print(msg)

    def doFailZoology(self, giveExampleId: bool = False) -> None:
        """Print all the different types of error, with a count for how many of
        each type occurred.

        Parameters
        ----------
        giveExampleId : `bool`, optional
            If true, for each type of error seen, print an example dataId. This
            can be useful if you want to rerun a single image from the command
            line to debug a particular type of failure mode.
        """
        zoo = {}
        examples = {}
        fails = self._getFailDataRefs()
        for dataRef in fails:
            log = self.logs[dataRef]
            msg = log[-1].message  # log[-1].message is the text of the last line of the log
            parts = msg.split("Exception ")
            if len(parts) != 2:  # pretty sure all fails contain one and only one 'Exception' but be safe
                self.log.warning(f"Surprise parsing log for {dataRef.dataId}")
                continue
            error = parts[1]
            # Collapse known variable-valued messages to a common prefix so
            # they count as a single species in the zoo.
            for error_string in self.SPECIAL_ZOO_CASES:
                if error_string in error:
                    error = error.split(error_string)[0] + error_string + "..."
            if error not in zoo:
                zoo[error] = 1
                if giveExampleId:
                    examples[error] = dataRef.dataId
            else:
                zoo[error] += 1

        pad = 0  # don't pad when giving examples, it looks weird
        if not giveExampleId and zoo:
            # Number of digits in the largest count. Bug fix: the previous
            # math.ceil(math.log10(maxCount)) under-counted for exact powers
            # of ten (and gave 0 for a count of 1).
            pad = len(str(max(zoo.values())))

        for error in sorted(zoo.keys()):
            count = zoo[error]
            print(f"{count:{pad}} instance{'s' if count > 1 else ' '} of {error}")
            if giveExampleId:
                print(f"example dataId: {examples[error]}\n")

    def printSingleLog(self, dataId: dict | dafButler.DataCoordinate, full: bool = True) -> None:
        """Convenience function for printing a single log by its dataId.

        Useful because you are given example dataIds by `doFailZoology()` but
        printing all the logs and looking for that id is not practical.

        Parameters
        ----------
        dataId : `dict` or `lsst.daf.butler.dimensions.DataCoordinate`
            The dataId.
        full : `bool`, optional
            Print the log in full, or just the exception?

        Raises
        ------
        ValueError
            Raised if the dataId does not match exactly one dataRef.
        """
        dRefs = [d for d in self.dataRefs if d.dataId == dataId]
        if len(dRefs) != 1:
            raise ValueError(f"Found {len(dRefs)} for {dataId}, expected exactly 1.")
        dataRef = dRefs[0]

        print(f"\n{dataRef.dataId}:")
        log = self.logs[dataRef]
        if full:
            for line in log:
                self._printLineIf(line)
        else:
            msg = log[-1].message  # log[-1].message is the text of the last line of the log
            parts = msg.split("Exception ")
            if len(parts) == 2:
                print(parts[1])
            else:
                print(msg)