Coverage for python / lsst / summit / extras / logUtils.py: 12%
129 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-07 09:03 +0000
1# This file is part of summit_extras.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22import logging
24import lsst.daf.butler as dafButler
26__all__ = ["LogBrowser"]
28_LOG = logging.getLogger(__name__)
class LogBrowser:
    """A convenience class for helping identify different failure modes within
    a processing collection.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler. Must contain the collection to be examined.
    taskName : `str`
        The name of the task, e.g. ``isr``, ``characterizeImage``, etc.
    collection : `str`
        The processing collection to use.
    where : `str`, optional
        A dataId search string formatted appropriately (i.e. similarly to a
        SQL WHERE clause) for a where clause in butler.registry.queryDatasets.
        E.g.
        where = ("instrument='{}' AND skymap='{}' AND "
                 "visit IN (0..100)".format("LATISS", "latiss_v1"))
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values to be injected into the ``where``
        expression, keyed by the identifiers they replace (note that the name
        of the bind key cannot be the same as any butler dimension name).
        E.g.
        where = "exposure IN (exposures)"
        bind = {"exposures": exposure_list}

    Notes
    -----
    Many tasks throw errors with values in them, meaning the ``doFailZoology``
    function doesn't collapse them down to a single failure case as one would
    like. If this is the case, take the first part of the message that is
    common among the ones you would like to be classed together, and add it to
    the class property ``SPECIAL_ZOO_CASES`` to declare a new type of error
    animal.

    example usage:
    logBrowser = LogBrowser(butler, taskName, collection, where=where,
                            bind=bind)
    fail = 'TaskError: Fatal astrometry failure detected: mean on-sky distance'
    logBrowser.SPECIAL_ZOO_CASES.append(fail)
    logBrowser.doFailZoology()
    """

    # Log lines from loggers whose names contain any of these strings are
    # suppressed when printing. NOTE: class attribute — appending affects
    # all instances (the documented usage pattern relies on this).
    IGNORE_LOGS_FROM = [
        # butler.datastores is verbose by default and not interesting to most
        "lsst.daf.butler.datastores",
    ]
    # Substrings which, when found in an error message, cause doFailZoology()
    # to truncate the message there, so that errors differing only in embedded
    # values collapse into a single "species". Also a shared class attribute.
    SPECIAL_ZOO_CASES = [
        "with gufunc signature (n?,k),(k,m?)->(n?,m?)",
    ]

    def __init__(
        self,
        butler: dafButler.Butler,
        taskName: str,
        collection: str,
        where: str = "",
        bind: dict | None = None,
    ):
        self.taskName = taskName
        self.collection = collection
        self.where = where
        self.bind = bind

        self.log = _LOG.getChild("logBrowser")
        self.butler = butler

        if self.bind is not None:
            # A bind key that never appears in the where string is inert;
            # warn so the user knows their binding isn't doing anything.
            for key in self.bind:
                if key not in self.where:
                    self.log.warning(
                        f"Key '{key}' in bind is not in the where string provided: "
                        f"'{self.where}', so no binding will take effect."
                    )

        self.dataRefs = self._getDataRefs()
        self.logs = self._loadLogs(self.dataRefs)

    def _getDataRefs(self) -> list[dafButler.DatasetRef]:
        """Query the registry for this task's log dataRefs.

        Returns
        -------
        dataRefs : `list` [`lsst.daf.butler.DatasetRef`]
            Sorted, deduplicated list of ``{taskName}_log`` dataRefs in
            the configured collection that match the ``where``/``bind``
            filter.
        """
        queryResults = self.butler.registry.queryDatasets(
            f"{self.taskName}_log",
            collections=self.collection,
            findFirst=True,
            where=self.where,
            bind=self.bind,
        )
        results = list(set(queryResults))  # deduplicate before sorting
        self.log.info(f"Found {len(results)} datasets in collection for task {self.taskName}")
        return sorted(results)

    def _loadLogs(self, dataRefs: list) -> dict[dafButler.DatasetRef, dafButler.ButlerLogRecords]:
        """Fetch the log for each dataRef from the butler.

        Parameters
        ----------
        dataRefs : `list` [`lsst.daf.butler.DatasetRef`]
            The log dataRefs to load.

        Returns
        -------
        logs : `dict` [`lsst.daf.butler.DatasetRef`, \
                `lsst.daf.butler.ButlerLogRecords`]
            Dict of loaded logs keyed by their dataRef.
        """
        logs = {}
        for i, dataRef in enumerate(dataRefs):
            if (i + 1) % 100 == 0:  # progress message for large collections
                self.log.info(f"Loaded {i + 1} logs...")
            logs[dataRef] = self.butler.get(dataRef)
        return logs

    def getPassingDataIds(self) -> list[dafButler.DataCoordinate]:
        """Return the dataIds for all successful task runs.

        Returns
        -------
        dataIds : `list` [`lsst.daf.butler.DataCoordinate`]
            DataIds whose final log line does not contain ``"failed"``.
        """
        # set for O(1) membership tests (DatasetRef is hashable)
        fails = set(self._getFailDataRefs())
        return [r.dataId for r in self.dataRefs if r not in fails]

    def getFailingDataIds(self) -> list[dafButler.DataCoordinate]:
        """Return the dataIds for all failed task runs.

        Returns
        -------
        dataIds : `list` [`lsst.daf.butler.DataCoordinate`]
            DataIds whose final log line contains ``"failed"``.
        """
        return [r.dataId for r in self._getFailDataRefs()]

    def printPasses(self) -> None:
        """Print out all the passing dataIds."""
        for dataId in self.getPassingDataIds():
            print(dataId)

    def printFails(self) -> None:
        """Print out all the failing dataIds."""
        for dataId in self.getFailingDataIds():
            print(dataId)

    def countFails(self) -> None:
        """Print a count of all the failing dataIds."""
        print(f"{len(self._getFailDataRefs())} failing cases found")

    def countPasses(self) -> None:
        """Print a count of all the passing dataIds."""
        print(f"{len(self.getPassingDataIds())} passing cases found")

    def _getFailDataRefs(self) -> list[dafButler.DatasetRef]:
        """Get a list of all the failing dataRefs.

        Note that these are dataset references to the logs, and as such are
        not fails themselves, but logs containing the fail messages, and as
        such the item of interest for the failures are their dataIds. This is
        why ``_getFailDataRefs()`` is a private method, but getFailingDataIds
        is the public API.

        Returns
        -------
        logs : `list` [`lsst.daf.butler.DatasetRef`]
            A list of all the failing dataRefs.
        """
        fails = []
        for dataRef, log in self.logs.items():
            # dereferencing a log with [] gives the individual lines in it,
            # each containing a level, message, etc.
            # the final task failure message always comes in the last line
            # of the log and contains the string 'failed' as this is the
            # pipeline executor reporting on success/fail and the time and id.
            if len(log) == 0:
                continue
            if "failed" in log[-1].message:
                fails.append(dataRef)
        return fails

    def _printLineIf(self, logLine: dafButler.logging.ButlerLogRecord) -> None:
        """Print the line if the name of the logger isn't in IGNORE_LOGS_FROM.

        Parameters
        ----------
        logLine : `lsst.daf.butler.logging.ButlerLogRecord`
            The log line to print the message from.
        """
        if not any(skipTask in logLine.name for skipTask in self.IGNORE_LOGS_FROM):
            self._printFormattedLine(logLine)

    @staticmethod
    def _printFormattedLine(logLine: dafButler.logging.ButlerLogRecord) -> None:
        """Print the line, formatted as it would be for a normal task.

        Parameters
        ----------
        logLine : `lsst.daf.butler.logging.ButlerLogRecord`
            The log line to print the message from.
        """
        print(f"{logLine.levelname} {logLine.name}: {logLine.message}")

    def printFailLogs(self, full: bool = False) -> None:
        """Print the logs of all failing task instances.

        Parameters
        ----------
        full : `bool`, optional
            Prints the full log if true, otherwise just prints the last line
            containing the exception message. This defaults to False because
            logs can be very long when printed in full, and printing all in
            full can be many many thousands of lines.
        """
        for dataRef in self._getFailDataRefs():
            print(f"\n{dataRef.dataId}:")
            log = self.logs[dataRef]
            if full:  # print the whole thing
                for line in log:
                    self._printLineIf(line)
            else:
                # print the last line from the Exception onwards if found,
                # failing over to printing the whole thing just in case.
                # NB: _getFailDataRefs() guarantees the log is non-empty.
                msg = log[-1].message
                _, sep, tail = msg.partition("Exception ")
                print(tail if sep else msg)

    def doFailZoology(self, giveExampleId: bool = False) -> None:
        """Print all the different types of error, with a count for how many of
        each type occurred.

        Parameters
        ----------
        giveExampleId : `bool`, optional
            If true, for each type of error seen, print an example dataId. This
            can be useful if you want to rerun a single image from the command
            line to debug a particular type of failure mode.
        """
        zoo: dict[str, int] = {}
        examples: dict[str, dafButler.DataCoordinate] = {}
        for dataRef in self._getFailDataRefs():
            log = self.logs[dataRef]
            msg = log[-1].message  # log[-1].message is the text of the last line of the log
            _, sep, error = msg.partition("Exception ")
            if not sep:
                # the final line didn't contain "Exception ", so we can't
                # extract an error type to classify
                self.log.warning(f"Surprise parsing log for {dataRef.dataId}")
                continue
            # collapse errors which differ only in embedded values down to
            # their common prefix so they count as one species
            for error_string in self.SPECIAL_ZOO_CASES:
                if error_string in error:
                    error = error.split(error_string)[0] + error_string + "..."
            if error not in zoo:
                zoo[error] = 1
                if giveExampleId:
                    examples[error] = dataRef.dataId
            else:
                zoo[error] += 1

        pad = 0  # don't pad when giving examples, it looks weird
        if not giveExampleId and zoo:
            maxCount = max(zoo.values())
            pad = len(str(maxCount))  # number of digits in the largest count

        for error in sorted(zoo):
            count = zoo[error]
            print(f"{count:{pad}} instance{'s' if count > 1 else ' '} of {error}")
            if giveExampleId:
                print(f"example dataId: {examples[error]}\n")

    def printSingleLog(self, dataId: dict | dafButler.DataCoordinate, full: bool = True) -> None:
        """Convenience function for printing a single log by its dataId.

        Useful because you are given example dataIds by `doFailZoology()` but
        printing all the logs and looking for that id is not practical.

        Parameters
        ----------
        dataId : `dict` or `lsst.daf.butler.dimensions.DataCoordinate`
            The dataId.
        full : `bool`, optional
            Print the log in full, or just the exception?

        Raises
        ------
        ValueError
            Raised if the dataId does not match exactly one log dataset.
        """
        dRefs = [d for d in self.dataRefs if d.dataId == dataId]
        if len(dRefs) != 1:
            raise ValueError(f"Found {len(dRefs)} for {dataId}, expected exactly 1.")
        dataRef = dRefs[0]

        print(f"\n{dataRef.dataId}:")
        log = self.logs[dataRef]
        if full:
            for line in log:
                self._printLineIf(line)
        elif log:  # guard: an empty log has no final line to parse
            msg = log[-1].message  # text of the last line of the log
            _, sep, tail = msg.partition("Exception ")
            print(tail if sep else msg)