Coverage for python / lsst / summit / extras / headerFunctions.py: 9%
181 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 17:51 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 17:51 +0000
1# This file is part of summit_extras.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22import filecmp
23import hashlib
24import logging
25import os
26import pickle
27import sys
28from typing import Any
30import astropy
31import numpy as np
32from astropy.io import fits
# redirect logger to stdout so that logger messages appear in notebooks too
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
# module-level logger shared by every function in this file
logger = logging.getLogger("headerFunctions")
def loadHeaderDictsFromLibrary(libraryFilename: str) -> tuple[dict, dict]:
    """Load cached header and hash dicts from a pickle library file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library pickle file to load from.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with values being the full primary
        header, in the same format built by `buildHashAndHeaderDicts`.
        Empty if the file does not exist or fails to load.
    dataDict : `dict`
        A dict, keyed by filename, with values being hashes of the data
        sections, in the same format built by `buildHashAndHeaderDicts`.
        Empty if the file does not exist or fails to load.
    """
    try:
        with open(libraryFilename, "rb") as pickleFile:
            headersDict, dataDict = pickle.load(pickleFile)

        # The two dicts are always pickled together as a pair, so a length
        # mismatch can only mean the library file itself is damaged.
        if len(headersDict) != len(dataDict):
            print("Loaded differing numbers of entries for the header and data dicts.")
            print(f"{len(headersDict)} vs {len(dataDict)}")
            print("Something has gone badly wrong - your library seems corrupted!")
        else:
            print(f"Loaded {len(headersDict)} values from pickle files")
    except Exception as e:
        if not os.path.exists(libraryFilename):
            print(
                f"{libraryFilename} not found. If building the header dicts for the first time this"
                " is to be expected.\nOtherwise you've misspecified the path to your library!"
            )
        else:
            print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{e}")
        return {}, {}

    return headersDict, dataDict
81def _saveToLibrary(libraryFilename: str, headersDict: dict, dataDict: dict) -> None:
82 """Persist header and data-hash dicts to a pickle library file.
84 Parameters
85 ----------
86 libraryFilename : `str`
87 Path of the library pickle file to write.
88 headersDict : `dict`
89 Dict of primary headers keyed by filename.
90 dataDict : `dict`
91 Dict of data-section hashes keyed by filename.
92 """
93 try:
94 with open(libraryFilename, "wb") as dumpFile:
95 pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL)
96 except Exception:
97 print("Failed to write pickle file! Here's a debugger so you don't lose all your work:")
98 import ipdb as pdb
100 pdb.set_trace()
103def _findKeyForValue(
104 dictionary: dict, value: Any, warnOnCollision: bool = True, returnCollisions: bool = False
105) -> Any:
106 """Return the key or keys that map to a given value.
108 Parameters
109 ----------
110 dictionary : `dict`
111 The dictionary to search.
112 value : `object`
113 The value to look up.
114 warnOnCollision : `bool`, optional
115 If `True`, log a warning when the value is held by more than one
116 key.
117 returnCollisions : `bool`, optional
118 If `True`, return the full list of matching keys. Otherwise
119 return only the first match.
121 Returns
122 -------
123 keys : `object` or `list`
124 A single matching key, or the full list of matching keys if
125 ``returnCollisions`` is `True`.
126 """
127 listOfKeys = [k for (k, v) in dictionary.items() if v == value]
128 if warnOnCollision and len(listOfKeys) > 1:
129 logger.warning("Found multiple keys for value! Returning only first.")
130 if returnCollisions:
131 return listOfKeys
132 if not listOfKeys:
133 return None
134 return listOfKeys[0]
def _hashFile(fileToHash: Any, dataHdu: int | str, sliceToUse: slice) -> str:
    """Hash a square region of one HDU's pixel array.

    Kept as a small helper so that hashing multiple HDUs (e.g. to cope
    with HDUs filled with zeros) can be added straightforwardly.

    Parameters
    ----------
    fileToHash : `astropy.io.fits.HDUList`
        The opened FITS file.
    dataHdu : `int` or `str`
        Index or extension name of the HDU holding the pixel data.
    sliceToUse : `slice`
        Slice applied to both axes of the data array before hashing.

    Returns
    -------
    hashStr : `str`
        Hex-encoded SHA-256 digest of the sliced data.
    """
    region = fileToHash[dataHdu].data[sliceToUse, sliceToUse]
    # Fix the dtype before hashing so an all-zero array always hashes to
    # ZERO_HASH, whatever the HDU's native pixel dtype happens to be.
    return _hashData(region.astype(np.int32).tobytes())
164def _hashData(data: np.ndarray) -> str:
165 """Return a hex SHA-256 hash of the given array's bytes.
167 Parameters
168 ----------
169 data : `numpy.ndarray` or `bytes`
170 The data to hash.
172 Returns
173 -------
174 hashStr : `str`
175 Hex-encoded SHA-256 digest, suitable for storing in a dict.
176 """
177 h = hashlib.sha256(data).hexdigest() # hex because we want it readable in the dict
178 return h
# Digest of an all-zero 100x100 int32 array: the value _hashFile produces
# (with the standard 100x100 corner slice) for a file whose data is all zeros.
ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32))
def buildHashAndHeaderDicts(
    fileList: list[str], dataHdu: int | str = "Segment00", libraryLocation: str | None = None
) -> tuple[dict, dict]:
    """Build dicts of primary headers and data-section hashes.

    Data is hashed using the hard-coded 100x100 corner of the pixel
    array, i.e. ``file[dataHdu].data[0:100, 0:100]``. If a library
    location is supplied, existing entries are loaded from there and
    newly computed entries are written back.

    Parameters
    ----------
    fileList : `list` [`str`]
        The fully-specified paths of the files to scrape.
    dataHdu : `int` or `str`, optional
        The HDU to use for the pixel data to hash.
    libraryLocation : `str`, optional
        Path to a pickle file used to cache results across runs. If
        provided, existing entries are loaded before processing and
        newly computed entries are written back.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with values being the full primary
        header.
    dataDict : `dict`
        A dict, keyed by filename, with values being hashes of the
        file's data section.
    """
    headersDict: dict[str, Any] = {}
    dataDict: dict[str, str] = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        with fits.open(filename) as f:
            try:
                headersDict[filename] = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    logger.warning(
                        f"Duplicate file (or hash collision!) for files {filename} and {collision}!"
                    )
                    if filecmp.cmp(filename, collision):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning(
                            "Filecmp shows files differ - "
                            "likely just zeros for data (or a genuine hash collision!)"
                        )
                dataDict[filename] = h
            except Exception:
                # best-effort: a corrupted file is reported and skipped
                # rather than aborting the whole scrape
                logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to this, so save it back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # have to pare these down, as library loaded could be a superset
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict}

    return headersDict, dataDict
def sortedAsStrings(inlist: set | list, replacementValue: str = "<BLANK VALUE>") -> list:
    """Sort values as strings, substituting a marker for blank FITS cards.

    Replaces any ``astropy.io.fits.card.Undefined`` entries with
    ``replacementValue`` so they sort consistently alongside mixed
    string and integer values.

    Parameters
    ----------
    inlist : `set` or `list`
        Values to sort.
    replacementValue : `str`, optional
        String used in place of undefined FITS card values.

    Returns
    -------
    output : `list` [`str`]
        Sorted list of string values.
    """
    asStrings = []
    for item in inlist:
        if isinstance(item, astropy.io.fits.card.Undefined):
            # str() on an Undefined card includes its object address, so
            # each blank card would otherwise look like a distinct value
            asStrings.append(replacementValue)
        else:
            asStrings.append(str(item))
    asStrings.sort()
    return asStrings
def keyValuesSetFromFiles(
    fileList: list[str],
    keys: list[str],
    joinKeys: list[str],
    noWarn: bool = False,
    printResults: bool = True,
    libraryLocation: str | None = None,
    printPerFile: bool = False,
) -> Any:
    """Get the set of values seen for a set of header keys across files.

    Parameters
    ----------
    fileList : `list` [`str`]
        The fully-specified paths of the files to scrape.
    keys : `list` [`str`]
        The header keys to scrape.
    joinKeys : `list` [`str`]
        List of keys to concatenate with ``+`` when scraping. For
        example, a header with ``FILTER1 = SDSS_u`` and
        ``FILTER2 = NB_640nm`` yields ``SDSS_u+NB_640nm``. Useful when
        looking for the actually observed combinations rather than the
        Cartesian product of the individual sets. Pass an empty list to
        skip join processing.
    noWarn : `bool`, optional
        If `True`, suppress warnings about keys missing from a header.
    printResults : `bool`, optional
        If `True`, print the collected values (and files with all-zero
        data) to stdout.
    libraryLocation : `str`, optional
        Path to a cached header library; passed through to
        `buildHashAndHeaderDicts`.
    printPerFile : `bool`, optional
        If `True`, print each ``filename<tab>key<tab>value`` triple as
        it is scraped. Prompts for confirmation when the output would
        exceed 200 lines.

    Returns
    -------
    kValues : `dict` [`str`, `set`] or `None`
        Mapping of each requested key to the set of values seen. `None`
        if ``keys`` is empty.
    joinedValues : `set` [`str`]
        Set of joined value strings. Only returned if ``joinKeys`` is
        non-empty.
    """
    print(f"Scraping headers from {len(fileList)} files...")
    if printPerFile and (len(fileList) * len(keys) > 200):
        print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
        cont = input("Are you sure? Press y to continue, anything else to quit:")
        if not cont or cont.lower()[0] != "y":
            sys.exit()

    headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)

    kValues: dict[str, set[Any]] | None
    if keys:  # necessary so that -j works on its own
        kValues = {k: set() for k in keys}
    else:
        keys = []
        kValues = None

    joinedValues: set[str] = set()

    for filename, header in headerDict.items():
        for key in keys:
            if key in header:
                assert kValues is not None
                kValues[key].add(header[key])
                if printPerFile:
                    print(f"{filename}\t{key}\t{header[key]}")
                    if len(keys) > 1 and key == keys[-1]:
                        # newline between files if multikey
                        print()
            else:
                if not noWarn:
                    logger.warning(f"{key} not found in header of {filename}")

        if joinKeys:
            # Note that CCS doesn't leave values blank, it misses the whole
            # card out for things like FILTER2 when not being used
            jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]

            # However, we do ALSO get blank cards too, so:
            # substitute <BLANK_VALUE> when there is an undefined card
            # because str(v) will give the address for each blank value
            # too, meaning each blank card looks like a different value
            joinedValues.add(
                "+".join(
                    [
                        str(v) if not isinstance(v, astropy.io.fits.card.Undefined) else "<BLANK_VALUE>"
                        for v in jVals
                    ]
                )
            )

    if printResults:
        # Do this first because it's messy
        zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
        if zeroFiles:
            print("\nFiles with zeros for data:")
            for filename in zeroFiles:
                print(filename)

        if kValues is not None:
            for key in kValues.keys():
                print(f"\nValues found for header key {key}:")
                print(f"{sortedAsStrings(kValues[key])}")

        if joinKeys:
            print(f"\nValues found when joining {joinKeys}:")
            print(f"{sortedAsStrings(joinedValues)}")

    if joinKeys:
        return kValues, joinedValues

    return kValues
def compareHeaders(filename1: str, filename2: str) -> None:
    """Compare the headers of two FITS files in detail.

    First, the two files are confirmed to have the same pixel data (by
    hashing the first 100x100 pixels in HDU 1) to ensure the files
    really should be compared.

    The function then prints:

    - keys that appear in A but not in B
    - keys that appear in B but not in A
    - keys common to both files, split into those with identical
      values and those whose values differ (showing the differing
      values side-by-side)
    - whether the post-PDU HDU ``EXTNAME`` ordering differs between
      the two files

    Parameters
    ----------
    filename1 : `str`
        Full path to the first file to compare.
    filename2 : `str`
        Full path to the second file to compare.

    Raises
    ------
    TypeError
        Raised if either filename is not a string.
    """
    # raise rather than assert: asserts are stripped under ``python -O``
    if not isinstance(filename1, str) or not isinstance(filename2, str):
        raise TypeError("compareHeaders() takes the full paths of the two files as strings")

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        if not cont or cont.lower()[0] != "y":
            sys.exit()

    # you might think you don't want to always call sorted() on the key sets
    # BUT otherwise they seem to be returned in random order each time you run
    # and that can be crazy-making
    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys).intersection(h2Keys)

    keysInh1NotInh2 = sortedAsStrings([k for k in h1Keys if k not in h2Keys])
    keysInh2NotInh1 = sortedAsStrings([k for k in h2Keys if k not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sortedAsStrings(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical) + len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sortedAsStrings(identical)}\n")
        print("Common keys with differing values were:")
        for key in sortedAsStrings(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering.
    # We have to touch the files again, which is pretty lame
    # but not doing so would require the header builder to know about
    # file pairings or return extra info, and that's not ideal either,
    # and also not worth the hassle to optimise as this is only
    # ever for a single file, not bulk file processing.
    # Collect each file's EXTNAMEs independently (rather than zipping the
    # two HDU lists with a joint condition) so that an extra or missing
    # HDU in one file shows up as a genuine length difference instead of
    # being silently truncated away by zip.
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        numbering1 = [hdu.header["EXTNAME"] for hdu in f1[1:] if "EXTNAME" in hdu.header]  # skip the PDU
        numbering2 = [hdu.header["EXTNAME"] for hdu in f2[1:] if "EXTNAME" in hdu.header]

    if numbering1 != numbering2:
        print("\nSection numbering differs between files!")
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")