# This file is part of summit_extras.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import filecmp
import hashlib
import logging
import os
import pickle
import sys
from typing import Any

import astropy
import numpy as np
from astropy.io import fits

# redirect logger to stdout so that logger messages appear in notebooks too
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger("headerFunctions")


def loadHeaderDictsFromLibrary(libraryFilename: str) -> tuple[dict, dict]:
    """Load the header and hash dicts from a pickle file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library file to load from.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header, exactly as if it were built by buildHashAndHeaderDicts().
    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the data
        sections, exactly as if it were built by buildHashAndHeaderDicts().
    """
    try:
        with open(libraryFilename, "rb") as pickleFile:
            headersDict, dataDict = pickle.load(pickleFile)

        if len(headersDict) != len(dataDict):
            print("Loaded differing numbers of entries for the header and data dicts.")
            print(f"{len(headersDict)} vs {len(dataDict)}")
            print("Something has gone badly wrong - your library seems corrupted!")
        else:
            print(f"Loaded {len(headersDict)} entries from the pickle file")
    except Exception as e:
        if not os.path.exists(libraryFilename):
            print(
                f"{libraryFilename} not found. If building the header dicts for the first time this"
                " is to be expected.\nOtherwise you've misspecified the path to your library!"
            )
        else:
            print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{e}")
        return {}, {}

    return headersDict, dataDict


def _saveToLibrary(libraryFilename: str, headersDict: dict, dataDict: dict) -> None:
    """Pickle the header and hash dicts to the library file."""
    try:
        with open(libraryFilename, "wb") as dumpFile:
            pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL)
    except Exception:
        print("Failed to write pickle file! Here's a debugger so you don't lose all your work:")
        try:
            import ipdb as pdb
        except ImportError:  # fall back to the stdlib debugger if ipdb isn't installed
            import pdb

        pdb.set_trace()
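
# A minimal round-trip sketch (not part of the original module; the path and
# dict contents are hypothetical): anything saved with _saveToLibrary() should
# come back unchanged via loadHeaderDictsFromLibrary().
#
#     _saveToLibrary("/tmp/headerLibrary.pkl", {"a.fits": "hdr"}, {"a.fits": "hash"})
#     headers, hashes = loadHeaderDictsFromLibrary("/tmp/headerLibrary.pkl")
#     # headers == {"a.fits": "hdr"} and hashes == {"a.fits": "hash"}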


def _findKeyForValue(
    dictionary: dict, value: Any, warnOnCollision: bool = True, returnCollisions: bool = False
) -> Any:
    """Reverse lookup: find the key (or keys) in a dict with the given value."""
    listOfKeys = [k for (k, v) in dictionary.items() if v == value]
    if warnOnCollision and len(listOfKeys) != 1:
        logger.warning(f"Found {len(listOfKeys)} keys for value! Returning only the first.")
    if returnCollisions:
        return listOfKeys
    return listOfKeys[0]
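
# Behaviour sketch (illustrative values only): _findKeyForValue() is a simple
# reverse lookup, with optional collection of all matches when several keys
# map to the same value.
#
#     d = {"a.fits": "h1", "b.fits": "h1", "c.fits": "h2"}
#     _findKeyForValue(d, "h2")                         # -> "c.fits"
#     _findKeyForValue(d, "h1", returnCollisions=True)  # -> ["a.fits", "b.fits"] (warns)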


def _hashFile(fileToHash: fits.HDUList, dataHdu: int | str, sliceToUse: slice) -> str:
    """Hash a slice of the data in the given HDU.

    Kept as a separate function so that, if hashing multiple HDUs is ever
    desired (e.g. when one is filled with zeros), it will be easy to add.
    """
    # tobytes() rather than tostring(), which is deprecated/removed in modern numpy
    data = fileToHash[dataHdu].data[sliceToUse, sliceToUse].tobytes()
    h = _hashData(data)
    return h


def _hashData(data: bytes | np.ndarray) -> str:
    h = hashlib.sha256(data).hexdigest()  # hex because we want it readable in the dict
    return h


# the hash of an all-zero 100x100 int32 data section, used to spot empty files
ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32))
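
# A minimal sketch (not part of the original module) of how the hashing pieces
# fit together, using an in-memory FITS file; the EXTNAME "Segment00" mirrors
# the default dataHdu used in buildHashAndHeaderDicts() below:
#
#     hdu = fits.ImageHDU(data=np.zeros((200, 200), dtype=np.int32), name="Segment00")
#     hdul = fits.HDUList([fits.PrimaryHDU(), hdu])
#     _hashFile(hdul, "Segment00", slice(0, 100)) == ZERO_HASH  # -> True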


def buildHashAndHeaderDicts(
    fileList: list[str], dataHdu: int | str = "Segment00", libraryLocation: str | None = None
) -> tuple[dict, dict]:
    """For a list of files, build dicts of hashed data and headers.

    Data is hashed using a currently-hard-coded 100x100 region of the pixels,
    i.e. file[dataHdu].data[0:100, 0:100]

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape.
    dataHdu : `str` or `int`
        The HDU to use for the pixel data to hash.
    libraryLocation : `str`, optional
        Path to a pickled library of previously-scraped results, used to
        avoid re-reading files that are already known about.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header.
    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the file's
        data section, as defined by the hard-coded 100x100 slice and the
        dataHdu.
    """
    headersDict = {}
    dataDict = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict.keys()]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        try:  # open inside the try so that files which fail to open are caught too
            with fits.open(filename) as f:
                headersDict[filename] = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    logger.warning(
                        f"Duplicate file (or hash collision!) for files {filename} and {collision}!"
                    )
                    if filecmp.cmp(filename, collision):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning(
                            "Filecmp shows files differ - "
                            "likely just zeros for data (or a genuine hash collision!)"
                        )

                dataDict[filename] = h
        except Exception:
            logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to this, so save it back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # have to pare these down, as the library loaded could be a superset
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict.keys()}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict.keys()}

    return headersDict, dataDict
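
# Example usage (hypothetical paths, shown as a sketch only):
#
#     files = ["/data/image_000.fits", "/data/image_001.fits"]
#     headers, hashes = buildHashAndHeaderDicts(files, libraryLocation="/tmp/headerLibrary.pkl")
#     headers[files[0]]  # the full primary header of the first file
#     hashes[files[0]]   # sha256 hex digest of its first 100x100 pixels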


def sorted(inlist: list, replacementValue: str = "<BLANK VALUE>") -> list:
    """Redefinition of sorted() to deal with blank values and str/int mixes"""
    from builtins import sorted as _sorted

    output = [
        str(x) if not isinstance(x, astropy.io.fits.card.Undefined) else replacementValue for x in inlist
    ]
    output = _sorted(output)
    return output
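
# Why the redefinition above exists: the builtin sorted() raises TypeError on
# str/int mixes, and astropy's Undefined blank cards have no useful ordering,
# so everything is stringified first (illustrative values):
#
#     sorted([2, "SDSS_u", 1])  # -> ["1", "2", "SDSS_u"], everything as str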


def keyValuesSetFromFiles(
    fileList: list[str],
    keys: list[str],
    joinKeys: list[str],
    noWarn: bool = False,
    printResults: bool = True,
    libraryLocation: str | None = None,
    printPerFile: bool = False,
) -> dict | tuple[dict | None, set] | None:
    """For a list of FITS files, get the set of values for the given keys.

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape.
    keys : `list` of `str`
        The header keys to scrape.
    joinKeys : `list` of `str`
        List of keys to concatenate when scraping, e.g. for a header with
        FILTER1 = SDSS_u and FILTER2 = NB_640nm
        this would return SDSS_u+NB_640nm.
        Useful when looking for the actual set, rather than taking the product
        of all the individual values, as some combinations may never happen.
    noWarn : `bool`
        Suppress warnings about keys missing from a file's header.
    printResults : `bool`
        Print the collected sets of values to stdout.
    printPerFile : `bool`
        Print each key's value for every file as it is scraped.

    Returns
    -------
    kValues : `dict` or `None`
        Dict, keyed by header key, of the sets of values found, or None if
        no keys were requested.
    joinedValues : `set`, optional
        The set of joined values; only returned if joinKeys was supplied.
    """
    print(f"Scraping headers from {len(fileList)} files...")
    if printPerFile and (len(fileList) * len(keys) > 200):
        print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
        cont = input("Are you sure? Press y to continue, anything else to quit:")
        if not cont.lower().startswith("y"):
            sys.exit()

    headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)

    if keys:  # necessary so that -j works on its own
        kValues = {k: set() for k in keys}
    else:
        keys = []
        kValues = None

    if joinKeys:
        joinedValues = set()

    for filename in headerDict.keys():
        header = headerDict[filename]
        for key in keys:
            if key in header:
                kValues[key].add(header[key])
                if printPerFile:
                    print(f"{filename}\t{key}\t{header[key]}")
                    if len(keys) > 1 and key == keys[-1]:
                        # newline between files if multikey
                        print()
            else:
                if not noWarn:
                    logger.warning(f"{key} not found in header of {filename}")

        if joinKeys:
            # Note that CCS doesn't leave values blank; it omits the whole
            # card for things like FILTER2 when not being used
            jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]

            # However, we ALSO get blank cards too, so:
            # substitute <BLANK_VALUE> when there is an undefined card,
            # because str(v) would give the address for each blank value,
            # meaning each blank card would look like a different value
            joinedValues.add(
                "+".join(
                    [
                        str(v) if not isinstance(v, astropy.io.fits.card.Undefined) else "<BLANK_VALUE>"
                        for v in jVals
                    ]
                )
            )

    if printResults:
        # Do this first because it's messy
        zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
        if zeroFiles:
            print("\nFiles with zeros for data:")
            for filename in zeroFiles:
                print(f"{filename}")

        if kValues is not None:
            for key in kValues.keys():
                print(f"\nValues found for header key {key}:")
                print(f"{sorted(kValues[key])}")

        if joinKeys:
            print(f"\nValues found when joining {joinKeys}:")
            print(f"{sorted(joinedValues)}")

    if joinKeys:
        return kValues, joinedValues

    return kValues
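
# Example usage (hypothetical paths and keys, shown as a sketch only):
#
#     # the set of values seen for each key, as a dict of sets
#     values = keyValuesSetFromFiles(files, keys=["EXPTIME", "IMGTYPE"], joinKeys=[])
#
#     # joined combinations actually seen, e.g. {"SDSS_u+NB_640nm", ...};
#     # values is None here because no individual keys were requested
#     values, joined = keyValuesSetFromFiles(files, keys=[], joinKeys=["FILTER1", "FILTER2"])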


def compareHeaders(filename1: str, filename2: str) -> None:
    """Compare the headers of two files in detail.

    First, the two files are confirmed to have the same pixel data, to ensure
    the files should be being compared (by hashing the first 100x100 pixels
    in the default data HDU, "Segment00").

    It then prints out:
        the keys that appear in A and not B
        the keys that appear in B but not A
        the keys that are in common, and of those in common:
            which are the same,
            which differ,
            and where different, what the differing values are

    Parameters
    ----------
    filename1 : `str`
        Full path to the first of the files to compare.
    filename2 : `str`
        Full path to the second of the files to compare.
    """
    assert isinstance(filename1, str)
    assert isinstance(filename2, str)

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        if not cont.lower().startswith("y"):
            sys.exit()

    # you might think you don't want to always call sorted() on the key sets
    # BUT otherwise they seem to be returned in random order each time you run
    # and that can be crazy-making

    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys)
    commonKeys = commonKeys.intersection(h2Keys)

    keysInh1NotInh2 = sorted([_ for _ in h1Keys if _ not in h2Keys])
    keysInh2NotInh1 = sorted([_ for _ in h2Keys if _ not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sorted(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical) + len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sorted(identical)}\n")
        print("Common keys with differing values were:")
        for key in sorted(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering.
    # We have to touch the files again, which is pretty lame,
    # but not doing so would require the header builder to know about
    # file pairings or return extra info, and that's not ideal either,
    # and also not worth the hassle to optimise as this is only
    # ever for a single file, not bulk file processing
    numbering1, numbering2 = [], []
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        for hduF1, hduF2 in zip(f1[1:], f2[1:]):  # skip the PDU
            if "EXTNAME" in hduF1.header and "EXTNAME" in hduF2.header:
                numbering1.append(hduF1.header["EXTNAME"])
                numbering2.append(hduF2.header["EXTNAME"])

    if numbering1 != numbering2:
        print("\nSection numbering differs between files!")
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")
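

if __name__ == "__main__":
    # A minimal command-line sketch, not part of the original module: compare
    # the headers of two files given as arguments.
    if len(sys.argv) == 3:
        compareHeaders(sys.argv[1], sys.argv[2])
    else:
        print("Usage: headerFunctions.py <file1> <file2>")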