Coverage for python/lsst/summit/extras/headerFunctions.py: 8%
180 statements

# This file is part of summit_extras.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import filecmp
import hashlib
import logging
import os
import pickle
import sys

import astropy
import numpy as np
from astropy.io import fits

# redirect logger to stdout so that logger messages appear in notebooks too
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger("headerFunctions")


def loadHeaderDictsFromLibrary(libraryFilename):
    """Load the header and hash dicts from a pickle file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library file to load from.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header, exactly as if it were built by buildHashAndHeaderDicts().

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the data
        sections, exactly as if it were built by buildHashAndHeaderDicts().
    """
    try:
        with open(libraryFilename, "rb") as pickleFile:
            headersDict, dataDict = pickle.load(pickleFile)
        if len(headersDict) != len(dataDict):
            print("Loaded differing numbers of entries for the header and data dicts.")
            print(f"{len(headersDict)} vs {len(dataDict)}")
            print("Something has gone badly wrong - your library seems corrupted!")
        else:
            print(f"Loaded {len(headersDict)} values from pickle file")
    except Exception as e:
        if not os.path.exists(libraryFilename):
            print(
                f"{libraryFilename} not found. If building the header dicts for the first time this"
                " is to be expected.\nOtherwise you've misspecified the path to your library!"
            )
        else:
            print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{e}")
        return {}, {}

    return headersDict, dataDict
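
# A minimal usage sketch for loadHeaderDictsFromLibrary(); the library path
# below is hypothetical, and a missing file just returns two empty dicts:
#
#     headersDict, dataDict = loadHeaderDictsFromLibrary("/path/to/headerLibrary.pkl")
#     for filename, header in headersDict.items():
#         print(filename, header.get("DATE-OBS"))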


def _saveToLibrary(libraryFilename, headersDict, dataDict):
    try:
        with open(libraryFilename, "wb") as dumpFile:
            pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL)
    except Exception:
        print("Failed to write pickle file! Here's a debugger so you don't lose all your work:")
        try:
            import ipdb as pdb  # nicer debugger, if available
        except ImportError:
            import pdb  # fall back to the stdlib debugger

        pdb.set_trace()


def _findKeyForValue(dictionary, value, warnOnCollision=True, returnCollisions=False):
    listOfKeys = [k for (k, v) in dictionary.items() if v == value]
    if warnOnCollision and len(listOfKeys) != 1:
        logger.warning(f"Found {len(listOfKeys)} keys for value! Returning only the first.")
    if returnCollisions:
        return listOfKeys
    return listOfKeys[0]
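
# Reverse-lookup sketch: given d = {"a.fits": "abc123", "b.fits": "abc123"},
# _findKeyForValue(d, "abc123", returnCollisions=True) returns both keys,
# while the default returns only "a.fits" (after warning about the collision).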


def _hashFile(fileToHash, dataHdu, sliceToUse):
    """Hash a slice of the data in a single HDU.

    Split out as its own function so that, if one HDU is filled with zeros,
    hashing multiple HDUs will be easy to add."""
    # tobytes() replaces ndarray.tostring(), which was deprecated and has
    # since been removed from numpy
    data = fileToHash[dataHdu].data[sliceToUse, sliceToUse].tobytes()
    h = _hashData(data)
    return h


def _hashData(data):
    h = hashlib.sha256(data).hexdigest()  # hex because we want it readable in the dict
    return h
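

# Hash of an all-zero 100x100 int32 block, i.e. what _hashFile() returns for
# an int32 image whose hashed region is all zeros; used by
# keyValuesSetFromFiles() to flag such files.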
ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32))


def buildHashAndHeaderDicts(fileList, dataHdu="Segment00", libraryLocation=None):
    """For a list of files, build dicts of hashed data and headers.

    Data is hashed using a currently-hard-coded 100x100 region of the pixels,
    i.e. file[dataHdu].data[0:100, 0:100]

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape.

    dataHdu : `str` or `int`
        The HDU to use for the pixel data to hash.

    libraryLocation : `str`, optional
        Path to a pickled library of previously-scraped headers. Files found
        in the library are not re-read, and newly-scraped files are saved
        back to it.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header.

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the file's
        data section, as defined by the hard-coded slice and the dataHdu.
    """
    headersDict = {}
    dataDict = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict.keys()]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        try:
            # open inside the try so that unreadable files are caught too
            with fits.open(filename) as f:
                headersDict[filename] = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    logger.warning(
                        f"Duplicate file (or hash collision!) for files {filename} and {collision}!"
                    )
                    if filecmp.cmp(filename, collision):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning(
                            "Filecmp shows files differ - "
                            "likely just zeros for data (or a genuine hash collision!)"
                        )
                dataDict[filename] = h
        except Exception:
            logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to this, so save it back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # have to pare these down, as the loaded library could be a superset of fileList
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict.keys()}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict.keys()}

    return headersDict, dataDict
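
# A minimal usage sketch, with hypothetical paths; supplying a library file
# means repeated calls avoid re-reading headers that were already scraped:
#
#     files = ["/data/image1.fits", "/data/image2.fits"]
#     headers, hashes = buildHashAndHeaderDicts(files, libraryLocation="/path/to/headerLibrary.pkl")
#     print(hashes[files[0]] == hashes[files[1]])  # True for duplicate pixel data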


def sorted(inlist, replacementValue="<BLANK VALUE>"):
    """Redefinition of sorted() to deal with blank values and str/int mixes."""
    from builtins import sorted as _sorted

    output = [
        str(x) if not isinstance(x, astropy.io.fits.card.Undefined) else replacementValue for x in inlist
    ]
    return _sorted(output)
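
# Sketch of why this override exists: the builtin sorted() raises TypeError
# on mixed str/int values, whereas this stringifies everything first:
#
#     sorted([3, "1", 2])                     # -> ["1", "2", "3"]
#     sorted([fits.card.Undefined(), "a"])    # -> ["<BLANK VALUE>", "a"]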


def keyValuesSetFromFiles(
    fileList, keys, joinKeys, noWarn=False, printResults=True, libraryLocation=None, printPerFile=False
):
    """For a list of FITS files, get the set of values for the given keys.

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape.

    keys : `list` of `str`
        The header keys to scrape.

    joinKeys : `list` of `str`
        List of keys to concatenate when scraping, e.g. for a header with
        FILTER1 = SDSS_u and FILTER2 = NB_640nm this would return
        SDSS_u+NB_640nm. Useful when looking for the actual set, rather than
        taking the product of all the individual values, as some combinations
        may never happen.

    noWarn : `bool`
        Suppress warnings about keys missing from a file's header.

    printResults : `bool`
        Print the collected value sets when done.

    libraryLocation : `str`, optional
        Path to a pickled header library, passed through to
        buildHashAndHeaderDicts().

    printPerFile : `bool`
        Also print each key's value for every file as it is scraped.
    """
219 print(f"Scraping headers from {len(fileList)} files...")
220 if printPerFile and (len(fileList) * len(keys) > 200):
221 print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
222 cont = input("Are you sure? Press y to continue, anything else to quit:")
223 if cont.lower()[0] != "y":
224 exit()
226 headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)
228 if keys: # necessary so that -j works on its own
229 kValues = {k: set() for k in keys}
230 else:
231 keys = []
232 kValues = None
234 if joinKeys:
235 joinedValues = set()
237 for filename in headerDict.keys():
238 header = headerDict[filename]
239 for key in keys:
240 if key in header:
241 kValues[key].add(header[key])
242 if printPerFile:
243 print(f"{filename}\t{key}\t{header[key]}")
244 if len(keys) > 1 and key == keys[-1]:
245 # newline between files if multikey
246 print()
247 else:
248 if not noWarn:
249 logger.warning(f"{key} not found in header of {filename}")
251 if joinKeys:
252 jVals = None
253 # Note that CCS doesn't leave values blank, it misses the whole
254 # card out for things like FILTER2 when not being used
255 jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]
257 # However, we do ALSO get blank cards to, so:
258 # substitute <BLANK_VALUE> when there is an undefined card
259 # because str(v) will give the address for each blank value
260 # too, meaning each blank card looks like a different value
261 joinedValues.add(
262 "+".join(
263 [
264 str(v) if not isinstance(v, astropy.io.fits.card.Undefined) else "<BLANK_VALUE>"
265 for v in jVals
266 ]
267 )
268 )
270 if printResults:
271 # Do this first because it's messy
272 zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
273 if zeroFiles:
274 print("\nFiles with zeros for data:")
275 for filename in zeroFiles:
276 print(f"{filename}")
278 if kValues is not None:
279 for key in kValues.keys():
280 print(f"\nValues found for header key {key}:")
281 print(f"{sorted(kValues[key])}")
283 if joinKeys:
284 print(f"\nValues found when joining {joinKeys}:")
285 print(f"{sorted(joinedValues)}")
287 if joinKeys:
288 return kValues, joinedValues
290 return kValues
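
# A minimal usage sketch (hypothetical paths and keys): collect the set of
# observed FILTER values, plus the set of joined FILTER1+FILTER2 pairs:
#
#     kValues, joined = keyValuesSetFromFiles(
#         fileList=["/data/image1.fits", "/data/image2.fits"],
#         keys=["FILTER"],
#         joinKeys=["FILTER1", "FILTER2"],
#     )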


def compareHeaders(filename1, filename2):
    """Compare the headers of two files in detail.

    First, the two files are checked to have the same pixel data (by hashing
    the first 100x100 pixels in HDU 1), to confirm that the files really
    should be compared.

    It then prints out:
        the keys that appear in A and not B
        the keys that appear in B but not A
        the keys that are in common, and of those in common:
            which are the same,
            which differ,
            and where different, what the differing values are

    Parameters
    ----------
    filename1 : `str`
        Full path to the first of the files to compare.

    filename2 : `str`
        Full path to the second of the files to compare.
    """
    assert isinstance(filename1, str)
    assert isinstance(filename2, str)

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        if not cont.lower().startswith("y"):  # startswith() also copes with empty input
            exit()

    # you might think you don't want to always call sorted() on the key sets
    # BUT otherwise they seem to be returned in random order each time you run
    # and that can be crazy-making

    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys).intersection(h2Keys)

    keysInh1NotInh2 = sorted([_ for _ in h1Keys if _ not in h2Keys])
    keysInh2NotInh1 = sorted([_ for _ in h2Keys if _ not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sorted(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical) + len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sorted(identical)}\n")
        print("Common keys with differing values were:")
        for key in sorted(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering. We have to
    # touch the files again, which is pretty lame, but not doing so would
    # require the header builder to know about file pairings or return extra
    # info, and that's not ideal either; it's also not worth the hassle to
    # optimise, as this is only ever for a single pair of files, not bulk
    # file processing.
    numbering1, numbering2 = [], []
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        for hduF1, hduF2 in zip(f1[1:], f2[1:]):  # skip the PDU
            if "EXTNAME" in hduF1.header and "EXTNAME" in hduF2.header:
                numbering1.append(hduF1.header["EXTNAME"])
                numbering2.append(hduF2.header["EXTNAME"])

    if numbering1 != numbering2:
        print("\nSection numbering differs between files!")
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")