Coverage for python/lsst/summit/extras/headerFunctions.py: 8%
180 statements
# This file is part of summit_extras.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import filecmp
import hashlib
import logging
import os
import pickle
import sys

import astropy
from astropy.io import fits
import numpy as np

# redirect logger to stdout so that logger messages appear in notebooks too
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger("headerFunctions")


def loadHeaderDictsFromLibrary(libraryFilename):
    """Load the header and hash dicts from a pickle file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library file to load from.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header, exactly as if it were built by buildHashAndHeaderDicts().

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the data
        sections, exactly as if it were built by buildHashAndHeaderDicts().
    """
    try:
        with open(libraryFilename, "rb") as pickleFile:
            headersDict, dataDict = pickle.load(pickleFile)

        if len(headersDict) != len(dataDict):
            print("Loaded differing numbers of entries for the header and data dicts.")
            print(f"{len(headersDict)} vs {len(dataDict)}")
            print("Something has gone badly wrong - your library seems corrupted!")
        else:
            print(f"Loaded {len(headersDict)} values from pickle files")
    except Exception as e:
        if not os.path.exists(libraryFilename):
            print(f"{libraryFilename} not found. If building the header dicts for the first time this"
                  " is to be expected.\nOtherwise you've misspecified the path to your library!")
        else:
            print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{e}")
        return {}, {}

    return headersDict, dataDict
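
# Example usage (a minimal sketch; the library path below is a hypothetical
# placeholder for a pickle file previously written by _saveToLibrary()):
#
#     headersDict, dataDict = loadHeaderDictsFromLibrary("/path/to/headerLibrary.pkl")
#     print(f"Library contains {len(headersDict)} files")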


def _saveToLibrary(libraryFilename, headersDict, dataDict):
    try:
        with open(libraryFilename, "wb") as dumpFile:
            pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL)
    except Exception:
        print("Failed to write pickle file! Here's a debugger so you don't lose all your work:")
        try:
            import ipdb as pdb
        except ImportError:  # fall back to the stdlib debugger if ipdb isn't installed
            import pdb
        pdb.set_trace()
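
# The two dicts are pickled together as a single (headersDict, dataDict)
# tuple, so a save/load round-trip (path hypothetical) looks like:
#
#     _saveToLibrary("/path/to/headerLibrary.pkl", headersDict, dataDict)
#     headersDict, dataDict = loadHeaderDictsFromLibrary("/path/to/headerLibrary.pkl")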


def _findKeyForValue(dictionary, value, warnOnCollision=True, returnCollisions=False):
    """Reverse lookup: find the key(s) in a dict which map to a given value."""
    listOfKeys = [k for (k, v) in dictionary.items() if v == value]
    if warnOnCollision and len(listOfKeys) != 1:
        logger.warning(f"Found {len(listOfKeys)} keys for value! Returning only the first.")
    if returnCollisions:
        return listOfKeys
    return listOfKeys[0]
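
# For example, since data hashes are stored as {filename: hash}, this gives
# the filename(s) sharing a given hash (filenames here are hypothetical):
#
#     dataDict = {"a.fits": "abc123", "b.fits": "abc123"}
#     _findKeyForValue(dataDict, "abc123", returnCollisions=True)  # ['a.fits', 'b.fits']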


def _hashFile(fileToHash, dataHdu, sliceToUse):
    """Hash a slice of the data in the specified HDU. Structured like this so
    that hashing multiple HDUs is easy to add, e.g. if one is filled with
    zeros."""
    data = fileToHash[dataHdu].data[sliceToUse, sliceToUse].tobytes()
    h = _hashData(data)
    return h


def _hashData(data):
    h = hashlib.sha256(data).hexdigest()  # hex because we want it readable in the dict
    return h


# the hash of an all-zero 100x100 int32 region, used to spot files with no data
ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32).tobytes())
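
# Comparing a file's stored hash against ZERO_HASH (assuming int32 pixel data,
# matching the dtype hashed above) flags files whose sampled region is all
# zeros:
#
#     if dataDict[filename] == ZERO_HASH:
#         print(f"{filename} has zeros for data")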


def buildHashAndHeaderDicts(fileList, dataHdu='Segment00', libraryLocation=None):
    """For a list of files, build dicts of hashed data and headers.

    Data is hashed using a currently-hard-coded 100x100 region of the pixels,
    i.e. file[dataHdu].data[0:100, 0:100]

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape

    dataHdu : `str` or `int`
        The HDU to use for the pixel data to hash.

    libraryLocation : `str`, optional
        Path to a library file of previously-scraped results, loaded at the
        start and saved back at the end so files aren't re-read on later runs.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header.

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the file's
        data section, as defined by the hashed region and dataHdu.
    """
    headersDict = {}
    dataDict = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        with fits.open(filename) as f:
            try:
                headersDict[filename] = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    logger.warning(f"Duplicate file (or hash collision!) for files {filename} and "
                                   f"{collision}!")
                    if filecmp.cmp(filename, collision):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning("Filecmp shows files differ - "
                                       "likely just zeros for data (or a genuine hash collision!)")

                dataDict[filename] = h
            except Exception:
                logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to these, so save them back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # pare these down, as the loaded library could be a superset of fileList
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict}

    return headersDict, dataDict
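
# Example usage (a minimal sketch; paths are hypothetical placeholders, and
# DATE-OBS is assumed to be present in the primary header):
#
#     files = ["/path/to/image1.fits", "/path/to/image2.fits"]
#     headersDict, dataDict = buildHashAndHeaderDicts(files)
#     print(headersDict[files[0]]["DATE-OBS"])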


def sorted(inlist, replacementValue="<BLANK VALUE>"):
    """Redefinition of sorted() to deal with blank values and str/int mixes.

    Values are all cast to str, with astropy Undefined cards replaced by
    replacementValue, so that mixed-type lists sort without error.
    """
    from builtins import sorted as _sorted
    output = [str(x) if not isinstance(x, astropy.io.fits.card.Undefined)
              else replacementValue for x in inlist]
    output = _sorted(output)
    return output
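
# This deliberately shadows the builtin within this module. For example:
#
#     sorted([2, "10", fits.card.Undefined()])
#     # -> ['10', '2', '<BLANK VALUE>']  (string sort, blank cards made readable)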


def keyValuesSetFromFiles(fileList, keys, joinKeys, noWarn=False, printResults=True,
                          libraryLocation=None, printPerFile=False):
    """For a list of FITS files, get the set of values for the given keys.

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape

    keys : `list` of `str`
        The header keys to scrape

    joinKeys : `list` of `str`
        List of keys to concatenate when scraping, e.g. for a header with
        FILTER1 = SDSS_u and FILTER2 = NB_640nm
        this would return SDSS_u+NB_640nm
        Useful when looking for the actual set, rather than taking the product
        of all the individual values, as some combinations may never happen.

    noWarn : `bool`
        Suppress warnings about keys missing from a file's header?

    printResults : `bool`
        Print the sets of values found, and any files with zeros for data?

    libraryLocation : `str`, optional
        Path to a header library file, passed to buildHashAndHeaderDicts().

    printPerFile : `bool`
        Print each key's value for each file as it is scraped?
    """
    print(f"Scraping headers from {len(fileList)} files...")
    if printPerFile and (len(fileList) * len(keys) > 200):
        print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
        cont = input("Are you sure? Press y to continue, anything else to quit:")
        if not cont.lower().startswith('y'):
            sys.exit()

    headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)

    if keys:  # necessary so that -j works on its own
        kValues = {k: set() for k in keys}
    else:
        keys = []
        kValues = None

    if joinKeys:
        joinedValues = set()

    for filename, header in headerDict.items():
        for key in keys:
            if key in header:
                kValues[key].add(header[key])
                if printPerFile:
                    print(f"{filename}\t{key}\t{header[key]}")
                    if len(keys) > 1 and key == keys[-1]:
                        # newline between files if multikey
                        print()
            else:
                if not noWarn:
                    logger.warning(f"{key} not found in header of {filename}")

        if joinKeys:
            # Note that CCS doesn't leave values blank, it misses the whole
            # card out for things like FILTER2 when not being used
            jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]

            # However, we ALSO get blank cards, so substitute <BLANK_VALUE>
            # when there is an undefined card, because str(v) gives the
            # address for each blank value, which would make each blank card
            # look like a different value
            joinedValues.add("+".join([str(v) if not isinstance(v, astropy.io.fits.card.Undefined)
                                       else "<BLANK_VALUE>" for v in jVals]))

    if printResults:
        # Do this first because it's messy
        zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
        if zeroFiles:
            print("\nFiles with zeros for data:")
            for filename in zeroFiles:
                print(f"{filename}")

        if kValues is not None:
            for key in kValues.keys():
                print(f"\nValues found for header key {key}:")
                print(f"{sorted(kValues[key])}")

        if joinKeys:
            print(f"\nValues found when joining {joinKeys}:")
            print(f"{sorted(joinedValues)}")

    if joinKeys:
        return kValues, joinedValues

    return kValues
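
# Example usage (a minimal sketch; paths and header keys are hypothetical):
#
#     files = ["/path/to/image1.fits", "/path/to/image2.fits"]
#     kValues, joined = keyValuesSetFromFiles(files, keys=["OBJECT"],
#                                             joinKeys=["FILTER1", "FILTER2"])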


def compareHeaders(filename1, filename2):
    """Compare the headers of two files in detail.

    First, the two files are confirmed to have the same pixel data to ensure
    the files should be being compared (by hashing the first 100x100 pixels
    in the default data HDU).

    It then prints out:
        the keys that appear in A and not B
        the keys that appear in B but not A
        the keys that are in common, and of those in common:
            which are the same,
            which differ,
            and where different, what the differing values are

    Parameters
    ----------
    filename1 : `str`
        Full path to the first of the files to compare

    filename2 : `str`
        Full path to the second of the files to compare
    """
    assert isinstance(filename1, str)
    assert isinstance(filename2, str)

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        if not cont.lower().startswith('y'):
            sys.exit()

    # You might think you don't want to always call sorted() on the key sets,
    # but otherwise they seem to be returned in a random order on each run,
    # and that can be crazy-making

    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys)
    commonKeys = commonKeys.intersection(h2Keys)

    keysInh1NotInh2 = sorted([_ for _ in h1Keys if _ not in h2Keys])
    keysInh2NotInh1 = sorted([_ for _ in h2Keys if _ not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sorted(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical) + len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sorted(identical)}\n")
        print("Common keys with differing values were:")
        for key in sorted(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering.
    # We have to touch the files again, which is pretty lame, but not doing
    # so would require the header builder to know about file pairings or to
    # return extra info, and neither is ideal, nor worth the hassle to
    # optimise, as this is only ever for a single pair of files, not bulk
    # file processing. The lists are collected separately, rather than by
    # zipping over the two files, so that a length mismatch is detectable.
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        numbering1 = [hdu.header['EXTNAME'] for hdu in f1[1:] if 'EXTNAME' in hdu.header]  # skip the PDU
        numbering2 = [hdu.header['EXTNAME'] for hdu in f2[1:] if 'EXTNAME' in hdu.header]

    if numbering1 != numbering2:
        print('\nSection numbering differs between files!')
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")
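

# A minimal usage sketch, runnable as a script (the paths below are
# hypothetical placeholders; substitute two real FITS files to compare):
if __name__ == "__main__":
    file1 = "/path/to/first.fits"
    file2 = "/path/to/second.fits"
    compareHeaders(file1, file2)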