Coverage for python/lsst/summit/extras/headerFunctions.py: 8%

180 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-08-12 11:23 +0000

1# This file is part of summit_extras. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22import logging 

23import astropy 

24from astropy.io import fits 

25import filecmp 

26import sys 

27import os 

28import pickle 

29import hashlib 

30import numpy as np 

31 

# Route log records to stdout so they are visible in notebooks as well as
# terminals (the default stderr stream is often hidden in notebook UIs).
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger("headerFunctions")

38 

39 

def loadHeaderDictsFromLibrary(libraryFilename):
    """Load the header and hash dicts from a pickle file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library file to load from

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header, exactly as if it were built by buildHashAndHeaderDicts().

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the data
        sections, exactly as if it were built by buildHashAndHeaderDicts().

    Notes
    -----
    Returns a pair of empty dicts on any failure, so callers can always
    treat the result as a (possibly empty) library.
    """
    try:
        with open(libraryFilename, "rb") as pickleFile:
            headersDict, dataDict = pickle.load(pickleFile)
    except FileNotFoundError:
        # A missing file is expected on first use, so give a friendly message
        # rather than the generic failure one.
        print(f"{libraryFilename} not found. If building the header dicts for the first time this"
              " is to be expected.\nOtherwise you've misspecified the path to your library!")
        return {}, {}
    except Exception as e:
        print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{e}")
        return {}, {}

    # The two dicts are written as a pair, so differing lengths means the
    # library file itself is internally inconsistent.
    if len(headersDict) != len(dataDict):
        print("Loaded differing numbers of entries for the header and data dicts.")
        print(f"{len(headersDict)} vs {len(dataDict)}")
        print("Something has gone badly wrong - your library seems corrupted!")
    else:
        print(f"Loaded {len(headersDict)} values from pickle files")

    return headersDict, dataDict

77 

78 

79def _saveToLibrary(libraryFilename, headersDict, dataDict): 

80 try: 

81 with open(libraryFilename, "wb") as dumpFile: 

82 pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL) 

83 except Exception: 

84 print("Failed to write pickle file! Here's a debugger so you don't lose all your work:") 

85 import ipdb as pdb 

86 pdb.set_trace() 

87 

88 

89def _findKeyForValue(dictionary, value, warnOnCollision=True, returnCollisions=False): 

90 listOfKeys = [k for (k, v) in dictionary.items() if v == value] 

91 if warnOnCollision and len(listOfKeys) != 1: 

92 logger.warning("Found multiple keys for value! Returning only first.") 

93 if returnCollisions: 

94 return listOfKeys 

95 return listOfKeys[0] 

96 

97 

def _hashFile(fileToHash, dataHdu, sliceToUse):
    """Hash a 2d slice of the pixel data of one HDU in an open FITS file.

    Put in place so that if hashing multiple HDUs is desired when one
    is filled with zeros it will be easy to add.

    Parameters
    ----------
    fileToHash : `astropy.io.fits.HDUList`
        The open FITS file to hash data from.
    dataHdu : `str` or `int`
        The HDU whose data section to hash.
    sliceToUse : `slice`
        Slice applied to both axes of the data array.

    Returns
    -------
    h : `str`
        Hex digest of the selected pixels.
    """
    # tobytes() replaces tostring(), which was deprecated and then removed
    # from numpy; the bytes produced are identical.
    data = fileToHash[dataHdu].data[sliceToUse, sliceToUse].tobytes()
    h = _hashData(data)
    return h

104 

105 

106def _hashData(data): 

107 h = hashlib.sha256(data).hexdigest() # hex because we want it readable in the dict 

108 return h 

109 

110 

# Digest of an all-zero 100x100 int32 array, matching the 100x100 region that
# _hashFile() digests; used to flag files whose data section is blank
# (assumes int32 pixel data - other dtypes would hash differently).
ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32))

112 

113 

def buildHashAndHeaderDicts(fileList, dataHdu='Segment00', libraryLocation=None):
    """For a list of files, build dicts of hashed data and headers.

    Data is hashed using a currently-hard-coded 100x100 region of the pixels
    i.e. file[dataHdu].data[0:100, 0:100]

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape

    dataHdu : `str` or `int`
        The HDU to use for the pixel data to hash.

    libraryLocation : `str`, optional
        Path to a pickle library; known files are loaded from it rather than
        re-read, and newly-scraped files are saved back to it.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header.

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the file's
        data section, as defined by the dataSize and dataHdu.
    """
    headersDict = {}
    dataDict = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        with fits.open(filename) as f:
            try:
                header = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    logger.warning(f"Duplicate file (or hash collision!) for files {filename} and "
                                   f"{collision}!")
                    if filecmp.cmp(filename, collision):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning("Filecmp shows files differ - "
                                       "likely just zeros for data (or a genuine hash collision!)")

                # Only commit once both the header and hash exist, so a
                # failure partway through can never leave the two dicts with
                # differing lengths (which the library loader treats as
                # corruption).
                headersDict[filename] = header
                dataDict[filename] = h
            except Exception:
                logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to this, so save it back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # have to pare these down, as library loaded could be a superset
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict}

    return headersDict, dataDict

182 

183 

def sorted(inlist, replacementValue="<BLANK VALUE>"):
    """Redefinition of sorted() to deal with blank values and str/int mixes"""
    from builtins import sorted as _sorted
    # Stringify everything so str/int mixes compare cleanly; undefined FITS
    # cards get a fixed placeholder instead of their (unstable) str() form.
    stringified = []
    for item in inlist:
        if isinstance(item, astropy.io.fits.card.Undefined):
            stringified.append(replacementValue)
        else:
            stringified.append(str(item))
    return _sorted(stringified)

191 

192 

def keyValuesSetFromFiles(fileList, keys, joinKeys, noWarn=False, printResults=True,
                          libraryLocation=None, printPerFile=False):
    """For a list of FITS files, get the set of values for the given keys.

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape

    keys : `list` of `str`
        The header keys to scrape

    joinKeys : `list` of `str`
        List of keys to concatenate when scraping, e.g. for a header with
        FILTER1 = SDSS_u and FILTER2 == NB_640nm
        this would return SDSS_u+NB_640nm
        Useful when looking for the actual set, rather than taking the product
        of all the individual values, as some combinations may never happen.

    noWarn : `bool`
        Suppress warnings about keys missing from a file's header.

    printResults : `bool`
        Print the collected value sets (and any all-zero-data files).

    libraryLocation : `str`, optional
        Path to a pickle library to speed up repeated scrapes.

    printPerFile : `bool`
        Print each key/value pair for every file as it is scraped.

    Returns
    -------
    kValues : `dict` or `None`
        Dict of sets of values found, keyed by header key; None if no keys
        were given.

    joinedValues : `set`
        Only returned when joinKeys is set: the set of joined values.
    """
    print(f"Scraping headers from {len(fileList)} files...")
    if printPerFile and (len(fileList)*len(keys) > 200):
        print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
        cont = input("Are you sure? Press y to continue, anything else to quit:")
        # startswith() guards against an empty response, which would raise
        # IndexError with cont.lower()[0]
        if not cont.lower().startswith('y'):
            sys.exit()

    headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)

    if keys:  # necessary so that -j works on its own
        kValues = {k: set() for k in keys}
    else:
        keys = []
        kValues = None

    if joinKeys:
        joinedValues = set()

    for filename in headerDict.keys():
        header = headerDict[filename]
        for key in keys:
            if key in header:
                kValues[key].add(header[key])
                if printPerFile:
                    print(f"{filename}\t{key}\t{header[key]}")
                    if len(keys) > 1 and key == keys[-1]:
                        # newline between files if multikey
                        print()
            else:
                if not noWarn:
                    logger.warning(f"{key} not found in header of {filename}")

        if joinKeys:
            # Note that CCS doesn't leave values blank, it misses the whole
            # card out for things like FILTER2 when not being used
            jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]

            # However, we do ALSO get blank cards too, so:
            # substitute <BLANK_VALUE> when there is an undefined card
            # because str(v) will give the address for each blank value
            # too, meaning each blank card looks like a different value
            joinedValues.add("+".join([str(v) if not isinstance(v, astropy.io.fits.card.Undefined)
                                       else "<BLANK_VALUE>" for v in jVals]))

    if printResults:
        # Do this first because it's messy
        zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
        if zeroFiles:
            print("\nFiles with zeros for data:")
            for filename in zeroFiles:
                print(filename)

        if kValues is not None:
            for key in kValues.keys():
                print(f"\nValues found for header key {key}:")
                print(f"{sorted(kValues[key])}")

        if joinKeys:
            print(f"\nValues found when joining {joinKeys}:")
            print(f"{sorted(joinedValues)}")

    if joinKeys:
        return kValues, joinedValues

    return kValues

278 

279 

def compareHeaders(filename1, filename2):
    """Compare the headers of two files in detail.

    First, the two files are confirmed to have the same pixel data to ensure
    the files should be being compared (by hashing the first 100x100 pixels
    in HDU 1).

    It then prints out:
        the keys that appear in A and not B
        the keys that appear in B but not A
        the keys that in common, and of those in common:
            which are the same,
            which differ,
            and where different, what the differing values are

    Parameters
    ----------
    filename1 : str
        Full path to the first of the files to compare

    filename2 : str
        Full path to the second of the files to compare
    """
    assert isinstance(filename1, str)
    assert isinstance(filename2, str)

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        # startswith() guards against an empty response, which would raise
        # IndexError with cont.lower()[0]
        if not cont.lower().startswith('y'):
            sys.exit()

    # you might think you don't want to always call sorted() on the key sets
    # BUT otherwise they seem to be returned in random order each time you run
    # and that can be crazy-making

    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys)
    commonKeys = commonKeys.intersection(h2Keys)

    keysInh1NotInh2 = sorted([_ for _ in h1Keys if _ not in h2Keys])
    keysInh2NotInh1 = sorted([_ for _ in h2Keys if _ not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sorted(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical)+len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sorted(identical)}\n")
        print("Common keys with differing values were:")
        for key in sorted(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering.
    # We have to touch the files again, which is pretty lame
    # but not doing so would require the header builder to know about
    # file pairings or return extra info, and that's not ideal either,
    # and also not worth the hassle to optimise as this is only
    # ever for a single file, not bulk file processing
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        # Collect EXTNAMEs per file independently: the previous zip-based
        # lockstep walk meant the two lists could never differ in length
        # (making the mismatch message below dead code) and silently ignored
        # extra HDUs in the longer file.
        numbering1 = [hdu.header['EXTNAME'] for hdu in f1[1:] if 'EXTNAME' in hdu.header]  # skip the PDU
        numbering2 = [hdu.header['EXTNAME'] for hdu in f2[1:] if 'EXTNAME' in hdu.header]

    if numbering1 != numbering2:
        print('\nSection numbering differs between files!')
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")
376 print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")