Coverage for python/lsst/summit/extras/headerFunctions.py: 8%

180 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-23 05:41 -0700

1# This file is part of summit_extras. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22import filecmp 

23import hashlib 

24import logging 

25import os 

26import pickle 

27import sys 

28 

29import astropy 

30import numpy as np 

31from astropy.io import fits 

32 

# redirect logger to stdout so that logger messages appear in notebooks too
# NOTE: basicConfig() configures the *root* logger, so the first time this
# module is imported it affects logging for the whole process
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
# module-level logger shared by all the functions in this file
logger = logging.getLogger("headerFunctions")

36 

37 

def loadHeaderDictsFromLibrary(libraryFilename):
    """Load the header and hash dicts from a pickle file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library file to load from

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header, exactly as if it were built by buildHashAndHeaderDicts().

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the data
        sections, exactly as if it were built by buildHashAndHeaderDicts().
        Returns a pair of empty dicts if the library cannot be loaded.
    """
    try:
        with open(libraryFilename, "rb") as fh:
            headersDict, dataDict = pickle.load(fh)

        # the two dicts are always written together, so a length mismatch
        # means the library file itself is damaged
        if len(headersDict) == len(dataDict):
            print(f"Loaded {len(headersDict)} values from pickle files")
        else:
            print("Loaded differing numbers of entries for the header and data dicts.")
            print(f"{len(headersDict)} vs {len(dataDict)}")
            print("Something has gone badly wrong - your library seems corrupted!")
    except Exception as err:
        # a missing file is the expected first-run case; anything else is
        # reported verbatim so the user can diagnose it
        if os.path.exists(libraryFilename):
            print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{err}")
        else:
            print(
                f"{libraryFilename} not found. If building the header dicts for the first time this"
                " is to be expected.\nOtherwise you've misspecified the path to you library!"
            )
        return {}, {}

    return headersDict, dataDict

77 

78 

def _saveToLibrary(libraryFilename, headersDict, dataDict):
    """Pickle the header and data dicts to the library file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library file to write to.

    headersDict : `dict`
        Dict of headers, keyed by filename, as built by
        buildHashAndHeaderDicts().

    dataDict : `dict`
        Dict of data-section hashes, keyed by filename, as built by
        buildHashAndHeaderDicts().
    """
    try:
        with open(libraryFilename, "wb") as dumpFile:
            pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL)
    except Exception:
        print("Failed to write pickle file! Here's a debugger so you don't lose all your work:")
        # prefer ipdb when available, but fall back to the stdlib debugger so
        # a missing third-party package can't mask the original failure
        try:
            import ipdb as pdb
        except ImportError:
            import pdb

        pdb.set_trace()

88 

89 

def _findKeyForValue(dictionary, value, warnOnCollision=True, returnCollisions=False):
    """Reverse-lookup: find the key(s) in a dict which map to a given value.

    Parameters
    ----------
    dictionary : `dict`
        The dictionary to search.

    value :
        The value to find the key(s) for.

    warnOnCollision : `bool`
        Log a warning if more than one key maps to the value.

    returnCollisions : `bool`
        If True, return the full list of matching keys (possibly empty)
        instead of just the first.

    Returns
    -------
    key : single key, `list` of keys, or `None`
        The first matching key, or the list of all matches when
        returnCollisions is True. Returns None when there is no match and
        returnCollisions is False (previously this raised IndexError).
    """
    matches = [k for (k, v) in dictionary.items() if v == value]
    # only warn on genuine collisions; zero matches is not a collision
    if warnOnCollision and len(matches) > 1:
        logger.warning("Found multiple keys for value! Returning only first.")
    if returnCollisions:
        return matches
    if not matches:
        return None
    return matches[0]

97 

98 

def _hashFile(fileToHash, dataHdu, sliceToUse):
    """Hash a square slice of the pixel data in one HDU of an open FITS file.

    Put in place so that if hashing multiple HDUs is desired when one
    is filled with zeros it will be easy to add.

    Parameters
    ----------
    fileToHash :
        An open FITS file (HDU list) to take the pixel data from.

    dataHdu : `str` or `int`
        The HDU whose data section should be hashed.

    sliceToUse : `slice`
        The slice applied to both axes of the data array.

    Returns
    -------
    h : `str`
        Hex digest of the selected pixel region.
    """
    # .tobytes() replaces .tostring(), which was deprecated in numpy 1.19
    # and removed in numpy 1.23
    data = fileToHash[dataHdu].data[sliceToUse, sliceToUse].tobytes()
    h = _hashData(data)
    return h

105 

106 

def _hashData(data):
    """Return the SHA-256 digest of ``data`` as a hex string.

    Hex (rather than the raw digest) is used so the hashes remain
    human-readable when stored in, and printed from, the data dicts.
    """
    return hashlib.sha256(data).hexdigest()

110 

111 

# Hash of an all-zero 100x100 int32 block: matches the hash produced for a
# file whose sampled data region is entirely zeros, so such files can be
# identified by reverse-lookup in the data dicts.
ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32))

113 

114 

def buildHashAndHeaderDicts(fileList, dataHdu="Segment00", libraryLocation=None):
    """For a list of files, build dicts of hashed data and headers.

    Data is hashed using a currently-hard-coded 100x100 region of the pixels
    i.e. file[dataHdu].data[0:100, 0:100]

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape

    dataHdu : `str` or `int`
        The HDU to use for the pixel data to hash.

    libraryLocation : `str`, optional
        Path to a pickle library of previously-scraped results. Files already
        present in the library are not re-read, and any newly-scraped results
        are saved back to it.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header.

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the file's
        data section, as defined by the dataHdu.
    """
    headersDict = {}
    dataDict = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        with fits.open(filename) as f:
            try:
                headersDict[filename] = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    # name both files involved so the collision can be chased up
                    logger.warning(
                        f"Duplicate file (or hash collision!) for files {filename} and {collision}!"
                    )
                    if filecmp.cmp(filename, collision):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning(
                            "Filecmp shows files differ - "
                            "likely just zeros for data (or a genuine hash collision!)"
                        )

                dataDict[filename] = h
            except Exception:
                # best-effort scrape: skip unreadable files but say which one
                logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to this, so save it back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # have to pare these down, as library loaded could be a superset
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict}

    return headersDict, dataDict

186 

187 

def sorted(inlist, replacementValue="<BLANK VALUE>"):
    """Redefinition of sorted() to deal with blank values and str/int mixes.

    Every element is rendered as a string (so mixed str/int lists compare
    cleanly), with astropy's Undefined blank-card values substituted by
    ``replacementValue``, before sorting with the builtin sorted().
    """
    from builtins import sorted as _sorted

    rendered = []
    for item in inlist:
        if isinstance(item, astropy.io.fits.card.Undefined):
            rendered.append(replacementValue)
        else:
            rendered.append(str(item))
    return _sorted(rendered)

197 

198 

def keyValuesSetFromFiles(
    fileList, keys, joinKeys, noWarn=False, printResults=True, libraryLocation=None, printPerFile=False
):
    """For a list of FITS files, get the set of values for the given keys.

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape

    keys : `list` of `str`
        The header keys to scrape

    joinKeys : `list` of `str`
        List of keys to concatenate when scraping, e.g. for a header with
        FILTER1 = SDSS_u and FILTER2 == NB_640nm
        this would return SDSS_u+NB_640nm
        Useful when looking for the actual set, rather than taking the product
        of all the individual values, as some combinations may never happen.

    noWarn : `bool`
        Suppress the warning when a key is missing from a file's header.

    printResults : `bool`
        Print the collected value sets (and any files with all-zero data).

    libraryLocation : `str`, optional
        Path to a pickle library, passed through to buildHashAndHeaderDicts().

    printPerFile : `bool`
        Print each filename/key/value as it is scraped.

    Returns
    -------
    kValues : `dict` or `None`
        Dict of sets of the values found, keyed by header key, or None when
        no keys were supplied.

    joinedValues : `set`
        The set of joined values; only returned when joinKeys is given.
    """
    print(f"Scraping headers from {len(fileList)} files...")
    if printPerFile and (len(fileList) * len(keys) > 200):
        print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
        cont = input("Are you sure? Press y to continue, anything else to quit:")
        # startswith avoids an IndexError when the user just hits return
        if not cont.lower().startswith("y"):
            sys.exit()

    headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)

    if keys:  # necessary so that -j works on its own
        kValues = {k: set() for k in keys}
    else:
        keys = []
        kValues = None

    if joinKeys:
        joinedValues = set()

    for filename in headerDict.keys():
        header = headerDict[filename]
        for key in keys:
            if key in header:
                kValues[key].add(header[key])
                if printPerFile:
                    print(f"{filename}\t{key}\t{header[key]}")
                    if len(keys) > 1 and key == keys[-1]:
                        # newline between files if multikey
                        print()
            else:
                if not noWarn:
                    logger.warning(f"{key} not found in header of {filename}")

        if joinKeys:
            # Note that CCS doesn't leave values blank, it misses the whole
            # card out for things like FILTER2 when not being used
            jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]

            # However, we do ALSO get blank cards too, so:
            # substitute <BLANK_VALUE> when there is an undefined card
            # because str(v) will give the address for each blank value
            # too, meaning each blank card looks like a different value
            joinedValues.add(
                "+".join(
                    [
                        str(v) if not isinstance(v, astropy.io.fits.card.Undefined) else "<BLANK_VALUE>"
                        for v in jVals
                    ]
                )
            )

    if printResults:
        # Do this first because it's messy
        zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
        if zeroFiles:
            print("\nFiles with zeros for data:")
            for zeroFile in zeroFiles:
                print(zeroFile)

        if kValues is not None:
            for key in kValues.keys():
                print(f"\nValues found for header key {key}:")
                print(f"{sorted(kValues[key])}")

        if joinKeys:
            print(f"\nValues found when joining {joinKeys}:")
            print(f"{sorted(joinedValues)}")

    if joinKeys:
        return kValues, joinedValues

    return kValues

291 

292 

def compareHeaders(filename1, filename2):
    """Compare the headers of two files in detail.

    First, the two files are confirmed to have the same pixel data to ensure
    the files should be being compared (by hashing the first 100x100 pixels
    in HDU 1).

    It then prints out:
        the keys that appear in A and not B
        the keys that appear in B but not A
        the keys that are in common, and of those in common:
            which are the same,
            which differ,
            and where different, what the differing values are

    Parameters
    ----------
    filename1 : `str`
        Full path to the first of the files to compare

    filename2 : `str`
        Full path to the second of the files to compare

    Raises
    ------
    TypeError
        Raised if either filename is not a string.
    """
    # raise rather than assert so the check survives running under python -O
    if not isinstance(filename1, str) or not isinstance(filename2, str):
        raise TypeError("filename1 and filename2 must both be strings")

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        # startswith avoids an IndexError when the user just hits return
        if not cont.lower().startswith("y"):
            sys.exit()

    # you might think you don't want to always call sorted() on the key sets
    # BUT otherwise they seem to be returned in random order each time you run
    # and that can be crazy-making

    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys)
    commonKeys = commonKeys.intersection(h2Keys)

    keysInh1NotInh2 = sorted([_ for _ in h1Keys if _ not in h2Keys])
    keysInh2NotInh1 = sorted([_ for _ in h2Keys if _ not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sorted(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical) + len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sorted(identical)}\n")
        print("Common keys with differing values were:")
        for key in sorted(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering.
    # We have to touch the files again, which is pretty lame
    # but not doing so would require the header builder to know about
    # file pairings or return extra info, and that's not ideal either,
    # and also not worth the hassle to optimise as this is only
    # ever for a single file, not bulk file processing
    numbering1, numbering2 = [], []
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        for hduF1, hduF2 in zip(f1[1:], f2[1:]):  # skip the PDU
            if "EXTNAME" in hduF1.header and "EXTNAME" in hduF2.header:
                numbering1.append(hduF1.header["EXTNAME"])
                numbering2.append(hduF2.header["EXTNAME"])

    if numbering1 != numbering2:
        print("\nSection numbering differs between files!")
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")