Coverage for python/lsst/summit/extras/headerFunctions.py: 8%

181 statements

coverage.py v7.5.0, created at 2024-05-03 12:39 +0000

# This file is part of summit_extras.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import filecmp
import hashlib
import logging
import os
import pickle
import sys
from typing import Any

import astropy
import numpy as np
from astropy.io import fits

# redirect logger to stdout so that logger messages appear in notebooks too
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger("headerFunctions")

def loadHeaderDictsFromLibrary(libraryFilename: str) -> tuple[dict, dict]:
    """Load the header and hash dicts from a pickle file.

    Parameters
    ----------
    libraryFilename : `str`
        Path of the library file to load from.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header, exactly as if it were built by buildHashAndHeaderDicts().

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the data
        sections, exactly as if it were built by buildHashAndHeaderDicts().
    """
    try:
        with open(libraryFilename, "rb") as pickleFile:
            headersDict, dataDict = pickle.load(pickleFile)

        if len(headersDict) != len(dataDict):
            print("Loaded differing numbers of entries for the header and data dicts.")
            print(f"{len(headersDict)} vs {len(dataDict)}")
            print("Something has gone badly wrong - your library seems corrupted!")
        else:
            print(f"Loaded {len(headersDict)} values from pickle files")
    except Exception as e:
        if not os.path.exists(libraryFilename):
            print(
                f"{libraryFilename} not found. If building the header dicts for the first time this"
                " is to be expected.\nOtherwise you've misspecified the path to your library!"
            )
        else:
            print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{e}")
        return {}, {}

    return headersDict, dataDict
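
# A minimal usage sketch for the loader above (the library path is
# hypothetical). Note that a missing file is not fatal - empty dicts are
# returned so that a new library can be built from scratch:
#
#     headersDict, dataDict = loadHeaderDictsFromLibrary("/path/to/headerLibrary.pkl")
#     for filename, dataHash in dataDict.items():
#         print(filename, dataHash)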

def _saveToLibrary(libraryFilename: str, headersDict: dict, dataDict: dict) -> None:
    try:
        with open(libraryFilename, "wb") as dumpFile:
            pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL)
    except Exception:
        print("Failed to write pickle file! Here's a debugger so you don't lose all your work:")
        try:
            import ipdb as pdb
        except ImportError:  # fall back to the stdlib debugger if ipdb isn't installed
            import pdb

        pdb.set_trace()

def _findKeyForValue(
    dictionary: dict, value: Any, warnOnCollision: bool = True, returnCollisions: bool = False
) -> Any:
    """Find the key(s) in the dictionary whose value equals ``value``."""
    listOfKeys = [k for (k, v) in dictionary.items() if v == value]
    if warnOnCollision and len(listOfKeys) > 1:
        logger.warning("Found multiple keys for value! Returning only the first.")
    if returnCollisions:
        return listOfKeys
    return listOfKeys[0] if listOfKeys else None
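
# Illustrative sketch of the reverse lookup above (dict contents are made up):
#
#     d = {"a.fits": "hash1", "b.fits": "hash2", "c.fits": "hash1"}
#     _findKeyForValue(d, "hash2")                         # -> "b.fits"
#     _findKeyForValue(d, "hash1", returnCollisions=True)  # -> ["a.fits", "c.fits"]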

def _hashFile(fileToHash: fits.HDUList, dataHdu: int | str, sliceToUse: slice) -> str:
    """Hash a slice of the data in a single HDU.

    Split out as a helper so that hashing multiple HDUs will be easy to add,
    should that be desired when one HDU is filled with zeros.
    """
    # tobytes() rather than the long-deprecated tostring(), which was removed
    # in numpy 2.0
    data = fileToHash[dataHdu].data[sliceToUse, sliceToUse].tobytes()
    h = _hashData(data)
    return h


def _hashData(data: bytes | np.ndarray) -> str:
    h = hashlib.sha256(data).hexdigest()  # hex because we want it readable in the dict
    return h


ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32))
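
# Sketch of how the hashing helpers combine (the path and HDU are
# hypothetical): _hashFile() hashes the raw bytes of a 100x100 corner of the
# chosen HDU, so an all-zeros int32 data section hashes to ZERO_HASH:
#
#     with fits.open("/path/to/image.fits") as f:
#         h = _hashFile(f, dataHdu=1, sliceToUse=slice(0, 100))
#     if h == ZERO_HASH:
#         print("data section appears to be all zeros")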

def buildHashAndHeaderDicts(
    fileList: list[str], dataHdu: int | str = "Segment00", libraryLocation: str | None = None
) -> tuple[dict, dict]:
    """For a list of files, build dicts of hashed data and headers.

    Data is hashed using a currently-hard-coded 100x100 region of the pixels,
    i.e. file[dataHdu].data[0:100, 0:100].

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape.

    dataHdu : `str` or `int`
        The HDU to use for the pixel data to hash.

    libraryLocation : `str`, optional
        Path to a pickled library of previously-scraped files; entries found
        there are reused rather than re-read, and new entries are saved back.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with the values being the full primary
        header.

    dataDict : `dict`
        A dict, keyed by filename, with the values being hashes of the file's
        data section, as defined by the hard-coded 100x100 slice and dataHdu.
    """
    headersDict = {}
    dataDict = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict.keys()]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        with fits.open(filename) as f:
            try:
                headersDict[filename] = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    logger.warning(
                        f"Duplicate file (or hash collision!) for files {filename} and {collision}!"
                    )
                    if filecmp.cmp(filename, collision):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning(
                            "Filecmp shows files differ - "
                            "likely just zeros for data (or a genuine hash collision!)"
                        )

                dataDict[filename] = h
            except Exception:
                logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to this, so save it back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # have to pare these down, as the library loaded could be a superset
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict.keys()}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict.keys()}

    return headersDict, dataDict
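
# Usage sketch (file paths are hypothetical). With libraryLocation given,
# files already in the pickled library are not re-read from disk, and any
# newly-scraped files are saved back to it:
#
#     files = ["/data/image1.fits", "/data/image2.fits"]
#     headers, hashes = buildHashAndHeaderDicts(files, libraryLocation="/path/to/headerLibrary.pkl")
#     duplicated = len(set(hashes.values())) < len(hashes)  # any repeated pixel data?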

def sorted(inlist: list, replacementValue: str = "<BLANK VALUE>") -> list:
    """Redefinition of sorted() to deal with blank values and str/int mixes."""
    from builtins import sorted as _sorted

    output = [
        str(x) if not isinstance(x, astropy.io.fits.card.Undefined) else replacementValue for x in inlist
    ]
    output = _sorted(output)
    return output
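
# Example of why the redefinition above is needed: builtins.sorted() raises
# TypeError on str/int mixes, and blank (Undefined) cards stringify to unique
# object reprs. The values here are illustrative:
#
#     from astropy.io.fits.card import Undefined
#     sorted([3, "SDSS_u", 1, Undefined()])
#     # -> ['1', '3', '<BLANK VALUE>', 'SDSS_u']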

def keyValuesSetFromFiles(
    fileList: list[str],
    keys: list[str],
    joinKeys: list[str],
    noWarn: bool = False,
    printResults: bool = True,
    libraryLocation: str | None = None,
    printPerFile: bool = False,
) -> dict | tuple[dict | None, set] | None:
    """For a list of FITS files, get the set of values for the given keys.

    Parameters
    ----------
    fileList : `list` of `str`
        The fully-specified paths of the files to scrape.

    keys : `list` of `str`
        The header keys to scrape.

    joinKeys : `list` of `str`
        List of keys to concatenate when scraping, e.g. for a header with
        FILTER1 = SDSS_u and FILTER2 = NB_640nm
        this would return SDSS_u+NB_640nm.
        Useful when looking for the actual set, rather than taking the product
        of all the individual values, as some combinations may never happen.

    noWarn : `bool`
        Suppress the warning for keys missing from a file's header.

    printResults : `bool`
        Print a summary of the values found, plus any files with zeros for
        data, once scraping is complete.

    libraryLocation : `str`, optional
        Path to a pickled library, passed through to buildHashAndHeaderDicts().

    printPerFile : `bool`
        Print each key's value for every file as it is scraped.

    Returns
    -------
    kValues : `dict` or `None`
        A dict of sets, keyed by header key, of the values found, or `None`
        if no keys were given.

    joinedValues : `set`, only returned if joinKeys was given
        The set of joined values found for the joinKeys.
    """
    print(f"Scraping headers from {len(fileList)} files...")
    if printPerFile and (len(fileList) * len(keys) > 200):
        print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
        cont = input("Are you sure? Press y to continue, anything else to quit:")
        if not cont.lower().startswith("y"):
            sys.exit()

    headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)

    if keys:  # necessary so that -j works on its own
        kValues = {k: set() for k in keys}
    else:
        keys = []
        kValues = None

    if joinKeys:
        joinedValues = set()

    for filename in headerDict.keys():
        header = headerDict[filename]
        for key in keys:
            if key in header:
                kValues[key].add(header[key])
                if printPerFile:
                    print(f"{filename}\t{key}\t{header[key]}")
                    if len(keys) > 1 and key == keys[-1]:
                        # newline between files if multikey
                        print()
            else:
                if not noWarn:
                    logger.warning(f"{key} not found in header of {filename}")

        if joinKeys:
            # Note that CCS doesn't leave values blank, it leaves the whole
            # card out for things like FILTER2 when not being used
            jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]

            # However, we ALSO get blank cards too, so:
            # substitute <BLANK_VALUE> when there is an undefined card,
            # because str(v) would give the address for each blank value,
            # meaning each blank card would look like a different value
            joinedValues.add(
                "+".join(
                    [
                        str(v) if not isinstance(v, astropy.io.fits.card.Undefined) else "<BLANK_VALUE>"
                        for v in jVals
                    ]
                )
            )

    if printResults:
        # Do this first because it's messy
        zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
        if zeroFiles:
            print("\nFiles with zeros for data:")
            for filename in zeroFiles:
                print(f"{filename}")

        if kValues is not None:
            for key in kValues.keys():
                print(f"\nValues found for header key {key}:")
                print(f"{sorted(kValues[key])}")

        if joinKeys:
            print(f"\nValues found when joining {joinKeys}:")
            print(f"{sorted(joinedValues)}")

    if joinKeys:
        return kValues, joinedValues

    return kValues
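
# Usage sketch (paths and keys are illustrative). This is the main entry
# point for scraping: it reports the set of values seen for each key, plus
# the set of joined key combinations actually used:
#
#     kValues, joined = keyValuesSetFromFiles(
#         fileList=["/data/image1.fits", "/data/image2.fits"],
#         keys=["EXPTIME", "OBJECT"],
#         joinKeys=["FILTER1", "FILTER2"],
#     )
#     # joined might look like {"SDSS_u+NB_640nm", "SDSS_g+<missing card>"}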

def compareHeaders(filename1: str, filename2: str) -> None:
    """Compare the headers of two files in detail.

    First, the two files are confirmed to have the same pixel data, to ensure
    the files really should be compared (by hashing the first 100x100 pixels
    of the default data HDU).

    It then prints out:
        the keys that appear in A and not B
        the keys that appear in B and not A
        the keys that are in common, and of those in common:
            which are the same,
            which differ,
            and where different, what the differing values are

    Parameters
    ----------
    filename1 : `str`
        Full path to the first of the files to compare.

    filename2 : `str`
        Full path to the second of the files to compare.
    """
    assert isinstance(filename1, str)
    assert isinstance(filename2, str)

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        if not cont.lower().startswith("y"):
            sys.exit()

    # you might think you don't want to always call sorted() on the key sets,
    # BUT otherwise they seem to be returned in random order each time you run,
    # and that can be crazy-making

    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys)
    commonKeys = commonKeys.intersection(h2Keys)

    keysInh1NotInh2 = sorted([_ for _ in h1Keys if _ not in h2Keys])
    keysInh2NotInh1 = sorted([_ for _ in h2Keys if _ not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sorted(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical) + len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sorted(identical)}\n")
        print("Common keys with differing values were:")
        for key in sorted(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering.
    # We have to touch the files again, which is pretty lame,
    # but not doing so would require the header builder to know about
    # file pairings or return extra info, and that's not ideal either,
    # and also not worth the hassle to optimise as this is only
    # ever for a single pair of files, not bulk file processing
    numbering1, numbering2 = [], []
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        for hduF1, hduF2 in zip(f1[1:], f2[1:]):  # skip the PDU
            # append independently, so that an EXTNAME missing from only one
            # file shows up as a length difference below
            if "EXTNAME" in hduF1.header:
                numbering1.append(hduF1.header["EXTNAME"])
            if "EXTNAME" in hduF2.header:
                numbering2.append(hduF2.header["EXTNAME"])

    if numbering1 != numbering2:
        print("\nSection numbering differs between files!")
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")
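
# Usage sketch (paths are hypothetical). Intended for two copies of the same
# exposure, to see exactly which header cards differ between them:
#
#     compareHeaders("/data/image_v1.fits", "/data/image_v2.fits")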