Coverage for python / lsst / summit / extras / headerFunctions.py: 9%

181 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-06 09:16 +0000

1# This file is part of summit_extras. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22import filecmp 

23import hashlib 

24import logging 

25import os 

26import pickle 

27import sys 

28from typing import Any 

29 

30import astropy 

31import numpy as np 

32from astropy.io import fits 

33 

# Redirect logging to stdout so that logger messages appear in notebooks too
# (the default StreamHandler writes to stderr, which notebooks render as errors).
# NOTE(review): basicConfig configures the root logger as an import side effect
# of this module — confirm that is intended for library (non-script) use.
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger("headerFunctions")

37 

38 

39def loadHeaderDictsFromLibrary(libraryFilename: str) -> tuple[dict, dict]: 

40 """Load cached header and hash dicts from a pickle library file. 

41 

42 Parameters 

43 ---------- 

44 libraryFilename : `str` 

45 Path of the library pickle file to load from. 

46 

47 Returns 

48 ------- 

49 headersDict : `dict` 

50 A dict, keyed by filename, with values being the full primary 

51 header, in the same format built by `buildHashAndHeaderDicts`. 

52 Empty if the file does not exist or fails to load. 

53 dataDict : `dict` 

54 A dict, keyed by filename, with values being hashes of the data 

55 sections, in the same format built by `buildHashAndHeaderDicts`. 

56 Empty if the file does not exist or fails to load. 

57 """ 

58 try: 

59 with open(libraryFilename, "rb") as pickleFile: 

60 headersDict, dataDict = pickle.load(pickleFile) 

61 

62 if len(headersDict) != len(dataDict): 

63 print("Loaded differing numbers of entries for the header and data dicts.") 

64 print(f"{len(headersDict)} vs {len(dataDict)}") 

65 print("Something has gone badly wrong - your library seems corrupted!") 

66 else: 

67 print(f"Loaded {len(headersDict)} values from pickle files") 

68 except Exception as e: 

69 if not os.path.exists(libraryFilename): 

70 print( 

71 f"{libraryFilename} not found. If building the header dicts for the first time this" 

72 " is to be expected.\nOtherwise you've misspecified the path to you library!" 

73 ) 

74 else: 

75 print(f"Something more sinister went wrong loading headers from {libraryFilename}:\n{e}") 

76 return {}, {} 

77 

78 return headersDict, dataDict 

79 

80 

81def _saveToLibrary(libraryFilename: str, headersDict: dict, dataDict: dict) -> None: 

82 """Persist header and data-hash dicts to a pickle library file. 

83 

84 Parameters 

85 ---------- 

86 libraryFilename : `str` 

87 Path of the library pickle file to write. 

88 headersDict : `dict` 

89 Dict of primary headers keyed by filename. 

90 dataDict : `dict` 

91 Dict of data-section hashes keyed by filename. 

92 """ 

93 try: 

94 with open(libraryFilename, "wb") as dumpFile: 

95 pickle.dump((headersDict, dataDict), dumpFile, pickle.HIGHEST_PROTOCOL) 

96 except Exception: 

97 print("Failed to write pickle file! Here's a debugger so you don't lose all your work:") 

98 import ipdb as pdb 

99 

100 pdb.set_trace() 

101 

102 

103def _findKeyForValue( 

104 dictionary: dict, value: Any, warnOnCollision: bool = True, returnCollisions: bool = False 

105) -> Any: 

106 """Return the key or keys that map to a given value. 

107 

108 Parameters 

109 ---------- 

110 dictionary : `dict` 

111 The dictionary to search. 

112 value : `object` 

113 The value to look up. 

114 warnOnCollision : `bool`, optional 

115 If `True`, log a warning when the value is held by more than one 

116 key. 

117 returnCollisions : `bool`, optional 

118 If `True`, return the full list of matching keys. Otherwise 

119 return only the first match. 

120 

121 Returns 

122 ------- 

123 keys : `object` or `list` 

124 A single matching key, or the full list of matching keys if 

125 ``returnCollisions`` is `True`. 

126 """ 

127 listOfKeys = [k for (k, v) in dictionary.items() if v == value] 

128 if warnOnCollision and len(listOfKeys) > 1: 

129 logger.warning("Found multiple keys for value! Returning only first.") 

130 if returnCollisions: 

131 return listOfKeys 

132 if not listOfKeys: 

133 return None 

134 return listOfKeys[0] 

135 

136 

def _hashFile(fileToHash: Any, dataHdu: int | str, sliceToUse: slice) -> str:
    """Hash a square corner region of one HDU's pixel array.

    Kept as a thin wrapper around `_hashData` so that hashing several
    HDUs (e.g. to cope with all-zero extensions) stays easy to add.

    Parameters
    ----------
    fileToHash : `astropy.io.fits.HDUList`
        The opened FITS file.
    dataHdu : `int` or `str`
        Index or extension name of the HDU holding the pixel data.
    sliceToUse : `slice`
        Slice applied to both axes of the data array before hashing.

    Returns
    -------
    hashStr : `str`
        Hex-encoded SHA-256 digest of the sliced data.
    """
    # Fix the dtype before hashing: an all-zero region must always map
    # onto ZERO_HASH, whatever the HDU's native pixel type is.
    region = fileToHash[dataHdu].data[sliceToUse, sliceToUse]
    return _hashData(region.astype(np.int32).tobytes())

162 

163 

164def _hashData(data: np.ndarray) -> str: 

165 """Return a hex SHA-256 hash of the given array's bytes. 

166 

167 Parameters 

168 ---------- 

169 data : `numpy.ndarray` or `bytes` 

170 The data to hash. 

171 

172 Returns 

173 ------- 

174 hashStr : `str` 

175 Hex-encoded SHA-256 digest, suitable for storing in a dict. 

176 """ 

177 h = hashlib.sha256(data).hexdigest() # hex because we want it readable in the dict 

178 return h 

179 

180 

# Sentinel hash: what _hashFile produces for a 100x100 region of pure
# zeros (the int32 cast there makes this independent of the pixel dtype).
ZERO_HASH = _hashData(np.zeros((100, 100), dtype=np.int32))

182 

183 

def buildHashAndHeaderDicts(
    fileList: list[str], dataHdu: int | str = "Segment00", libraryLocation: str | None = None
) -> tuple[dict, dict]:
    """Build dicts of primary headers and data-section hashes.

    Data is hashed using the hard-coded 100x100 corner of the pixel
    array, i.e. ``file[dataHdu].data[0:100, 0:100]``. If a library
    location is supplied, existing entries are loaded from there and
    newly computed entries are written back.

    Parameters
    ----------
    fileList : `list` [`str`]
        The fully-specified paths of the files to scrape.
    dataHdu : `int` or `str`, optional
        The HDU to use for the pixel data to hash.
    libraryLocation : `str`, optional
        Path to a pickle file used to cache results across runs. If
        provided, existing entries are loaded before processing and
        newly computed entries are written back.

    Returns
    -------
    headersDict : `dict`
        A dict, keyed by filename, with values being the full primary
        header.
    dataDict : `dict`
        A dict, keyed by filename, with values being hashes of the
        file's data section.
    """
    headersDict: dict[str, Any] = {}
    dataDict: dict[str, str] = {}

    if libraryLocation:
        headersDict, dataDict = loadHeaderDictsFromLibrary(libraryLocation)

    # don't load files we already know about from the library
    filesToLoad = [f for f in fileList if f not in headersDict]

    s = slice(0, 100)
    for filenum, filename in enumerate(filesToLoad):
        if len(filesToLoad) > 1000 and filenum % 1000 == 0:
            if libraryLocation:
                logger.info(f"Processed {filenum} of {len(filesToLoad)} files not loaded from library...")
            else:
                logger.info(f"Processed {filenum} of {len(fileList)} files...")
        with fits.open(filename) as f:
            try:
                headersDict[filename] = f[0].header
                h = _hashFile(f, dataHdu, s)
                if h in dataDict.values():
                    collision = _findKeyForValue(dataDict, h, warnOnCollision=False)
                    logger.warning(
                        f"Duplicate file (or hash collision!) for files {filename} and {collision}!"
                    )
                    # shallow=False forces a byte-for-byte comparison; the
                    # default os.stat()-based check could report different
                    # files as "identical".
                    if filecmp.cmp(filename, collision, shallow=False):
                        logger.warning("Filecmp shows files are identical")
                    else:
                        logger.warning(
                            "Filecmp shows files differ - "
                            "likely just zeros for data (or a genuine hash collision!)"
                        )

                dataDict[filename] = h
            except Exception:
                # Remove any half-added header entry so the two dicts stay
                # the same length - loadHeaderDictsFromLibrary treats a
                # length mismatch as a corrupted library.
                headersDict.pop(filename, None)
                logger.warning(f"Failed to load {filename} - file is likely corrupted.")

    # we have always added to this, so save it back over the original
    if libraryLocation and len(filesToLoad) > 0:
        _saveToLibrary(libraryLocation, headersDict, dataDict)

    # have to pare these down, as the library loaded could be a superset
    headersDict = {k: headersDict[k] for k in fileList if k in headersDict}
    dataDict = {k: dataDict[k] for k in fileList if k in dataDict}

    return headersDict, dataDict

260 

261 

def sortedAsStrings(inlist: set | list, replacementValue: str = "<BLANK VALUE>") -> list:
    """Sort a list, coercing all values to strings and handling blanks.

    Replaces any ``astropy.io.fits.card.Undefined`` entries with
    ``replacementValue`` so they sort consistently alongside mixed
    string and integer values.

    Parameters
    ----------
    inlist : `set` or `list`
        Values to sort.
    replacementValue : `str`, optional
        String used in place of undefined FITS card values.

    Returns
    -------
    output : `list` [`str`]
        Sorted list of string values.
    """
    asStrings = []
    for item in inlist:
        if isinstance(item, astropy.io.fits.card.Undefined):
            asStrings.append(replacementValue)
        else:
            asStrings.append(str(item))
    return sorted(asStrings)

285 

286 

def keyValuesSetFromFiles(
    fileList: list[str],
    keys: list[str],
    joinKeys: list[str],
    noWarn: bool = False,
    printResults: bool = True,
    libraryLocation: str | None = None,
    printPerFile: bool = False,
) -> Any:
    """Get the set of values seen for a set of header keys across files.

    Parameters
    ----------
    fileList : `list` [`str`]
        The fully-specified paths of the files to scrape.
    keys : `list` [`str`]
        The header keys to scrape.
    joinKeys : `list` [`str`]
        List of keys to concatenate with ``+`` when scraping. For
        example, a header with ``FILTER1 = SDSS_u`` and
        ``FILTER2 = NB_640nm`` yields ``SDSS_u+NB_640nm``. Useful when
        looking for the actually observed combinations rather than the
        Cartesian product of the individual sets. Pass an empty list to
        skip join processing.
    noWarn : `bool`, optional
        If `True`, suppress warnings about keys missing from a header.
    printResults : `bool`, optional
        If `True`, print the collected values (and files with all-zero
        data) to stdout.
    libraryLocation : `str`, optional
        Path to a cached header library; passed through to
        `buildHashAndHeaderDicts`.
    printPerFile : `bool`, optional
        If `True`, print each ``filename<tab>key<tab>value`` triple as
        it is scraped. Prompts for confirmation when the output would
        exceed 200 lines.

    Returns
    -------
    kValues : `dict` [`str`, `set`] or `None`
        Mapping of each requested key to the set of values seen. `None`
        if ``keys`` is empty.
    joinedValues : `set` [`str`]
        Set of joined value strings. Only returned if ``joinKeys`` is
        non-empty.
    """
    print(f"Scraping headers from {len(fileList)} files...")
    if printPerFile and (len(fileList) * len(keys) > 200):
        print(f"You asked to print headers per-file, for {len(fileList)} files x {len(keys)} keys.")
        cont = input("Are you sure? Press y to continue, anything else to quit:")
        if not cont or cont.lower()[0] != "y":
            # sys.exit rather than the site-module exit(), which is only
            # guaranteed to exist in interactive sessions
            sys.exit()

    headerDict, hashDict = buildHashAndHeaderDicts(fileList, libraryLocation=libraryLocation)

    kValues: dict[str, set[Any]] | None
    if keys:  # necessary so that -j works on its own
        kValues = {k: set() for k in keys}
    else:
        keys = []
        kValues = None

    joinedValues: set[str] = set()

    for filename in headerDict.keys():
        header = headerDict[filename]
        for key in keys:
            if key in header:
                assert kValues is not None
                kValues[key].add(header[key])
                if printPerFile:
                    print(f"{filename}\t{key}\t{header[key]}")
                    if len(keys) > 1 and key == keys[-1]:
                        # newline between files if multikey
                        print()
            else:
                if not noWarn:
                    logger.warning(f"{key} not found in header of {filename}")

        if joinKeys:
            # Note that CCS doesn't leave values blank, it misses the whole
            # card out for things like FILTER2 when not being used
            jVals = [header[k] if k in header else "<missing card>" for k in joinKeys]

            # However, we do ALSO get blank cards too, so:
            # substitute <BLANK_VALUE> when there is an undefined card
            # because str(v) will give the address for each blank value
            # too, meaning each blank card looks like a different value
            joinedValues.add(
                "+".join(
                    [
                        str(v) if not isinstance(v, astropy.io.fits.card.Undefined) else "<BLANK_VALUE>"
                        for v in jVals
                    ]
                )
            )

    if printResults:
        # Do this first because it's messy
        zeroFiles = _findKeyForValue(hashDict, ZERO_HASH, warnOnCollision=False, returnCollisions=True)
        if zeroFiles:
            print("\nFiles with zeros for data:")
            for filename in zeroFiles:
                print(filename)

        if kValues is not None:
            for key in kValues.keys():
                print(f"\nValues found for header key {key}:")
                print(f"{sortedAsStrings(kValues[key])}")

        if joinKeys:
            print(f"\nValues found when joining {joinKeys}:")
            print(f"{sortedAsStrings(joinedValues)}")

    if joinKeys:
        return kValues, joinedValues

    return kValues

406 

407 

def compareHeaders(filename1: str, filename2: str) -> None:
    """Compare the headers of two FITS files in detail.

    First, the two files are confirmed to have the same pixel data (by
    hashing the first 100x100 pixels in HDU 1) to ensure the files
    really should be compared.

    The function then prints:

    - keys that appear in A but not in B
    - keys that appear in B but not in A
    - keys common to both files, split into those with identical
      values and those whose values differ (showing the differing
      values side-by-side)
    - whether the post-PDU HDU ``EXTNAME`` ordering differs between
      the two files

    Parameters
    ----------
    filename1 : `str`
        Full path to the first file to compare.
    filename2 : `str`
        Full path to the second file to compare.
    """
    assert isinstance(filename1, str)
    assert isinstance(filename2, str)

    headerDict1, hashDict1 = buildHashAndHeaderDicts([filename1])
    headerDict2, hashDict2 = buildHashAndHeaderDicts([filename2])

    if hashDict1[filename1] != hashDict2[filename2]:
        print("Pixel data was not the same - did you really mean to compare these files?")
        print(f"{filename1}\n{filename2}")
        cont = input("Press y to continue, anything else to quit:")
        if not cont or cont.lower()[0] != "y":
            # sys.exit rather than the site-module exit(), which is only
            # guaranteed to exist in interactive sessions
            sys.exit()

    # you might think you don't want to always call sorted() on the key sets
    # BUT otherwise they seem to be returned in random order each time you run
    # and that can be crazy-making

    h1 = headerDict1[filename1]
    h2 = headerDict2[filename2]
    h1Keys = list(h1.keys())
    h2Keys = list(h2.keys())

    commonKeys = set(h1Keys)
    commonKeys = commonKeys.intersection(h2Keys)

    keysInh1NotInh2 = sortedAsStrings([_ for _ in h1Keys if _ not in h2Keys])
    keysInh2NotInh1 = sortedAsStrings([_ for _ in h2Keys if _ not in h1Keys])

    print(f"Keys in {filename1} not in {filename2}:\n{keysInh1NotInh2}\n")
    print(f"Keys in {filename2} not in {filename1}:\n{keysInh2NotInh1}\n")
    print(f"Keys in common:\n{sortedAsStrings(commonKeys)}\n")

    # put in lists so we can output neatly rather than interleaving
    identical = []
    differing = []
    for key in commonKeys:
        if h1[key] == h2[key]:
            identical.append(key)
        else:
            differing.append(key)

    assert len(identical) + len(differing) == len(commonKeys)

    if len(identical) == len(commonKeys):
        print("All keys in common have identical values :)")
    else:
        print("Of the common keys, the following had identical values:")
        print(f"{sortedAsStrings(identical)}\n")
        print("Common keys with differing values were:")
        for key in sortedAsStrings(differing):
            d = "<blank card>".ljust(25)
            v1 = str(h1[key]).ljust(25) if not isinstance(h1[key], astropy.io.fits.card.Undefined) else d
            v2 = str(h2[key]).ljust(25) if not isinstance(h2[key], astropy.io.fits.card.Undefined) else d
            print(f"{key.ljust(8)}: {v1} vs {v2}")

    # Finally, check the extension naming has the same ordering.
    # We have to touch the files again, which is pretty lame
    # but not doing so would require the header builder to know about
    # file pairings or return extra info, and that's not ideal either,
    # and also not worth the hassle to optimise as this is only
    # ever for a single file, not bulk file processing.
    # Collect EXTNAMEs per-file independently: the previous zip-based
    # walk only appended when BOTH paired HDUs had an EXTNAME, so the
    # two lists could never differ in length (the check below was dead
    # code) and any HDUs beyond the shorter file were silently dropped.
    numbering1, numbering2 = [], []
    with fits.open(filename1) as f1, fits.open(filename2) as f2:
        for hduF1 in f1[1:]:  # skip the PDU
            if "EXTNAME" in hduF1.header:
                numbering1.append(hduF1.header["EXTNAME"])
        for hduF2 in f2[1:]:  # skip the PDU
            if "EXTNAME" in hduF2.header:
                numbering2.append(hduF2.header["EXTNAME"])

    if numbering1 != numbering2:
        print("\nSection numbering differs between files!")
        for s1, s2 in zip(numbering1, numbering2):
            print(f"{s1.ljust(12)} vs {s2.ljust(12)}")
        if len(numbering1) != len(numbering2):
            print("The length of those lists was also DIFFERENT! Presumably a non-image HDU was interleaved.")