Coverage for python/astro_metadata_translator/headers.py: 9%

188 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-21 10:02 +0000

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12"""Code to support header manipulation operations.""" 

13 

14from __future__ import annotations 

15 

16__all__ = ("merge_headers", "fix_header") 

17 

18import copy 

19import datetime 

20import importlib.resources as resources 

21import itertools 

22import logging 

23import os 

24import posixpath 

25from collections import Counter 

26from collections.abc import Mapping, MutableMapping, Sequence 

27from typing import IO, Any 

28 

29import yaml 

30 

31from .translator import MetadataTranslator 

32from .translators import FitsTranslator 

33 

# Module-level logger; all diagnostics in this module go through it.
log = logging.getLogger(__name__)

ENV_VAR_NAME = "METADATA_CORRECTIONS_PATH"
"""Name of environment variable containing search path for header fix up."""

# Root used for all provenance keywords written by fix_header().
HIERARCH = "HIERARCH ASTRO METADATA FIX"
"""FITS-style hierarchical keyword root."""

# Presence of this keyword short-circuits fix_header() on subsequent calls.
FIXUP_SENTINEL = HIERARCH + " MODIFIED"
"""Keyword to add to header when header has been fixed."""

45 

46def merge_headers( 

47 headers: Sequence[MutableMapping[str, Any]], 

48 mode: str = "overwrite", 

49 sort: bool = False, 

50 first: Sequence[str] | None = None, 

51 last: Sequence[str] | None = None, 

52) -> MutableMapping[str, Any]: 

53 """Merge multiple headers into a single dict. 

54 

55 Given a list of dict-like data headers, combine them following the 

56 specified mode. 

57 

58 Parameters 

59 ---------- 

60 headers : `list` of `dict` (or `dict`-like) 

61 Collection of headers to combine. `~lsst.daf.base.PropertyList` 

62 is supported. 

63 mode : `str` 

64 Scheme to use when a header has the same key as another header 

65 but different value. Options are: 

66 

67 - ``'overwrite'`` : Value in later header overwrites earlier value. 

68 - ``'drop'`` : Entire key is dropped. If a key only appears in a 

69 subset of the headers, and is identical in those, it will be 

70 retained. 

71 - ``'diff'`` : As for ``drop`` but the dropped values are stored in a 

72 `list` of `dict` in the returned merged header in key 

73 ``__DIFF__``. The order used matches the supplied order or 

74 the sorted order if specified. This allows a simple header diff 

75 to be performed and associated with the original headers. Only 

76 keys that appear in all headers will be retained in the merged one. 

77 Unlike for ``'drop'`` headers that are identical and only present in 

78 a subset will always be included in the diff. 

79 - ``'first'`` : Retain first value encountered. 

80 - ``'append'`` : Convert value to list with a value for each header 

81 (`None` if the key was not present). If the value is 

82 identical in multiple headers but key is missing in 

83 some, then the single identical header is stored. 

84 sort : `bool`, optional 

85 If `True`, sort the supplied headers into date order if possible. 

86 This affects the resulting merged output depending on the requested 

87 merge mode. An attempt will be made to extract a date from the 

88 headers. 

89 first : `list` or `tuple`, optional 

90 Keys to retain even if they differ. For all modes excepting ``append`` 

91 (where it is ignored) the value in the merged header will always be 

92 the value first encountered. This is usually to allow time-dependent 

93 headers such as ``DATE-OBS`` and ``AZSTART`` to be retained to allow 

94 the header to indicate the range of values. No exception is raised if 

95 a key can not be found in a header since this allows a range of 

96 expected headers to be listed covering multiple instruments. 

97 last : `list` or `tuple`, optional 

98 Keys to retain even if they differ. For all modes excepting ``append`` 

99 (where it is ignored) the value in the merged header will always be 

100 the final value encountered. This is usually to allow time-dependent 

101 headers such as ``DATE-END`` and ``AZEND`` to be retained to allow 

102 the header to indicate the range of values. No exception is raised if 

103 a key can not be found in a header since this allows a range of 

104 expected headers to be listed covering multiple instruments. 

105 

106 Returns 

107 ------- 

108 merged : `dict` 

109 Single `dict` combining all the headers using the specified 

110 combination mode. 

111 

112 Notes 

113 ----- 

114 If ``first`` and ``last`` are supplied, the keys from ``first`` are 

115 handled first, followed by the keys from ``last``. No check is made to 

116 ensure that the keys do not overlap. 

117 """ 

118 if not headers: 

119 raise ValueError("No headers supplied.") 

120 

121 # Copy the input list because we will be reorganizing it 

122 headers = list(headers) 

123 

124 # With a single header provided return a copy immediately 

125 if len(headers) == 1: 

126 return copy.deepcopy(headers[0]) 

127 

128 if sort: 

129 

130 def key_func(hdr: Mapping[str, Any]) -> Any: 

131 translator_class = None 

132 try: 

133 translator_class = MetadataTranslator.determine_translator(hdr) 

134 except ValueError: 

135 # Try the FITS translator 

136 translator_class = FitsTranslator 

137 translator = translator_class(hdr) 

138 return translator.to_datetime_begin() 

139 

140 headers = sorted(headers, key=key_func) 

141 

142 log.debug("Received %d headers for merging", len(headers)) 

143 

144 # Pull out first header 

145 first_hdr = headers.pop(0) 

146 

147 # Seed the merged header with a copy 

148 merged = copy.deepcopy(first_hdr) 

149 

150 if mode == "overwrite": 

151 for h in headers: 

152 merged.update(h) 

153 

154 elif mode == "first": 

155 # Reversing the headers and using overwrite mode would result in the 

156 # header order being inconsistent dependent on mode. 

157 for hdr in headers: 

158 for key in hdr: 

159 if key not in merged: 

160 merged[key] = hdr[key] 

161 

162 elif mode == "drop": 

163 drop = set() 

164 for hdr in headers: 

165 for key in hdr: 

166 if key not in merged: 

167 merged[key] = hdr[key] 

168 elif merged[key] != hdr[key]: 

169 # Key should be dropped later (not in loop since removing 

170 # the key now might add it back for the next header). 

171 drop.add(key) 

172 

173 for key in drop: 

174 del merged[key] 

175 

176 elif mode == "diff": 

177 dropped_keys = set() 

178 

179 # Only want to keep keys in the merged header that are in all the 

180 # input headers and identical. Seed with the first header 

181 counter = Counter(merged.keys()) 

182 

183 for hdr in headers: 

184 counter.update(hdr.keys()) 

185 for key in hdr: 

186 if key not in merged: 

187 merged[key] = hdr[key] 

188 elif merged[key] != hdr[key]: 

189 # Key should be dropped later (not in loop since removing 

190 # the key now might add it back for the next header). 

191 dropped_keys.add(key) 

192 

193 # Add to the list of dropped keys all the keys that 

194 # have a count less than number of input headers (incl first one) 

195 n = len(headers) + 1 

196 for key in counter: 

197 if counter[key] != n: 

198 dropped_keys.add(key) 

199 

200 # For each dropped key, create a distinct diff header 

201 # We must include the first header in this 

202 diffs = [] 

203 for hdr in itertools.chain([first_hdr], headers): 

204 # Get a list of all the dropped keys that are in this header 

205 # Sometimes a key will only be in some headers. For now 

206 # do not include it in the diff at all. 

207 diff_keys = dropped_keys & set(hdr) 

208 

209 diffs.append({k: hdr[k] for k in diff_keys}) 

210 

211 # PropertyList does not let us attach a dict to it 

212 # so if we encounter this we have to force a type change to dict 

213 try: 

214 merged["__DIFF__"] = diffs 

215 except TypeError: 

216 merged = dict(merged) 

217 merged["__DIFF__"] = diffs 

218 

219 for key in dropped_keys: 

220 del merged[key] 

221 

222 elif mode == "append": 

223 fill = set() 

224 for hdr in headers: 

225 for key in hdr: 

226 if key not in merged: 

227 merged[key] = hdr[key] 

228 elif not isinstance(merged[key], list) and merged[key] != hdr[key]: 

229 # If we detect different values, store an empty list 

230 # in the slot and fill it later. Do it at end so 

231 # we can pick up earlier values and fill empty with None. 

232 merged[key] = [] 

233 fill.add(key) 

234 

235 # Fill the entries that have multiple differing values 

236 for key in fill: 

237 merged[key] = [h[key] if key in h else None for h in itertools.chain([first_hdr], headers)] 

238 

239 else: 

240 raise ValueError(f"Unsupported value of '{mode}' for mode parameter.") 

241 

242 # Force the first and last values to be inserted 

243 # 

244 if mode != "append": 

245 

246 def retain_value( 

247 to_receive: MutableMapping[str, Any], 

248 to_retain: Sequence[str] | None, 

249 sources: tuple[Mapping[str, Any], ...], 

250 ) -> None: 

251 if to_retain: 

252 for k in to_retain: 

253 # Look for values until we find one 

254 for h in sources: 

255 if k in h: 

256 to_receive[k] = h[k] 

257 break 

258 

259 all_headers = (first_hdr, *headers) 

260 retain_value(merged, first, all_headers) 

261 retain_value(merged, last, tuple(reversed(all_headers))) 

262 

263 return merged 

264 

265 

def _read_yaml(fh: IO[bytes], msg: str) -> Mapping[str, Any] | None:
    """Parse a YAML stream and return its top-level mapping.

    Parameters
    ----------
    fh : `io.IOBase`
        Open file handle containing the YAML stream.
    msg : `str`
        Text identifying this stream in log messages, for example
        "file something.yaml" or "resource module:resource".

    Returns
    -------
    parsed : `dict` or `None`
        The parsed contents when they form a mapping; `None` when the
        stream could not be parsed or its top level is not a mapping
        (a warning is logged in either case).
    """
    try:
        parsed = yaml.safe_load(fh)
    except Exception as exc:
        # Corrections are best-effort: report the problem but do not raise.
        log.warning("Error parsing YAML header corrections from %s: %s", msg, str(exc))
        return None

    if isinstance(parsed, Mapping):
        return parsed

    # Valid YAML but not key/value corrections (e.g. a list or scalar).
    log.warning("YAML Mapping not found in %s. Ignoring contents.", msg)
    return None

295 

296 

297def _find_from_file(header: MutableMapping[str, Any], paths: Sequence[str], target_file: str) -> str | None: 

298 """Search file system for matching correction files. 

299 

300 Parameters 

301 ---------- 

302 header : `dict` 

303 Header to update. 

304 paths : `list` 

305 Paths to search. 

306 target_file : `str` 

307 File to locate in the path. 

308 

309 Returns 

310 ------- 

311 correction_found : `str` or `None` 

312 The path of the correction file used to update the header or 

313 `None`. Only the first correction located in a path is used. 

314 """ 

315 for p in paths: 

316 correction_file = os.path.join(p, target_file) 

317 if os.path.exists(correction_file): 

318 with open(correction_file, "rb") as fh: 

319 log.debug("Applying header corrections from file %s", correction_file) 

320 corrections = _read_yaml(fh, f"file {correction_file}") 

321 

322 if corrections is None: 

323 continue 

324 

325 # Apply corrections 

326 header.update(corrections) 

327 

328 return correction_file 

329 return None 

330 

331 

332def _find_from_resource( 

333 header: MutableMapping[str, Any], package: str | None, resource_root: str | None, target_file: str 

334) -> str | None: 

335 """Search package resource for correction information. 

336 

337 Parameters 

338 ---------- 

339 header : `dict` 

340 Header to update. 

341 package : `str` 

342 Package resource to search. 

343 resource_root : `str` 

344 Resource root. 

345 target_file : `str` 

346 Resource to locate. 

347 

348 Returns 

349 ------- 

350 resource : `str` or `None` 

351 Name of resource read. `None` if no corrections found. 

352 """ 

353 if package is not None and resource_root is not None: 

354 resource_path = resources.files(package).joinpath(resource_root, target_file) # type: ignore 

355 if resource_path.is_file(): 

356 resource_uri = f"resource://{package}/{posixpath.join(resource_root, target_file)}" 

357 log.debug("Applying header corrections from package resource %s", resource_uri) 

358 with resource_path.open("rb") as fh: 

359 corrections = _read_yaml(fh, f"package resource {resource_path}") 

360 

361 if corrections is None: 

362 return None 

363 

364 header.update(corrections) 

365 

366 return resource_uri 

367 return None 

368 

369 

def fix_header(
    header: MutableMapping[str, Any],
    search_path: str | Sequence[str] | None = None,
    translator_class: type[MetadataTranslator] | None = None,
    filename: str | None = None,
) -> bool:
    """Update, in place, the supplied header with known corrections.

    Parameters
    ----------
    header : `dict`-like
        Header to correct.
    search_path : `list` or `str`, optional
        Explicit directory paths to search for correction files.
        A single directory path can be given as a string.
    translator_class : `MetadataTranslator`-class, optional
        If not `None`, the class to use to translate the supplied headers
        into standard form. Otherwise each registered translator class will
        be asked in turn if it knows how to translate the supplied header.
    filename : `str`, optional
        Name of the file whose header is being translated. For some
        datasets with missing header information this can sometimes
        allow for some fixups in translations.

    Returns
    -------
    fixed : `bool`
        `True` if the header was updated.

    Raises
    ------
    TypeError
        Raised if the supplied translation class is not a `MetadataTranslator`.

    Notes
    -----
    In order to determine that a header update is required it is
    necessary for the header to be handled by the supplied translator
    class or else support automatic translation class determination.
    It is also required that the ``observation_id`` and ``instrument``
    be calculable prior to header fix up. If a translator class can not
    be found or if there is a problem determining the instrument or
    observation ID, the function will return without action.

    Correction files use names of the form ``instrument-obsid.yaml`` (for
    example ``LATISS-AT_O_20190329_000022.yaml``).
    The YAML file should have the format of:

    .. code-block:: yaml

        EXPTIME: 30.0
        IMGTYPE: bias

    where each key/value pair is copied directly into the supplied header,
    overwriting any previous values.

    This function searches a number of locations for such a correction file.
    The search order is:

    - Any paths explicitly supplied through ``search_path``.
    - The contents of the PATH-like environment variable
      ``$METADATA_CORRECTIONS_PATH``.
    - Any search paths supplied by the matching translator class.

    The first file located in the search path is used for the correction.
    """
    # A previous call recorded its outcome in the header; trust that and
    # skip the (potentially slow) file-system scan and translator fixups.
    if FIXUP_SENTINEL in header:
        return header[FIXUP_SENTINEL]

    if translator_class is None:
        try:
            translator_class = MetadataTranslator.determine_translator(header, filename=filename)
        except ValueError as e:
            # An unrecognized header is not an error: leave it unmodified
            # and unmarked (no sentinel) so that a translator registered
            # later can still trigger a fix on a subsequent attempt.
            log.debug(
                "Unable to determine translator class %s -- not fixing header: %s",
                # Bug fix: this f-string previously contained no placeholder
                # ('f"for (unknown)"'), so the file name never appeared in
                # the log message.
                f"for {filename}" if filename is not None else "",
                e,
            )
            return False
    elif not issubclass(translator_class, MetadataTranslator):
        raise TypeError(f"Translator class must be a MetadataTranslator, not {translator_class}")

    # Create an instance for this header.
    translator = translator_class(header, filename=filename)

    # The correction file name is derived from the observation_id and the
    # instrument, so both must be calculable.
    try:
        obsid = translator.to_observation_id()
        instrument = translator.to_instrument()
    except Exception:
        # Return without comment if these translations failed.
        return False

    target_file = f"{instrument}-{obsid}.yaml"
    log.debug("Checking for header correction file named %s", target_file)

    # Assemble the search path: explicit paths first, then the environment
    # variable, then any translator-provided locations.
    paths: list[str] = []
    if search_path is not None:
        if isinstance(search_path, str):
            # Allow a single path to be given as a string.
            search_path = [search_path]
        paths.extend(search_path)
    if ENV_VAR_NAME in os.environ and os.environ[ENV_VAR_NAME]:
        paths.extend(os.environ[ENV_VAR_NAME].split(os.path.pathsep))

    paths.extend(translator.search_paths())

    # Prioritize file system overrides.
    corrections_file = _find_from_file(header, paths, target_file)

    # Apply updates from package resources only if none were found in files.
    if corrections_file is None:
        package, resource_root = translator.resource_root()
        corrections_file = _find_from_resource(header, package, resource_root, target_file)

    # Allow the translation class to do local fixups. A failure there must
    # not break header processing, so log it and carry on.
    try:
        translator_modified = translator_class.fix_header(header, instrument, obsid, filename=filename)
    except Exception as e:
        # log.fatal is a deprecated alias of log.critical; use the
        # documented name (same severity, same behavior).
        log.critical("Ignoring translator header fixup of %s %s: %s", instrument, obsid, e)
        translator_modified = False

    was_modified = (corrections_file is not None) or translator_modified

    # Always add a sentinel, even when nothing was updated, so that later
    # calls can return immediately instead of repeating the file-system
    # scan and the per-instrument translator methods. (The no-translator
    # path above deliberately skips this.)
    header[FIXUP_SENTINEL] = was_modified

    # Record provenance.
    # NOTE(review): this is a naive local-time timestamp; a UTC-aware value
    # would be unambiguous, but changing it alters the written header
    # format -- confirm with consumers before switching.
    header[HIERARCH + " DATE"] = datetime.datetime.now().isoformat()
    if corrections_file is not None:
        header[HIERARCH + " FILE"] = corrections_file
    if translator_modified:
        # Store the translator version.
        header[HIERARCH + " VERSION"] = translator_class.translator_version()

    return was_modified