Coverage for python/astro_metadata_translator/indexing.py: 27%

128 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-20 03:54 -0700

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12"""Functions to support file indexing.""" 

13 

14from __future__ import annotations 

15 

# Public API of this module: the names exported by a star-import.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)

24 

25import collections.abc 

26import json 

27import logging 

28import os 

29import sys 

30from collections.abc import MutableMapping, Sequence 

31from copy import deepcopy 

32from typing import IO, Any, Literal, overload 

33 

34from .file_helpers import read_file_info 

35from .headers import merge_headers 

36from .observationGroup import ObservationGroup 

37from .observationInfo import ObservationInfo 

38 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Key in an index structure under which the header content shared by all
# indexed files is stored.
COMMON_KEY = "__COMMON__"
# Key in an index structure recording the content mode that was used to
# build it.
CONTENT_KEY = "__CONTENT__"

43 

44 

def index_files(
    files: Sequence[str],
    root: str | None,
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO = sys.stdout,
    errstream: IO = sys.stderr,
) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str` or `None`
        Directory root that is joined with each supplied file path (if the
        supplied file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read and
        merged with the header from this HDU.
    print_trace : `bool`
        Forwarded unchanged to `read_file_info`. If there is an error
        reading the file and this parameter is `True`, a full traceback of
        the exception will be reported. If `False` prints a one line summary
        of the error condition. If `None` the exception will be allowed.
    content : `str`
        Form of data to store in the index. Options are:
        ``translated`` to store ObservationInfo in the index;
        ``metadata`` to store native metadata headers in the index.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. These paths include
        ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed, i.e. those for which
        `read_file_info` returned `None`. These paths include ``root`` when
        one was supplied.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        # Report the offending value; the message was previously a plain
        # string containing a literal, never-interpolated placeholder.
        raise ValueError(f"Unrecognized mode {content}")

    failed: list[str] = []
    okay: list[str] = []

    # Mapping of supplied path (without root) to file content.
    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir.
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed

131 

132 

def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, str | MutableMapping[str, Any]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers, keyed by filename.
    content_mode : `str`
        The mode associated with these headers, either ``metadata`` or
        ``translated``. It is only recorded in the result so that the index
        can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index: the content
        mode, the merged common headers, and one entry per file.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    known_modes = ("metadata", "translated")
    if content_mode not in known_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse every header into one primary header plus per-file diffs.
    common = merge_headers(list(headers.values()), mode="diff")

    # A single file can yield an LSST-style PropertyList instead of a dict.
    # JSON requires a dict; mypy has no knowledge of PropertyList so the
    # type error has to be silenced.
    if not isinstance(common, collections.abc.Mapping):
        common = dict(common)  # type: ignore

    # Target layout (shown as YAML):
    # __COMMON__:
    #   KEY1: value1
    #   KEY2: value2
    # FILE1:
    #   KEY3: value3a
    # FILE2:
    #   KEY3: value3b

    # With only one file no diff is produced, so substitute a single empty
    # diff to keep the layout uniform.
    per_file_diffs = common.pop("__DIFF__", [{}])

    # The mode and the common headers lead the output; the mode tells a
    # reader how the remaining entries are to be interpreted.
    index: MutableMapping[str, str | MutableMapping[str, Any]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: common,
    }
    # Filenames and diffs are paired positionally.
    index.update(zip(headers, per_file_diffs))

    return index

188 

189 

@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


# The default is declared here so that a plain ``read_index(path)`` call
# matches an overload for static type checkers.
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False] = ...,
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [`str`, `dict` | `.ObservationInfo`]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON text is UTF-8; be explicit rather than relying on the
    # platform's locale-dependent default encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)

235 

236 

@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Convert the content of a JSON index file into per-file information.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `.ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `.ObservationInfo` this will return
        an `.ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `.ObservationInfo` (the values
        will be `.ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the common-content key is missing from ``content``.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    per_file = deepcopy(content)

    mode = per_file.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"

    # Fold the shared headers into every remaining (per-file) entry.
    shared = per_file.pop(COMMON_KEY)
    for header in per_file.values():
        header.update(shared)

    if mode == "metadata":
        # Plain dict content needs no further conversion.
        return per_file

    # The values are really all ObservationInfo, but mypy needs the
    # annotation to look like the function return value.
    by_file: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for file, hdr in per_file.items():
        info = ObservationInfo.from_simple(hdr)
        info.filename = file
        by_file[file] = info

    if force_dict:
        return by_file
    return ObservationGroup(list(by_file.values()))

331 

332 

def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON text is UTF-8; be explicit rather than relying on the
    # platform's locale-dependent default encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)

357 

358 

@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]: ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Convert the content of a JSON sidecar file into its usable form.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `.ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter in which
        case a `dict` will always be returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    sidecar = deepcopy(content)

    mode = sidecar.pop(CONTENT_KEY, None)
    guessed = False
    if force_metadata:
        mode = "metadata"
    elif mode is None:
        # No recorded mode. Every ObservationInfo has observation_id and
        # instrument, so their presence is used as the hint.
        guessed = True
        looks_translated = "observation_id" in sidecar and "instrument" in sidecar
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # Plain dict content needs no further conversion.
        return sidecar

    try:
        return ObservationInfo.from_simple(sidecar)
    except Exception:
        if guessed:
            # The mode was only a guess, so treat the content as a plain
            # dict rather than failing.
            return sidecar
        raise

431 

432 return info