Coverage for python/astro_metadata_translator/indexing.py: 21%

136 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-12 11:01 -0700

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12"""Functions to support file indexing.""" 

13 

14from __future__ import annotations 

15 

# Public API of this module: the names exported by a star-import.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)

24 

25import collections.abc 

26import json 

27import logging 

28import os 

29import sys 

30from collections.abc import MutableMapping, Sequence 

31from copy import deepcopy 

32from typing import IO, Any, Literal, overload 

33 

34from .file_helpers import read_file_info 

35from .headers import merge_headers 

36from .observationGroup import ObservationGroup 

37from .observationInfo import ObservationInfo 

38 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Reserved top-level keys of the index data structure.
# COMMON_KEY maps to the header content shared by every indexed file.
COMMON_KEY = "__COMMON__"
# CONTENT_KEY records the content mode ("translated" or "metadata") that was
# used to build the index or sidecar, so readers know how to deserialize it.
CONTENT_KEY = "__CONTENT__"

43 

44 

45def index_files( 

46 files: Sequence[str], 

47 root: str | None, 

48 hdrnum: int, 

49 print_trace: bool, 

50 content: str, 

51 outstream: IO = sys.stdout, 

52 errstream: IO = sys.stderr, 

53) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]: 

54 """Create an index from the supplied files. 

55 

56 No file is written. The Python structure returned is suitable 

57 for writing. 

58 

59 Parameters 

60 ---------- 

61 files : iterable of `str` 

62 Paths to the files to be indexed. They do not have to all be 

63 in a single directory but all content will be indexed into a single 

64 index. 

65 root : `str` 

66 Directory root that can be combined with each file (if the supplied) 

67 file is relative. Will be ignored if `None`. 

68 hdrnum : `int` 

69 The HDU number to read. The primary header is always read and 

70 print_trace : `bool` 

71 If there is an error reading the file and this parameter is `True`, 

72 a full traceback of the exception will be reported. If `False` prints 

73 a one line summary of the error condition. If `None` the exception 

74 will be allowed. 

75 content : `str` 

76 Form of data to write in index file. Options are: 

77 ``translated`` (default) to write ObservationInfo to the index; 

78 ``metadata`` to write native metadata headers to the index. 

79 The index file is called ``{mode}_index.json`` 

80 outstream : `io.StringIO`, optional 

81 Output stream to use for standard messages. Defaults to `sys.stdout`. 

82 errstream : `io.StringIO`, optional 

83 Stream to send messages that would normally be sent to standard 

84 error. Defaults to `sys.stderr`. 

85 

86 Returns 

87 ------- 

88 file_index : `dict` of [`str`, `dict`] 

89 The headers in form suitable for writing to an index. The keys will 

90 be ``__COMMON__`` for shared content, ``__CONTENT_`` to record the 

91 content mode used to construct the index, and paths to the files. The 

92 paths will be the supplied paths and will not include any supplied 

93 ``root``. 

94 okay : `list` of `str` 

95 All the files that were processed successfully. 

96 failed : `list` of `str` 

97 All the files that could not be processed. Will be empty if 

98 ``print_trace`` is not `None`. 

99 """ 

100 if content not in ("translated", "metadata"): 

101 raise ValueError("Unrecognized mode {mode}") 

102 

103 failed: list[str] = [] 

104 okay: list[str] = [] 

105 

106 content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {} # Mapping of path to file content 

107 for file in sorted(files): 

108 if root is not None: 

109 path = os.path.join(root, file) 

110 else: 

111 path = file 

112 simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream) 

113 if simple is None: 

114 failed.append(path) 

115 continue 

116 else: 

117 okay.append(path) 

118 

119 # Store the information indexed by the filename within dir 

120 # We may get a PropertyList here and can therefore not just 

121 # assert Mapping for mypy. We therefore assert that it's not the 

122 # other 2 options, which we were enforcing with the "simple" parameter 

123 # in the call to read_file_info. 

124 assert not isinstance(simple, (str, ObservationInfo)) 

125 content_by_file[file] = simple 

126 

127 output = calculate_index(content_by_file, content) 

128 

129 return output, okay, failed 

130 

131 

132def calculate_index( 

133 headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str 

134) -> MutableMapping[str, str | MutableMapping[str, Any]]: 

135 """Calculate an index data structure from the supplied headers. 

136 

137 Parameters 

138 ---------- 

139 headers : `dict` of [`str`, `dict`] 

140 The headers indexed by filename. 

141 content_mode : `str` 

142 The mode associated with these headers. Not used other than to 

143 store the information in the data structure for later use on 

144 deserialization. 

145 

146 Returns 

147 ------- 

148 index_ : `dict` of [`str`, `dict`] 

149 The headers in form suitable for writing to an index. 

150 """ 

151 if content_mode not in ("metadata", "translated"): 

152 raise ValueError(f"Unrecognized mode for index creation: {content_mode}") 

153 

154 # Merge all the information into a primary plus diff 

155 merged = merge_headers([hdr for hdr in headers.values()], mode="diff") 

156 

157 # For a single file it is possible that the merged contents 

158 # are not a dict but are an LSST-style PropertyList. JSON needs 

159 # dict though. mypy can't know about PropertyList so we must ignore 

160 # the type error. 

161 if not isinstance(merged, collections.abc.Mapping): 

162 merged = dict(merged) # type: ignore 

163 

164 # The structure to write to file is intended to look like (in YAML): 

165 # __COMMON__: 

166 # KEY1: value1 

167 # KEY2: value2 

168 # FILE1: 

169 # KEY3: value3a 

170 # FILE2: 

171 # KEY3: value3b 

172 

173 # if there was only one file there will not be a diff but we 

174 # want it to look like there was. 

175 diff_dict = merged.pop("__DIFF__", [dict()]) 

176 

177 # Put the common headers first in the output. 

178 # Store the mode so that we can work out how to read the file in 

179 output: MutableMapping[str, str | MutableMapping[str, Any]] = { 

180 CONTENT_KEY: content_mode, 

181 COMMON_KEY: merged, 

182 } 

183 for file, diff in zip(headers, diff_dict): 

184 output[file] = diff 

185 

186 return output 

187 

188 

@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False] = ...,
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


@overload
def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [ `str`, `dict` | `.ObservationInfo` ]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.

    Notes
    -----
    The overloads accept an omitted ``force_dict`` and a plain `bool`, so
    that every call the implementation supports also type checks.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON text is UTF-8; be explicit rather than relying on the locale's
    # default encoding, which differs between platforms.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    # The annotation above is only a hint: JSON may hold a list or scalar.
    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)

236 

237 

@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Expand the data structure read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is worked on instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `.ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `.ObservationInfo` this will return
        an `.ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `.ObservationInfo` (the values
        will be `.ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if ``content`` has no common-content key and therefore does
        not look like an index.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    working = deepcopy(content)

    stored_mode = working.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"
    else:
        mode = stored_mode

    # Once both reserved keys are removed only per-file entries remain;
    # fold the shared content into each of them.
    shared = working.pop(COMMON_KEY)
    for per_file in working.values():
        per_file.update(shared)

    if mode == "metadata":
        # Plain header dicts need no further conversion.
        return working

    observations: list[ObservationInfo] = []
    # Really MutableMapping[str, ObservationInfo], but mypy needs the
    # annotation to look like the function's return type.
    keyed: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for name, simple in working.items():
        obs = ObservationInfo.from_simple(simple)
        obs.filename = name
        observations.append(obs)
        keyed[name] = obs

    return keyed if force_dict else ObservationGroup(observations)

335 

336 

def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON text is UTF-8; be explicit rather than relying on the locale's
    # default encoding, which differs between platforms.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    # The annotation above is only a hint: JSON may hold a list or scalar.
    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)

361 

362 

@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]:
    ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Interpret the data structure read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is worked on instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `.ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter in which
        case a `dict` will always be returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    content = deepcopy(content)

    stored_mode = content.pop(CONTENT_KEY, None)
    # The mode is only guessed when nothing was recorded in the file and
    # the caller did not force one.
    guessed = stored_mode is None and not force_metadata

    if force_metadata:
        mode = "metadata"
    elif guessed:
        # Every ObservationInfo carries observation_id and instrument, so
        # their joint presence is taken as the sign of translated content.
        looks_translated = "observation_id" in content and "instrument" in content
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)
    else:
        mode = stored_mode

    if mode == "metadata":
        # Plain header dicts need no further conversion.
        return content

    try:
        return ObservationInfo.from_simple(content)
    except Exception:
        if not guessed:
            raise
        # The guess was wrong: this is evidently not ObservationInfo
        # content, so hand back the plain dict instead.
        return content

438 

439 return info