Coverage for python/astro_metadata_translator/indexing.py: 21%

135 statements  

« prev     ^ index     » next       coverage.py v6.4, created at 2022-06-02 03:27 -0700

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ( 

15 "read_index", 

16 "read_sidecar", 

17 "calculate_index", 

18 "index_files", 

19 "process_index_data", 

20 "process_sidecar_data", 

21) 

22 

23"""Functions to support file indexing.""" 

24 

25import collections.abc 

26import json 

27import logging 

28import os 

29import sys 

30from copy import deepcopy 

31from typing import IO, Any, List, Literal, MutableMapping, Optional, Sequence, Tuple, Union, overload 

32 

33from .file_helpers import read_file_info 

34from .headers import merge_headers 

35from .observationGroup import ObservationGroup 

36from .observationInfo import ObservationInfo 

37 

38log = logging.getLogger(__name__) 

39 

40COMMON_KEY = "__COMMON__" 

41CONTENT_KEY = "__CONTENT__" 

42 

43 

def index_files(
    files: Sequence[str],
    root: Optional[str],
    hdrnum: int,
    print_trace: Optional[bool],
    content: str,
    outstream: IO = sys.stdout,
    errstream: IO = sys.stderr,
) -> Tuple[MutableMapping[str, Union[str, MutableMapping[str, Any]]], List[str], List[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str` or `None`
        Directory root that is joined with each supplied file path before
        the file is read. Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read as well
        (the handling is delegated to `read_file_info`).
    print_trace : `bool` or `None`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed to propagate.
    content : `str`
        Form of data to store in the index. Options are:
        ``translated`` to store ObservationInfo in the index;
        ``metadata`` to store native metadata headers in the index.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        Paths (including any ``root`` prefix) of all the files that were
        processed successfully.
    failed : `list` of `str`
        Paths (including any ``root`` prefix) of all the files that could
        not be processed. Expected to be empty when ``print_trace`` is
        `None`, since read errors then surface as exceptions instead.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of the recognized modes.
    """
    if content not in ("translated", "metadata"):
        # Report the offending value; the parameter is called ``content``.
        raise ValueError(f"Unrecognized mode {content}")

    failed: List[str] = []
    okay: List[str] = []

    # Mapping of the path as supplied by the caller (without root) to the
    # simplified file content.
    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            # read_file_info signals a handled failure by returning None.
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir.
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed

129 

130 

def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, Union[str, MutableMapping[str, Any]]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers, keyed by filename.
    content_mode : `str`
        The mode associated with these headers (``metadata`` or
        ``translated``). It is only recorded in the output so that the
        index can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    if content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse every header into one shared header plus per-file differences.
    shared = merge_headers(list(headers.values()), mode="diff")

    # A single file can yield an LSST-style PropertyList rather than a
    # dict. JSON needs a dict; mypy has no knowledge of PropertyList so
    # the type error has to be ignored.
    if not isinstance(shared, collections.abc.Mapping):
        shared = dict(shared)  # type: ignore

    # Layout of the result (shown as YAML):
    # __COMMON__:
    #   KEY1: value1
    #   KEY2: value2
    # FILE1:
    #   KEY3: value3a
    # FILE2:
    #   KEY3: value3b

    # With only one file there is no diff entry; substitute a single empty
    # diff so the output has the same shape as the multi-file case.
    per_file_diffs = shared.pop("__DIFF__", [dict()])

    # The content mode and the common headers lead the output; the mode is
    # kept so a reader can work out how to interpret the file entries.
    index: MutableMapping[str, Union[str, MutableMapping[str, Any]]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: shared,
    }
    # Pair each filename with its diff, in header order.
    index.update(zip(headers, per_file_diffs))

    return index

186 

187 

@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]:
    ...


@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False],
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    ...


# Fallback overload so that ``read_index(path)`` and calls passing a plain
# ``bool`` type-check, matching the real signature below.
@overload
def read_index(
    path: str, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    ...


def read_index(
    path: str, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict[str, Union[dict, ObservationInfo]]`
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension or if the
        top-level JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, "r", encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)

234 

235 

@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    """Expand the data structure read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in a JSON index file, converted to simple
        python form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        By default the content returned matches the form that was used
        when the index was created. If `True`, an index of
        `ObservationInfo` is returned as if it were simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        If the index referred to `ObservationInfo` an `ObservationGroup`
        is returned, otherwise a `dict` whose keys are the file entries
        and whose values are the keys and values stored in the index
        (with the common information merged in). This can be overridden
        using the ``force_metadata`` parameter. If ``force_dict`` is
        `True` a `dict` with filename keys is returned even if the index
        refers to `ObservationInfo` (the values will be `ObservationInfo`
        unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the common-content key is missing from ``content``.

    Notes
    -----
    File keys are returned exactly as stored in the index; they are
    presumably relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    index = deepcopy(content)

    stored_mode = index.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"
    else:
        mode = stored_mode

    # Fold the shared headers into every per-file entry.
    shared = index.pop(COMMON_KEY)
    for per_file in index.values():
        per_file.update(shared)

    if mode == "metadata":
        # Plain header dicts need no further conversion.
        return index

    # The values here are really all ObservationInfo, but mypy needs the
    # annotation to look like the function return value.
    translated: MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]] = {}
    for filename, simple in index.items():
        obs = ObservationInfo.from_simple(simple)
        obs.filename = filename
        translated[filename] = obs

    if force_dict:
        return translated
    return ObservationGroup(list(translated.values()))

334 

335 

def read_sidecar(path: str) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned. The
        conversion is performed by `process_sidecar_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension or if the
        top-level JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, "r", encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)

360 

361 

@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    """Convert the data read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in a JSON sidecar file, converted to simple
        python form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        By default the content returned matches the form that was used
        when the sidecar was created. If `True`, a sidecar of
        `ObservationInfo` is returned as if it were simple dict content.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `Any`]
        If the sidecar referred to `ObservationInfo` an `ObservationInfo`
        is returned, otherwise a `dict`. With ``force_metadata`` set a
        `dict` is always returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    payload = deepcopy(content)

    stored_mode = payload.pop(CONTENT_KEY, None)
    inferred = False
    if force_metadata:
        mode = "metadata"
    elif stored_mode is not None:
        mode = stored_mode
    else:
        # No mode was recorded. Every ObservationInfo carries both
        # observation_id and instrument, so their presence is used as
        # the hint that this is translated content.
        inferred = True
        looks_translated = "observation_id" in payload and "instrument" in payload
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # Plain header content needs no further conversion.
        return payload

    try:
        return ObservationInfo.from_simple(payload)
    except Exception as exc:
        if not inferred:
            raise exc
        # The mode was only a guess, so a failed conversion means this
        # was not ObservationInfo content after all.
        return payload