Coverage for python/astro_metadata_translator/indexing.py: 27%

127 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-06 03:48 -0700

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12"""Functions to support file indexing.""" 

13 

14from __future__ import annotations 

15 

16__all__ = ( 

17 "read_index", 

18 "read_sidecar", 

19 "calculate_index", 

20 "index_files", 

21 "process_index_data", 

22 "process_sidecar_data", 

23) 

24 

25import collections.abc 

26import json 

27import logging 

28import os 

29from collections.abc import MutableMapping, Sequence 

30from copy import deepcopy 

31from typing import IO, Any, Literal, overload 

32 

33from .file_helpers import read_file_info 

34from .headers import merge_headers 

35from .observationGroup import ObservationGroup 

36from .observationInfo import ObservationInfo 

37 

# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Key in an index structure under which calculate_index stores the header
# content shared by every file; process_index_data merges it back into each
# per-file entry.
COMMON_KEY = "__COMMON__"
# Key in an index structure under which calculate_index records the content
# mode ("translated" or "metadata") so readers know how to interpret entries.
CONTENT_KEY = "__CONTENT__"

42 

43 

def index_files(
    files: Sequence[str],
    root: str | None,
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO | None = None,
) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read and
        merged with the header from this HDU.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `None` which
        uses the default output stream.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. Unlike the index
        keys, these paths include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed. Unlike the index keys,
        these paths include ``root`` when one was supplied. Will be empty if
        ``print_trace`` is `None`, since in that case a read error is
        allowed to propagate instead of being recorded.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        # Report the value that was actually supplied.
        raise ValueError(f"Unrecognized mode {content}")

    failed: list[str] = []
    okay: list[str] = []

    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}  # Mapping of path to file content
    for file in sorted(files):
        path = os.path.join(root, file) if root is not None else file
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream)
        if simple is None:
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed

127 

128 

def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, str | MutableMapping[str, Any]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers, keyed by filename.
    content_mode : `str`
        The mode these headers were read with (``metadata`` or
        ``translated``). It is only recorded in the result so that the
        index can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    valid_modes = ("metadata", "translated")
    if content_mode not in valid_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Reduce the headers to the shared content plus per-file differences.
    shared = merge_headers(list(headers.values()), mode="diff")

    # With a single file the merge result may be an LSST-style PropertyList
    # rather than a dict. JSON requires a dict, and mypy has no knowledge of
    # PropertyList, hence the ignore.
    if not isinstance(shared, collections.abc.Mapping):
        shared = dict(shared)  # type: ignore

    # Target layout (shown as YAML):
    # __COMMON__:
    #   KEY1: value1
    #   KEY2: value2
    # FILE1:
    #   KEY3: value3a
    # FILE2:
    #   KEY3: value3b

    # A single file yields no diff entry; substitute an empty one so the
    # result has the same shape as the multi-file case.
    per_file = shared.pop("__DIFF__", [dict()])

    # The mode and the shared headers lead the output; the mode tells a
    # reader how to interpret the entries.
    index: MutableMapping[str, str | MutableMapping[str, Any]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: shared,
    }
    index.update(zip(headers, per_file))

    return index

184 

185 

@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


# The default here lets the plain call ``read_index(path)`` match an overload.
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False] = False,
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [ `str`, \
            `dict` | `.ObservationInfo` ]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)

231 

232 

@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Expand the data structure read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        By default the content returned matches the form that was used
        when the index was created. If `True`, an index of
        `.ObservationInfo` is returned as if it were simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        If the index referred to `.ObservationInfo` an `.ObservationGroup`
        is returned. Otherwise a `dict` is returned whose keys are paths to
        files and whose values are the keys and values stored in the index
        with the common information merged in. Use ``force_metadata`` to
        always obtain the latter. If ``force_dict`` is `True` a `dict`
        keyed by filename is returned even when the index refers to
        `.ObservationInfo` (the values are then `.ObservationInfo` unless
        ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the common-content key is missing from ``content``.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    working = deepcopy(content)

    # Remove the bookkeeping entries; what remains is keyed by file.
    stored_mode = working.pop(CONTENT_KEY, None)
    shared = working.pop(COMMON_KEY)

    if force_metadata:
        mode = "metadata"
    elif stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"
    else:
        mode = stored_mode

    # Fold the shared headers back into every per-file entry.
    for entry in working.values():
        entry.update(shared)

    if mode == "metadata":
        # Plain header dicts need no further conversion.
        return working

    # The values are really ObservationInfo, but mypy needs the annotation
    # to look like the function return value.
    by_file: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for name, simple in working.items():
        info = ObservationInfo.from_simple(simple)
        info.filename = name
        by_file[name] = info

    if force_dict:
        return by_file
    return ObservationGroup(list(by_file.values()))

327 

328 

def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)

353 

354 

@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]: ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Interpret the data structure read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        By default the content returned matches the form that was used
        when the sidecar was created. If `True`, a sidecar of
        `.ObservationInfo` is returned as if it were simple dict content.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        An `.ObservationInfo` if the sidecar referred to one, otherwise a
        `dict`. With ``force_metadata`` set a `dict` is always returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    payload = deepcopy(content)
    stored_mode = payload.pop(CONTENT_KEY, None)

    guessed = False
    if force_metadata:
        mode = "metadata"
    elif stored_mode is not None:
        mode = stored_mode
    else:
        # No recorded mode. Every ObservationInfo carries observation_id
        # and instrument, so their presence is taken as a hint.
        guessed = True
        looks_translated = "observation_id" in payload and "instrument" in payload
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # Plain header dicts need no further conversion.
        return payload

    try:
        return ObservationInfo.from_simple(payload)
    except Exception as exc:
        if not guessed:
            raise exc
        # The mode was only a guess and it did not convert, so hand back
        # the plain content instead.
        return payload