Coverage for python/astro_metadata_translator/indexing.py: 14%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

106 statements  

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

# Names exported by ``from <module> import *``.
# NOTE(review): ``read_sidecar`` and ``process_sidecar_data`` are defined in
# this module but are not listed here -- confirm whether that is intentional.
14__all__ = ("read_index", "calculate_index", "index_files", "process_index_data") 

15 

# NOTE: this string follows other statements, so Python treats it as a plain
# expression statement; it is not stored as the module ``__doc__``.
16"""Functions to support file indexing.""" 

17 

18import collections.abc 

19import json 

20import logging 

21import os 

22import sys 

23from copy import deepcopy 

24from typing import IO, Any, List, MutableMapping, Optional, Sequence, Tuple, Union 

25 

26from .file_helpers import read_file_info 

27from .headers import merge_headers 

28from .observationGroup import ObservationGroup 

29from .observationInfo import ObservationInfo 

30 

# Module-level logger, named after this module.
31log = logging.getLogger(__name__) 

32 

# Index key under which the header content shared by every file is stored.
33COMMON_KEY = "__COMMON__" 

# Index key recording the content mode ("translated" or "metadata") that was
# used to build the index.
34CONTENT_KEY = "__CONTENT__" 

35 

36 

def index_files(
    files: Sequence[str],
    root: Optional[str],
    hdrnum: int,
    print_trace: Optional[bool],
    content: str,
    outstream: IO = sys.stdout,
    errstream: IO = sys.stderr,
) -> Tuple[MutableMapping[str, Union[str, MutableMapping[str, Any]]], List[str], List[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str` or `None`
        Directory root that is joined to each supplied file path before the
        file is read. Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. Passed unchanged to `read_file_info`.
    print_trace : `bool` or `None`
        Passed unchanged to `read_file_info`. If there is an error reading
        the file and this parameter is `True`, a full traceback of the
        exception will be reported. If `False` prints a one line summary of
        the error condition. If `None` the exception will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. Unlike the keys of
        ``file_index`` these paths do include ``root`` when one is given.
    failed : `list` of `str`
        All the files for which `read_file_info` returned `None`, again
        including ``root`` when one is given.

    Raises
    ------
    ValueError
        Raised if ``content`` is not ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        # The message must be an f-string naming the rejected value; the
        # previous text emitted the literal characters "{mode}".
        raise ValueError(f"Unrecognized mode {content}")

    failed: List[str] = []
    okay: List[str] = []

    # Mapping of supplied path (without root) to the simplified file content.
    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir.
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed

122 

123 

def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, Union[str, MutableMapping[str, Any]]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        Header content keyed by filename.
    content_mode : `str`
        Either ``metadata`` or ``translated``. It is only validated and
        recorded in the result so that a reader knows how to interpret
        the index later.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        Structure suitable for writing to an index. Sketched in YAML::

            __CONTENT__: <content_mode>
            __COMMON__:
              KEY1: value1
              KEY2: value2
            FILE1:
              KEY3: value3a
            FILE2:
              KEY3: value3b

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    if content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse every header into the shared part plus per-file differences.
    common = merge_headers(list(headers.values()), mode="diff")

    # A single file can yield an LSST-style PropertyList rather than a dict.
    # JSON needs a dict, and mypy knows nothing about PropertyList, hence
    # the ignored type error.
    if not isinstance(common, collections.abc.Mapping):
        common = dict(common)  # type: ignore

    # With only one file there is no diff entry; substitute a single empty
    # diff so the output has the same shape as the multi-file case.
    per_file_diffs = common.pop("__DIFF__", [dict()])

    # The mode and the common headers go first, followed by one entry per
    # file pairing each filename with its diff.
    index: MutableMapping[str, Union[str, MutableMapping[str, Any]]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: common,
    }
    index.update(zip(headers, per_file_diffs))

    return index

179 

180 

def read_index(
    path: str, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[str, MutableMapping[str, Any], ObservationInfo]]]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict[str, Union[dict, ObservationInfo]]`
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 instead of relying on the locale-dependent
    # default encoding, so the same file reads identically on every platform.
    with open(path, encoding="utf-8") as fd:
        content = json.load(fd)

    return process_index_data(content, force_dict=force_dict)

206 

207 

def process_index_data(
    content: MutableMapping[str, Any], force_metadata: bool = False, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[str, MutableMapping[str, Any], ObservationInfo]]]:
    """Expand the content of a JSON index file into per-file records.

    Parameters
    ----------
    content : `dict`
        Data structure read from a JSON index file, in simple python form.
        It is not modified; a deep copy is expanded instead.
    force_metadata : `bool`, optional
        If `True` the index is treated as ``metadata`` content regardless
        of the mode recorded in it, so plain dicts are returned even for an
        index of `ObservationInfo`.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        For ``metadata`` content, a `dict` keyed by file path whose values
        are the per-file entries with the common entries merged in.
        Otherwise each merged entry is converted with
        `ObservationInfo.from_simple` and the result is an
        `ObservationGroup`, or a `dict` of filename to `ObservationInfo`
        when ``force_dict`` is `True`.

    Raises
    ------
    ValueError
        Raised if ``content`` has no ``__COMMON__`` key.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    unpacked = deepcopy(content)

    stored_mode = unpacked.pop(CONTENT_KEY, None)
    if not force_metadata and stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
    content_mode = "metadata" if force_metadata or stored_mode is None else stored_mode

    # Everything left after removing the common block is a per-file entry;
    # fold the common headers into each of them.
    common = unpacked.pop(COMMON_KEY)
    for hdr in unpacked.values():
        hdr.update(common)

    if content_mode == "metadata":
        # The merged dicts are already the final answer.
        return unpacked

    # This type annotation is really MutableMapping[str, ObservationInfo]
    # but mypy needs it to look like the function return value.
    by_file: MutableMapping[str, Union[str, MutableMapping[str, Any], ObservationInfo]] = {}
    for file, hdr in unpacked.items():
        info = ObservationInfo.from_simple(hdr)
        info.filename = file
        by_file[file] = info

    if force_dict:
        return by_file
    # Dict values keep insertion order, so the group matches the file order.
    return ObservationGroup(list(by_file.values()))

279 

280 

def read_sidecar(path: str) -> Union[ObservationInfo, MutableMapping[str, MutableMapping[str, Any]]]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        The return content matches that returned by `process_sidecar_data`:
        an `ObservationInfo` if the sidecar file referred to
        `ObservationInfo`, otherwise a `dict`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 instead of relying on the locale-dependent
    # default encoding, so the same file reads identically on every platform.
    with open(path, encoding="utf-8") as fd:
        content = json.load(fd)

    return process_sidecar_data(content)

302 

303 

def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> Union[ObservationInfo, MutableMapping[str, MutableMapping[str, Any]]]:
    """Interpret the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure read from a JSON sidecar file, in simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        If `True` the content is treated as ``metadata`` regardless of the
        mode recorded in it, so a plain dict is returned even for a
        sidecar of `ObservationInfo`.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        An `ObservationInfo` when the content is in ``translated`` mode,
        otherwise the copied `dict` with the content-mode key removed.
        When no mode is recorded the mode is guessed, and the `dict` is
        returned if the guessed conversion to `ObservationInfo` fails.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    payload = deepcopy(content)

    stored_mode = payload.pop(CONTENT_KEY, None)
    guessed = False
    if force_metadata:
        mode = "metadata"
    elif stored_mode is not None:
        mode = stored_mode
    else:
        # All ObservationInfo objects will have observation_id and
        # instrument, so their presence is used as the hint.
        guessed = True
        looks_translated = "observation_id" in payload and "instrument" in payload
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # The plain dict is already the final answer.
        return payload

    try:
        return ObservationInfo.from_simple(payload)
    except Exception as e:
        if guessed:
            # The guess was wrong: this does not look like ObservationInfo
            # after all, so hand back the plain content.
            return payload
        raise e