Coverage for python/astro_metadata_translator/indexing.py: 14%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

103 statements  

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

"""Functions to support file indexing."""

# The docstring must be the first statement of the module to become
# ``__doc__``; the public-name declaration follows it.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)

15 

16import collections.abc 

17import json 

18import logging 

19import os 

20import sys 

21from copy import deepcopy 

22 

23from .observationInfo import ObservationInfo 

24from .observationGroup import ObservationGroup 

25from .headers import merge_headers 

26from .file_helpers import read_file_info 

27 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Key under which `calculate_index` stores the content shared by all the
# indexed files.
COMMON_KEY = "__COMMON__"
# Key under which the content mode ("translated" or "metadata") is recorded
# so that readers know how to interpret the stored data.
CONTENT_KEY = "__CONTENT__"

32 

33 

def index_files(files, root, hdrnum, print_trace, content, outstream=sys.stdout, errstream=sys.stderr):
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read, forwarded to `read_file_info`. The primary
        header is always read as well.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. Unlike the index
        keys, these paths include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files for which `read_file_info` returned `None`. These
        paths include ``root`` when one was supplied.

    Raises
    ------
    ValueError
        Raised if ``content`` is not ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        raise ValueError(f"Unrecognized mode {content}")

    failed = []
    okay = []

    content_by_file = {}  # Mapping of supplied path to file content
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)

        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            failed.append(path)
            continue
        okay.append(path)

        # Index by the path as supplied (without root) so that the index
        # remains valid relative to wherever it ends up being written.
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed

106 

107 

def calculate_index(headers, content_mode):
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        Header content keyed by filename.
    content_mode : `str`
        Mode these headers were generated with (``metadata`` or
        ``translated``). It is only recorded in the output so that the
        index can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        Structure suitable for writing to an index file.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    known_modes = ("metadata", "translated")
    if content_mode not in known_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse everything into the shared content plus per-file differences.
    common = merge_headers(headers.values(), mode="diff")

    # With a single file the merge result may be an LSST-style PropertyList
    # rather than a dict, and JSON serialization requires a dict.
    if not isinstance(common, collections.abc.Mapping):
        common = dict(common)

    # Intended layout of the result (shown as YAML):
    #   __COMMON__:
    #     KEY1: value1
    #     KEY2: value2
    #   FILE1:
    #     KEY3: value3a
    #   FILE2:
    #     KEY3: value3b
    #
    # A lone file yields no diff section, so substitute a single empty
    # diff to keep the layout uniform.
    per_file_diffs = common.pop("__DIFF__", [dict()])

    # Content mode and shared headers go first, followed by one entry per
    # file holding that file's differences.
    index = {CONTENT_KEY: content_mode, COMMON_KEY: common}
    index.update(zip(headers, per_file_diffs))
    return index

157 

158 

def read_index(path, force_dict=False):
    """Load a JSON index file and convert it to its in-memory form.

    Parameters
    ----------
    path : `str`
        Location of the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the result is always a dict keyed by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict[str, Union[dict, ObservationInfo]]`
        Whatever `process_index_data` returns for the file content.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    if path.endswith(".json"):
        with open(path, "r") as stream:
            index_data = json.load(stream)
        return process_index_data(index_data, force_dict=force_dict)

    raise ValueError(f"Index files must be in .json format; got {path}")

182 

183 

def process_index_data(content, force_metadata=False, force_dict=False):
    """Process the content read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `ObservationInfo` this will return
        an `ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `ObservationInfo` (the values
        will be `ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the ``__COMMON__`` key is missing from ``content``.

    Notes
    -----
    File keys are returned exactly as stored in the index; they are
    expected to be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Copy the input structure so we can update in place
    unpacked = deepcopy(content)

    content_mode = unpacked.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # The mode key is absent, so fall back to plain metadata. The test
        # must be on the popped mode, not on the input structure.
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        content_mode = "metadata"

    # The common headers will be copied into each header
    common = unpacked.pop(COMMON_KEY)

    for file in unpacked:
        unpacked[file].update(common)

    if content_mode == "metadata":
        # nothing more to be done
        return unpacked

    # Translated content: rebuild an ObservationInfo for every file and
    # remember which file it came from.
    obs_infos = []
    by_file = {}
    for file, hdr in unpacked.items():
        info = ObservationInfo.from_simple(hdr)
        info.filename = file
        obs_infos.append(info)
        by_file[file] = info

    if force_dict:
        return by_file
    return ObservationGroup(obs_infos)

251 

252 

def read_sidecar(path):
    """Load a JSON metadata sidecar file and convert it to its in-memory form.

    Parameters
    ----------
    path : `str`
        Location of the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        Whatever `process_sidecar_data` returns for the file content.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    if path.endswith(".json"):
        with open(path, "r") as stream:
            sidecar_data = json.load(stream)
        return process_sidecar_data(sidecar_data)

    raise ValueError(f"Sidecar files must be in .json format; got {path}")

274 

275 

def process_sidecar_data(content, force_metadata=False):
    """Process the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `ObservationInfo` or `dict`
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter. The
        returned `dict` never contains the ``__CONTENT__`` key.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Copy the input structure so we can update in place
    content = deepcopy(content)

    guessing = False
    content_mode = content.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # No mode was recorded. All ObservationInfo objects will have
        # observation_id and instrument so if they are there we can guess.
        guessing = True
        if "observation_id" in content and "instrument" in content:
            content_mode = "translated"
        else:
            content_mode = "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)

    if content_mode == "metadata":
        # nothing more to be done
        return content

    try:
        return ObservationInfo.from_simple(content)
    except Exception:
        if guessing:
            # We were guessing so seems like this is not ObservationInfo;
            # hand back the plain dict instead of failing.
            return content
        # The mode was stated explicitly, so the failure is a real error.
        raise