Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

"""Functions to support file indexing."""

# The string above must be the first statement in the module (only comments
# may precede it) for Python to treat it as the module docstring.

# Public API of this module.  The sidecar helpers are public, documented
# functions defined in this file and are therefore exported alongside the
# index helpers.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)

15 

16import json 

17import logging 

18import os 

19import sys 

20from copy import deepcopy 

21 

22from .observationInfo import ObservationInfo 

23from .observationGroup import ObservationGroup 

24from .headers import merge_headers 

25from .file_helpers import read_file_info 

26 

# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Key in an index structure under which the headers shared by all files
# are stored.
COMMON_KEY = "__COMMON__"

# Key in an index structure recording the content mode that was used to
# build it.
CONTENT_KEY = "__CONTENT__"

31 

32 

def index_files(files, root, hdrnum, print_trace, content, outstream=sys.stdout, errstream=sys.stderr):
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str`
        Directory root that is joined with each supplied file path (which
        matters if the supplied file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read as well.
        Passed through unchanged to `read_file_info`.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. Unlike the index
        keys, these paths include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed (``root`` included when
        supplied). When ``print_trace`` is `None` read errors are expected
        to be raised rather than recorded here.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        # Report the offending value; the message must be an f-string that
        # refers to the parameter that was actually supplied.
        raise ValueError(f"Unrecognized mode {content}")

    failed = []
    okay = []

    content_by_file = {}  # Mapping of supplied path to file content
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)

        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            # A None result signals that the file could not be read.
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename as supplied,
        # without the root, so the index stays relative.
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed

105 

106 

def calculate_index(headers, content_mode):
    """Build the index data structure for a collection of headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        Mapping of filename to the header content for that file.
    content_mode : `str`
        Mode these headers were produced in (``metadata`` or
        ``translated``). It is only recorded in the result so that the
        index can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        Index structure ready to be serialized. Conceptually (in YAML)::

            __COMMON__:
              KEY1: value1
              KEY2: value2
            FILE1:
              KEY3: value3a
            FILE2:
              KEY3: value3b

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    known_modes = ("metadata", "translated")
    if content_mode not in known_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse everything into the shared values plus per-file differences.
    shared = merge_headers(headers.values(), mode="diff")

    # If no diff entry is present (the single-file case) substitute one
    # empty diff so the result has the same shape either way.
    per_file = shared.pop("__DIFF__", [dict()])

    # Mode and common block go in first, followed by one entry per file.
    index = {CONTENT_KEY: content_mode, COMMON_KEY: shared}
    index.update(zip(headers, per_file))
    return index

150 

151 

def read_index(path, force_dict=False):
    """Load a JSON index file and convert it to its in-memory form.

    Parameters
    ----------
    path : `str`
        Location of the index file. The name must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the result is always a `dict` keyed by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict` [`str`, `dict` or `ObservationInfo`]
        Whatever `process_index_data` returns for the file content.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    is_json = path.endswith(".json")
    if not is_json:
        raise ValueError(f"Index files must be in .json format; got {path}")

    with open(path, "r") as stream:
        raw = json.load(stream)

    return process_index_data(raw, force_dict=force_dict)

175 

176 

def process_index_data(content, force_metadata=False, force_dict=False):
    """Process the content read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `ObservationInfo` this will return
        an `ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `ObservationInfo` (the values
        will be `ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the common-headers key is missing from ``content``.

    Notes
    -----
    File keys will be relative to the location of the index file.

    If the structure does not record its content mode, a warning is logged
    and the content is treated as ``metadata``.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Copy the input structure so we can update in place
    unpacked = deepcopy(content)

    content_mode = unpacked.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # The check must be on the mode that was popped, not on the input
        # structure (which can never be None at this point).
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        content_mode = "metadata"

    # The common headers will be copied into each header
    common = unpacked.pop(COMMON_KEY)

    for hdr in unpacked.values():
        hdr.update(common)

    if content_mode == "metadata":
        # nothing more to be done
        return unpacked

    # Translated content: rebuild an ObservationInfo for every file and
    # record which file it came from.
    by_file = {}
    for file, hdr in unpacked.items():
        info = ObservationInfo.from_simple(hdr)
        info.filename = file
        by_file[file] = info

    if force_dict:
        return by_file
    return ObservationGroup(list(by_file.values()))

244 

245 

def read_sidecar(path):
    """Load a JSON metadata sidecar file and convert it to in-memory form.

    Parameters
    ----------
    path : `str`
        Location of the sidecar file. The name must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict`
        Whatever `process_sidecar_data` returns for the file content.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    is_json = path.endswith(".json")
    if not is_json:
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    with open(path, "r") as stream:
        raw = json.load(stream)

    return process_sidecar_data(raw)

267 

268 

def process_sidecar_data(content, force_metadata=False):
    """Process the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `ObservationInfo` or `dict`
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.

    Notes
    -----
    If the structure does not record its content mode the mode is guessed:
    content holding both ``observation_id`` and ``instrument`` keys is
    assumed to be ``translated``, anything else ``metadata``. A warning is
    logged, and if a guessed ``translated`` structure cannot be converted
    to an `ObservationInfo` the plain `dict` is returned instead.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Copy the input structure so we can update in place
    content = deepcopy(content)

    guessing = False
    content_mode = content.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # No mode was recorded (test the popped mode, not the content
        # itself, which is always a dict here).
        # All ObservationInfo objects will have observation_id and instrument
        # so if they are there we can guess
        guessing = True
        if "observation_id" in content and "instrument" in content:
            content_mode = "translated"
        else:
            content_mode = "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)

    if content_mode == "metadata":
        # nothing more to be done
        return content

    try:
        info = ObservationInfo.from_simple(content)
    except Exception:
        if guessing:
            # We were guessing so seems like this is not ObservationInfo
            return content
        # The mode was explicit so this is a real failure; re-raise with
        # the original traceback intact.
        raise

    return info