Coverage for python / astro_metadata_translator / indexing.py: 21%

141 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 08:43 +0000

1# This file is part of astro_metadata_translator. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the LICENSE file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12"""Functions to support file indexing.""" 

13 

14from __future__ import annotations 

15 

16__all__ = ( 

17 "calculate_index", 

18 "index_files", 

19 "process_index_data", 

20 "process_sidecar_data", 

21 "read_index", 

22 "read_sidecar", 

23) 

24 

25import json 

26import logging 

27from collections.abc import MutableMapping, Sequence 

28from copy import deepcopy 

29from typing import IO, TYPE_CHECKING, Any, Literal, overload 

30 

31from lsst.resources import ResourcePath 

32 

33from .file_helpers import read_file_info 

34from .headers import merge_headers 

35from .observationGroup import ObservationGroup 

36from .observationInfo import ObservationInfo 

37 

38if TYPE_CHECKING: 

39 from lsst.resources import ResourcePathExpression 

40 

41log = logging.getLogger(__name__) 

42 

43COMMON_KEY = "__COMMON__" 

44CONTENT_KEY = "__CONTENT__" 

45 

46 

def index_files(
    files: Sequence[ResourcePathExpression],
    root: ResourcePathExpression | None,
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO | None = None,
) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
    """Build an index data structure describing the supplied files.

    Nothing is written to disk. The structure that is returned is ready
    to be serialized by the caller.

    Parameters
    ----------
    files : iterable of `lsst.resources.ResourcePathExpression`
        Locations of the files to index. They are processed in sorted order
        and need not share a directory; everything ends up in one index.
    root : `lsst.resources.ResourcePathExpression` or `None`
        Directory that each entry of ``files`` is joined to before being
        read. Ignored if `None` (or otherwise false).
    hdrnum : `int`
        The HDU number to read. The primary header is always read and
        merged with the header from this HDU.
    print_trace : `bool`
        Forwarded to the file reader. If `True` a full traceback is reported
        when a file cannot be read, if `False` a one line summary is
        reported. If `None` the exception is allowed to propagate.
    content : `str`
        Form of the data stored in the index. Must be one of:
        ``translated`` to store `.ObservationInfo` content;
        ``metadata`` to store the native metadata headers.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `None` which
        uses the default output stream.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in a form suitable for writing to an index. The keys are
        ``__COMMON__`` for shared content, ``__CONTENT__`` for the content
        mode used to construct the index, and one key per indexed file.
        When ``root`` is given the file keys are relative to it whenever a
        relative form can be computed.
    okay : `list` of `str`
        All the files that were processed successfully, as local file
        system paths for local resources and URI strings otherwise.
    failed : `list` of `str`
        All the files that could not be processed, in the same form as
        ``okay``. Expected to be empty when ``print_trace`` is `None`
        because read errors then propagate instead of being recorded.

    Raises
    ------
    ValueError
        Raised if ``content`` is not a recognized mode.
    """
    if content not in ("translated", "metadata"):
        raise ValueError(f"Unrecognized mode {content}")

    okay: list[str] = []
    failed: list[str] = []
    root_uri = ResourcePath(root, forceDirectory=True) if root else None

    # Simplified content of every readable file, keyed by its index key.
    per_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for expression in sorted(files):
        uri = ResourcePath(expression, forceAbsolute=False, forceDirectory=False)
        path = uri if root_uri is None else root_uri.join(uri)
        file_info = read_file_info(path, hdrnum, print_trace, content, "simple", outstream)
        path_key = path.ospath if path.isLocal else str(path)
        if file_info is None:
            failed.append(path_key)
            continue
        okay.append(path_key)

        # The "simple" request above rules out str and ObservationInfo
        # results. A PropertyList is still possible, so asserting Mapping
        # would be wrong; exclude the two impossible types instead to keep
        # mypy satisfied.
        assert not isinstance(file_info, str | ObservationInfo)

        # Prefer a key relative to the supplied root; fall back to the
        # full path when no relative form exists.
        if root_uri is not None:
            relative = path.relative_to(root_uri)
            if relative is not None:
                path_key = relative
        per_file[path_key] = file_info

    return calculate_index(per_file, content), okay, failed

139 

140 

def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, str | MutableMapping[str, Any]]:
    """Convert per-file headers into an index data structure.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers indexed by filename.
    content_mode : `str`
        The mode associated with these headers (``metadata`` or
        ``translated``). It is only recorded in the result so that the
        index can be interpreted on deserialization.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. Laid out as
        (in YAML)::

            __CONTENT__: mode
            __COMMON__:
              KEY1: value1
              KEY2: value2
            FILE1:
              KEY3: value3a
            FILE2:
              KEY3: value3b

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    if content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Reduce everything to the shared content plus a per-file difference.
    merged = merge_headers(list(headers.values()), mode="diff")

    # With a single file the merge result may be an LSST-style PropertyList
    # or an Astropy Header rather than a dict, but JSON requires a dict.
    # Build it from items() because dict(Header) brings along additional
    # keys that can not be serialized.
    if not isinstance(merged, dict):
        merged = dict(merged.items())

    # A single file yields no diff entry; pretend there was an empty one so
    # that the output has the same shape in every case.
    per_file_diffs = merged.pop("__DIFF__", [{}])

    # The content mode and the common headers lead the output, followed by
    # one entry per file in the iteration order of ``headers``.
    output: MutableMapping[str, str | MutableMapping[str, Any]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: merged,
    }
    output.update(zip(headers, per_file_diffs, strict=True))
    return output

197 

198 

@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False],
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


# General form: covers the default call ``read_index(path)``, a positional
# ``force_dict`` and a non-literal `bool`, none of which match the
# keyword-only literal overloads above.
@overload
def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [ `str`, `dict` | `.ObservationInfo` ]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON text is UTF-8; be explicit rather than inheriting whatever the
    # locale default encoding happens to be on this platform.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)

244 

245 

@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Expand the content of a JSON index into per-file information.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `.ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        If the index referred to `.ObservationInfo` an `.ObservationGroup`
        is returned, otherwise a `dict` is returned whose keys are the file
        keys stored in the index and whose values are the keys and values
        stored for that file with the common information merged in. Use
        ``force_metadata`` to always get the latter. If ``force_dict`` is
        `True` a `dict` with filename keys is returned even if the index
        refers to `.ObservationInfo` (the values will be `.ObservationInfo`
        unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the common-content key is missing, if the recorded
        content mode is not recognized (and not overridden by
        ``force_metadata``), or if the common content or any file entry is
        not a mapping.

    Notes
    -----
    File keys are returned exactly as stored in the index; they are
    expected to be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    unpacked = deepcopy(content)

    # Decide how the entries are to be interpreted. An explicit request for
    # metadata wins over whatever the index recorded.
    content_mode = unpacked.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        content_mode = "metadata"
    elif content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode '{content_mode}' in index data structure.")

    shared = unpacked.pop(COMMON_KEY)
    if not isinstance(shared, MutableMapping):
        raise ValueError(
            f"Common index metadata stored in '{COMMON_KEY}' must be a mapping, not {type(shared)}."
        )

    # Everything left is a per-file entry; fold the shared content into
    # each one.
    for file, entry in unpacked.items():
        if not isinstance(entry, MutableMapping):
            raise ValueError(f"Index entry for file '{file}' must be a mapping, not {type(entry)}.")
        entry.update(shared)

    if content_mode == "metadata":
        # Plain dict content needs no further conversion.
        return unpacked

    # The values are really all ObservationInfo but mypy needs the
    # annotation to look like the function return value.
    by_file: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for file, simple in unpacked.items():
        info = ObservationInfo.from_simple(simple)
        info.filename = file
        by_file[file] = info

    if force_dict:
        return by_file
    return ObservationGroup(list(by_file.values()))

349 

350 

def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `dict`]
        The result of passing the decoded JSON to `process_sidecar_data`:
        an `.ObservationInfo` if the sidecar file referred to one,
        otherwise a `dict`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON text is UTF-8; be explicit rather than inheriting whatever the
    # locale default encoding happens to be on this platform.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)

375 

376 

@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]: ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Interpret the content of a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `.ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter in which
        case a `dict` will always be returned. The `dict` that is returned
        no longer contains the content-mode key.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    ValueError
        Raised if the recorded content mode is not recognized (and not
        overridden by ``force_metadata``).
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    content = deepcopy(content)

    # Set only when the mode has to be inferred from the keys present.
    guessing = False
    content_mode = content.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # Every ObservationInfo has observation_id and instrument, so the
        # presence of both is taken as a hint that this is translated
        # content.
        guessing = True
        looks_translated = "observation_id" in content and "instrument" in content
        content_mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)
    elif content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode '{content_mode}' in sidecar data structure.")

    if content_mode == "metadata":
        # Plain dict content needs no further conversion.
        return content

    try:
        return ObservationInfo.from_simple(content)
    except Exception as e:
        if not guessing:
            raise e
        # The mode was only a guess and the conversion failed, so treat
        # the content as ordinary metadata after all.
        return content