Coverage for python / astro_metadata_translator / indexing.py: 21%
141 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-26 08:50 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-26 08:50 +0000
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12"""Functions to support file indexing."""
14from __future__ import annotations
16__all__ = (
17 "calculate_index",
18 "index_files",
19 "process_index_data",
20 "process_sidecar_data",
21 "read_index",
22 "read_sidecar",
23)
25import json
26import logging
27from collections.abc import MutableMapping, Sequence
28from copy import deepcopy
29from typing import IO, TYPE_CHECKING, Any, Literal, overload
31from lsst.resources import ResourcePath
33from .file_helpers import read_file_info
34from .headers import merge_headers
35from .observationGroup import ObservationGroup
36from .observationInfo import ObservationInfo
38if TYPE_CHECKING:
39 from lsst.resources import ResourcePathExpression
41log = logging.getLogger(__name__)
43COMMON_KEY = "__COMMON__"
44CONTENT_KEY = "__CONTENT__"
def index_files(
    files: Sequence[ResourcePathExpression],
    root: ResourcePathExpression | None,
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO | None = None,
) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
    """Build an index data structure describing the supplied files.

    Nothing is written to disk by this function; the returned structure
    is in a form suitable for writing.

    Parameters
    ----------
    files : iterable of `lsst.resources.ResourcePathExpression`
        Paths to the files to be indexed. They do not all have to be in
        a single directory but everything is combined into one index.
        The files are processed in sorted order.
    root : `lsst.resources.ResourcePathExpression` or `None`
        Directory root that is joined with each supplied file. Ignored
        if `None` (or otherwise false).
    hdrnum : `int`
        The HDU number to read; forwarded to `read_file_info`.
    print_trace : `bool`
        Forwarded to `read_file_info` to control how read errors are
        reported: `True` requests a full traceback and `False` a one line
        summary.
    content : `str`
        Form of data to store in the index. Options are:
        ``translated`` to store `ObservationInfo` content;
        ``metadata`` to store native metadata headers.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages; forwarded to
        `read_file_info`. Defaults to `None`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in a form suitable for writing to an index, as
        produced by `calculate_index`. The keys will be ``__COMMON__``
        for shared content, ``__CONTENT__`` to record the content mode
        used to construct the index, and one key per file. When ``root``
        is given and a file can be expressed relative to it, the key is
        that relative path.
    okay : `list` of `str`
        All the files that were processed successfully.
    failed : `list` of `str`
        All the files for which `read_file_info` returned `None`.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of the supported modes.
    """
    if content not in ("translated", "metadata"):
        raise ValueError(f"Unrecognized mode {content}")

    processed: list[str] = []
    unreadable: list[str] = []
    base_dir = ResourcePath(root, forceDirectory=True) if root else None

    # Simplified file content keyed by the name to use in the index.
    per_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for entry in sorted(files):
        location = ResourcePath(entry, forceAbsolute=False, forceDirectory=False)
        if base_dir is not None:
            location = base_dir.join(location)

        header = read_file_info(location, hdrnum, print_trace, content, "simple", outstream)
        reported_name = location.ospath if location.isLocal else str(location)
        if header is None:
            unreadable.append(reported_name)
            continue
        processed.append(reported_name)

        # The "simple" request above rules out str and ObservationInfo
        # results. Something that is not a plain Mapping can still come
        # back, so exclude the two impossible types rather than asserting
        # Mapping; this narrows the type for mypy.
        assert not isinstance(header, str | ObservationInfo)

        # Index keys should be relative to the root directory whenever that
        # is possible; otherwise fall back to the name already reported.
        index_key = reported_name
        if base_dir is not None:
            within_root = location.relative_to(base_dir)
            if within_root is not None:
                index_key = within_root
        per_file[index_key] = header

    return calculate_index(per_file, content), processed, unreadable
def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, str | MutableMapping[str, Any]]:
    """Turn per-file headers into an index data structure.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers indexed by filename.
    content_mode : `str`
        The mode associated with these headers (``metadata`` or
        ``translated``). It is only recorded in the output so that the
        index can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. In YAML
        notation the layout is::

            __CONTENT__: mode
            __COMMON__:
              KEY1: value1
              KEY2: value2
            FILE1:
              KEY3: value3a
            FILE2:
              KEY3: value3b

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    if content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Reduce everything to the shared content plus a per-file difference.
    combined = merge_headers(list(headers.values()), mode="diff")

    # The merged result is not guaranteed to be a plain dict but JSON
    # output needs one. Copy via items() because converting a header
    # object directly can bring along keys that can not be serialized.
    if not isinstance(combined, dict):
        combined = dict(combined.items())

    # With a single file there is no diff entry; substitute one empty
    # diff so the output has the same shape as the multi-file case.
    per_file_diffs = combined.pop("__DIFF__", [{}])

    # Mode and common headers go first, followed by one entry per file.
    index: MutableMapping[str, str | MutableMapping[str, Any]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: combined,
    }
    index.update(zip(headers, per_file_diffs, strict=True))
    return index
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def read_index(
    path: str,
    *,
    # The default lets a plain ``read_index(path)`` call match an overload,
    # mirroring the default of the implementation.
    force_dict: Literal[False] = ...,
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [ `str`, `dict` | `.ObservationInfo` ]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if the path does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 so that reading does not depend on the
    # locale encoding of the machine doing the reading.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)
@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Expand the data structure read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `.ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        If the index data referred to `.ObservationInfo` this will return
        an `.ObservationGroup`, otherwise a `dict` will be returned with the
        keys being the file keys stored in the index and the values being
        the keys and values stored for that file (with common information
        merged in). This can be overridden using the ``force_metadata``
        parameter. If ``force_dict`` is `True` a `dict` will be returned
        with filename keys even if the index refers to `.ObservationInfo`
        (the values will be `.ObservationInfo` unless ``force_metadata``
        is `True`).

    Raises
    ------
    ValueError
        Raised if the common key is missing, if the recorded content mode
        is not recognized, or if the common content or any file entry is
        not a mapping.

    Notes
    -----
    File keys are returned exactly as they appear in the supplied data.
    If no content mode is recorded, a warning is logged and ``metadata``
    is assumed.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so that entries can be updated in place
    # without touching the caller's data.
    working = deepcopy(content)

    # Decide how the entries are to be interpreted. An explicit request
    # for metadata wins over whatever mode was recorded in the index.
    stored_mode = working.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"
    elif stored_mode in ("metadata", "translated"):
        mode = stored_mode
    else:
        raise ValueError(f"Unrecognized mode '{stored_mode}' in index data structure.")

    # Everything left after removing the common block is a per-file entry.
    shared = working.pop(COMMON_KEY)
    if not isinstance(shared, MutableMapping):
        raise ValueError(
            f"Common index metadata stored in '{COMMON_KEY}' must be a mapping, not {type(shared)}."
        )

    # Fold the shared headers into every file entry. A key present in both
    # takes its value from the shared block.
    for name, entry in working.items():
        if not isinstance(entry, MutableMapping):
            raise ValueError(f"Index entry for file '{name}' must be a mapping, not {type(entry)}.")
        entry.update(shared)

    if mode == "metadata":
        # Plain header dictionaries need no further conversion.
        return working

    # Translated mode: rebuild an ObservationInfo for each file.
    translated: list[ObservationInfo] = []
    # The values here are always ObservationInfo but the annotation has to
    # match the declared return type to satisfy mypy.
    keyed: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for name, simple in working.items():
        obs_info = ObservationInfo.from_simple(simple)
        obs_info.filename = name
        translated.append(obs_info)
        keyed[name] = obs_info

    return keyed if force_dict else ObservationGroup(translated)
def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `dict`]
        The result of passing the file content to `process_sidecar_data`:
        if the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if the path does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 so that reading does not depend on the
    # locale encoding of the machine doing the reading.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)
@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]: ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Process the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `.ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter in which
        case a `dict` will always be returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    ValueError
        Raised if the content mode recorded in the data is not recognized.

    Notes
    -----
    If no content mode is recorded the mode is guessed and a warning is
    logged. When a guess of ``translated`` turns out to be wrong, because
    the `.ObservationInfo` can not be constructed, the plain `dict` is
    returned instead of raising.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Copy the input structure so we can update in place
    content = deepcopy(content)

    guessing = False
    content_mode = content.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # All ObservationInfo objects will have observation_id and instrument
        # so if they are there we can guess
        guessing = True
        if "observation_id" in content and "instrument" in content:
            content_mode = "translated"
        else:
            content_mode = "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)
    elif content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode '{content_mode}' in sidecar data structure.")

    if content_mode == "metadata":
        # nothing more to be done
        return content

    try:
        info = ObservationInfo.from_simple(content)
    except Exception:
        if guessing:
            # We were guessing so seems like this is not ObservationInfo
            return content
        # The mode was stated explicitly so this is a real failure.
        # Re-raise the active exception unchanged.
        raise

    return info