Coverage for python/astro_metadata_translator/indexing.py: 27%
128 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-20 03:54 -0700
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-20 03:54 -0700
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12"""Functions to support file indexing."""
14from __future__ import annotations
# Names exported by ``from <this module> import *``; every entry is a
# function defined in this module.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)
25import collections.abc
26import json
27import logging
28import os
29import sys
30from collections.abc import MutableMapping, Sequence
31from copy import deepcopy
32from typing import IO, Any, Literal, overload
34from .file_helpers import read_file_info
35from .headers import merge_headers
36from .observationGroup import ObservationGroup
37from .observationInfo import ObservationInfo
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Reserved index key under which the header content shared by every indexed
# file is stored.
COMMON_KEY = "__COMMON__"

# Reserved key recording which content mode ("translated" or "metadata") was
# used when the index or sidecar was written.
CONTENT_KEY = "__CONTENT__"
45def index_files(
46 files: Sequence[str],
47 root: str | None,
48 hdrnum: int,
49 print_trace: bool,
50 content: str,
51 outstream: IO = sys.stdout,
52 errstream: IO = sys.stderr,
53) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
54 """Create an index from the supplied files.
56 No file is written. The Python structure returned is suitable
57 for writing.
59 Parameters
60 ----------
61 files : iterable of `str`
62 Paths to the files to be indexed. They do not have to all be
63 in a single directory but all content will be indexed into a single
64 index.
65 root : `str`
66 Directory root that can be combined with each file (if the supplied)
67 file is relative. Will be ignored if `None`.
68 hdrnum : `int`
69 The HDU number to read. The primary header is always read and
70 merged with the header from this HDU.
71 print_trace : `bool`
72 If there is an error reading the file and this parameter is `True`,
73 a full traceback of the exception will be reported. If `False` prints
74 a one line summary of the error condition. If `None` the exception
75 will be allowed.
76 content : `str`
77 Form of data to write in index file. Options are:
78 ``translated`` (default) to write ObservationInfo to the index;
79 ``metadata`` to write native metadata headers to the index.
80 The index file is called ``{mode}_index.json``.
81 outstream : `io.StringIO`, optional
82 Output stream to use for standard messages. Defaults to `sys.stdout`.
83 errstream : `io.StringIO`, optional
84 Stream to send messages that would normally be sent to standard
85 error. Defaults to `sys.stderr`.
87 Returns
88 -------
89 file_index : `dict` of [`str`, `dict`]
90 The headers in form suitable for writing to an index. The keys will
91 be ``__COMMON__`` for shared content, ``__CONTENT_`` to record the
92 content mode used to construct the index, and paths to the files. The
93 paths will be the supplied paths and will not include any supplied
94 ``root``.
95 okay : `list` of `str`
96 All the files that were processed successfully.
97 failed : `list` of `str`
98 All the files that could not be processed. Will be empty if
99 ``print_trace`` is not `None`.
100 """
101 if content not in ("translated", "metadata"):
102 raise ValueError("Unrecognized mode {mode}")
104 failed: list[str] = []
105 okay: list[str] = []
107 content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {} # Mapping of path to file content
108 for file in sorted(files):
109 if root is not None:
110 path = os.path.join(root, file)
111 else:
112 path = file
113 simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
114 if simple is None:
115 failed.append(path)
116 continue
117 else:
118 okay.append(path)
120 # Store the information indexed by the filename within dir
121 # We may get a PropertyList here and can therefore not just
122 # assert Mapping for mypy. We therefore assert that it's not the
123 # other 2 options, which we were enforcing with the "simple" parameter
124 # in the call to read_file_info.
125 assert not isinstance(simple, (str, ObservationInfo))
126 content_by_file[file] = simple
128 output = calculate_index(content_by_file, content)
130 return output, okay, failed
133def calculate_index(
134 headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
135) -> MutableMapping[str, str | MutableMapping[str, Any]]:
136 """Calculate an index data structure from the supplied headers.
138 Parameters
139 ----------
140 headers : `dict` of [`str`, `dict`]
141 The headers indexed by filename.
142 content_mode : `str`
143 The mode associated with these headers. Not used other than to
144 store the information in the data structure for later use on
145 deserialization.
147 Returns
148 -------
149 index_ : `dict` of [`str`, `dict`]
150 The headers in form suitable for writing to an index.
151 """
152 if content_mode not in ("metadata", "translated"):
153 raise ValueError(f"Unrecognized mode for index creation: {content_mode}")
155 # Merge all the information into a primary plus diff
156 merged = merge_headers([hdr for hdr in headers.values()], mode="diff")
158 # For a single file it is possible that the merged contents
159 # are not a dict but are an LSST-style PropertyList. JSON needs
160 # dict though. mypy can't know about PropertyList so we must ignore
161 # the type error.
162 if not isinstance(merged, collections.abc.Mapping):
163 merged = dict(merged) # type: ignore
165 # The structure to write to file is intended to look like (in YAML):
166 # __COMMON__:
167 # KEY1: value1
168 # KEY2: value2
169 # FILE1:
170 # KEY3: value3a
171 # FILE2:
172 # KEY3: value3b
174 # if there was only one file there will not be a diff but we
175 # want it to look like there was.
176 diff_dict = merged.pop("__DIFF__", [dict()])
178 # Put the common headers first in the output.
179 # Store the mode so that we can work out how to read the file in
180 output: MutableMapping[str, str | MutableMapping[str, Any]] = {
181 CONTENT_KEY: content_mode,
182 COMMON_KEY: merged,
183 }
184 for file, diff in zip(headers, diff_dict):
185 output[file] = diff
187 return output
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False],
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


# Fallback so that calls omitting ``force_dict`` (or passing a plain ``bool``)
# still match an overload.
@overload
def read_index(
    path: str, *, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [ `str`, `dict` | `.ObservationInfo` ]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension or if the
        top-level JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON text is UTF-8 by specification, so decode it explicitly rather
    # than relying on the platform's locale encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)
@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Expand the data structure read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in a JSON index file, already converted to
        simple python form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        If `True` the per-file entries are returned as plain dicts even
        when the index records that it holds `.ObservationInfo` content.
        By default the returned form follows the recorded content mode.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        For a ``metadata`` index, a `dict` keyed by file path whose values
        are the stored per-file entries with the common information merged
        in. Otherwise an `.ObservationGroup` is returned, or, if
        ``force_dict`` is `True`, a `dict` of `.ObservationInfo` keyed by
        file path.

    Raises
    ------
    ValueError
        Raised if ``content`` has no ``__COMMON__`` key.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    per_file = deepcopy(content)

    mode = per_file.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"

    # Fold the shared headers back into every per-file entry.
    shared = per_file.pop(COMMON_KEY)
    for entry in per_file.values():
        entry.update(shared)

    if mode == "metadata":
        # Plain header dicts are already in their final form.
        return per_file

    infos: list[ObservationInfo] = []
    # Really MutableMapping[str, ObservationInfo], but mypy needs the
    # annotation to match the function's declared return type.
    infos_by_file: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for name, simple in per_file.items():
        obs_info = ObservationInfo.from_simple(simple)
        obs_info.filename = name
        infos.append(obs_info)
        infos_by_file[name] = obs_info

    return infos_by_file if force_dict else ObservationGroup(infos)
def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension or if the
        top-level JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON text is UTF-8 by specification, so decode it explicitly rather
    # than relying on the platform's locale encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)
@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]: ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Interpret the data structure read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in a JSON sidecar file, already converted to
        simple python form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        If `True` a `dict` is always returned, even when the sidecar
        records that it holds `.ObservationInfo` content. By default the
        returned form follows the recorded content mode.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        An `.ObservationInfo` when the sidecar holds translated content,
        otherwise a `dict`. When the content mode is not recorded it is
        guessed from the keys present, and a `dict` is returned if the
        guessed conversion to `.ObservationInfo` fails.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    payload = deepcopy(content)

    recorded_mode = payload.pop(CONTENT_KEY, None)
    guessed = False
    if force_metadata:
        mode = "metadata"
    elif recorded_mode is not None:
        mode = recorded_mode
    else:
        # Every ObservationInfo has observation_id and instrument, so their
        # presence is used to guess the mode.
        guessed = True
        looks_translated = "observation_id" in payload and "instrument" in payload
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # Plain metadata is already in its final form.
        return payload

    try:
        return ObservationInfo.from_simple(payload)
    except Exception as err:
        if not guessed:
            raise err
        # The mode was only a guess, so treat the content as plain metadata.
        return payload