Coverage for python/astro_metadata_translator/indexing.py: 21%
135 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-03-30 02:35 -0700
« prev ^ index » next coverage.py v6.5.0, created at 2023-03-30 02:35 -0700
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
"""Functions to support file indexing."""

from __future__ import annotations

# Public API of this module.  The docstring above must be the first
# statement in the file (only comments may precede it) for it to be
# picked up as the module's ``__doc__``.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)
25import collections.abc
26import json
27import logging
28import os
29import sys
30from copy import deepcopy
31from typing import IO, Any, List, Literal, MutableMapping, Optional, Sequence, Tuple, Union, overload
33from .file_helpers import read_file_info
34from .headers import merge_headers
35from .observationGroup import ObservationGroup
36from .observationInfo import ObservationInfo
# Reserved top-level keys of the index data structure.  ``COMMON_KEY`` holds
# the headers shared by every indexed file and ``CONTENT_KEY`` records the
# content mode that was used when the index was built.
COMMON_KEY = "__COMMON__"
CONTENT_KEY = "__CONTENT__"

# Module-level logger.
log = logging.getLogger(__name__)
def index_files(
    files: Sequence[str],
    root: Optional[str],
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO = sys.stdout,
    errstream: IO = sys.stderr,
) -> Tuple[MutableMapping[str, Union[str, MutableMapping[str, Any]]], List[str], List[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read as well;
        this value is forwarded unchanged to ``read_file_info``.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed. Forwarded unchanged to ``read_file_info``.
    content : `str`
        Form of data to store in the index. Options are:
        ``translated`` to store ObservationInfo in the index;
        ``metadata`` to store native metadata headers in the index.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. These paths include
        ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed. These paths include
        ``root`` when one was supplied. Expected to be empty when
        ``print_trace`` is `None`, since a read error is then allowed to
        propagate as an exception instead of being recorded here.

    Raises
    ------
    ValueError
        Raised if ``content`` is not ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        raise ValueError(f"Unrecognized mode {content}")

    failed: List[str] = []
    okay: List[str] = []

    # Mapping of path (as supplied by the caller, without root) to content.
    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)

        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            # A `None` result is treated as a failed read.
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed
def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, Union[str, MutableMapping[str, Any]]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers, keyed by filename.
    content_mode : `str`
        The mode these headers were read with (``metadata`` or
        ``translated``). It is only recorded in the result so that the
        index can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    valid_modes = ("metadata", "translated")
    if content_mode not in valid_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse all headers into the shared content plus per-file differences.
    combined = merge_headers(list(headers.values()), mode="diff")

    # With a single file the merged result may be an LSST-style PropertyList
    # rather than a dict, but JSON needs a dict. mypy has no knowledge of
    # PropertyList, hence the ignored type error.
    if not isinstance(combined, collections.abc.Mapping):
        combined = dict(combined)  # type: ignore

    # Layout of the result (shown as YAML):
    # __COMMON__:
    #   KEY1: value1
    #   KEY2: value2
    # FILE1:
    #   KEY3: value3a
    # FILE2:
    #   KEY3: value3b

    # A single file yields no diff entry; substitute one empty diff so the
    # result has the same shape as the multi-file case.
    per_file_diffs = combined.pop("__DIFF__", [{}])

    # The content mode and the common headers go first; the mode is kept so
    # a reader can work out how to interpret the file.
    index: MutableMapping[str, Union[str, MutableMapping[str, Any]]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: combined,
    }
    index.update(zip(headers, per_file_diffs))

    return index
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]:
    ...


# ``force_dict`` carries a default here so that a plain ``read_index(path)``
# call matches an overload, mirroring the default on the implementation.
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False] = ...,
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    ...


def read_index(
    path: str, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict[str, Union[dict, ObservationInfo]]`
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(path, "r", encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)
@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]]]:
    """Expand the data structure read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `ObservationInfo` this will return
        an `ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `ObservationInfo` (the values
        will be `ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if ``content`` has no ``__COMMON__`` key.

    Notes
    -----
    File keys are returned exactly as stored in the index; the index is
    described as holding them relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so that the in-place updates below never touch
    # the caller's structure.
    unpacked = deepcopy(content)

    stored_mode = unpacked.pop(CONTENT_KEY, None)
    if not force_metadata and stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
    content_mode = "metadata" if (force_metadata or stored_mode is None) else stored_mode

    # Fold the shared headers into every per-file entry.
    shared = unpacked.pop(COMMON_KEY)
    for per_file in unpacked.values():
        per_file.update(shared)

    if content_mode == "metadata":
        # The merged dicts are already the final answer.
        return unpacked

    # Convert each entry to an ObservationInfo, keeping both an ordered list
    # (for the group) and a by-filename mapping. The mapping really holds
    # only ObservationInfo values, but mypy needs the annotation to look
    # like the function return type.
    infos: List[ObservationInfo] = []
    info_by_file: MutableMapping[str, Union[MutableMapping[str, Any], ObservationInfo]] = {}
    for name, simple in unpacked.items():
        obs = ObservationInfo.from_simple(simple)
        obs.filename = name
        infos.append(obs)
        info_by_file[name] = obs

    return info_by_file if force_dict else ObservationGroup(infos)
def read_sidecar(path: str) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `Any`]
        The return content matches that returned by `process_sidecar_data`:
        if the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(path, "r", encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)
@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> Union[ObservationInfo, MutableMapping[str, Any]]:
    """Interpret the data structure read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `Any`]
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter in which
        case a `dict` will always be returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Take a private copy so that removing the mode key leaves the caller's
    # structure untouched.
    content = deepcopy(content)

    stored_mode = content.pop(CONTENT_KEY, None)

    # The mode has to be guessed only when it is neither forced nor recorded.
    guessing = not force_metadata and stored_mode is None

    if force_metadata:
        content_mode = "metadata"
    elif guessing:
        # All ObservationInfo objects will have observation_id and instrument
        # so their presence is taken to mean translated content.
        looks_translated = "observation_id" in content and "instrument" in content
        content_mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)
    else:
        content_mode = stored_mode

    if content_mode == "metadata":
        # Plain metadata needs no further processing.
        return content

    try:
        return ObservationInfo.from_simple(content)
    except Exception:
        if not guessing:
            raise
        # The mode was only a guess, so the content is evidently not an
        # ObservationInfo after all; hand back the plain dict.
        return content