Coverage for python/astro_metadata_translator/indexing.py: 21%
136 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-27 02:38 -0700
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-27 02:38 -0700
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
# Public API of this module; these are the names exported by
# ``from ... import *``.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)

# NOTE(review): this string appears after other statements, so Python
# treats it as a plain expression and does not bind it to ``__doc__``.
"""Functions to support file indexing."""
25import collections.abc
26import json
27import logging
28import os
29import sys
30from collections.abc import MutableMapping, Sequence
31from copy import deepcopy
32from typing import IO, Any, Literal, overload
34from .file_helpers import read_file_info
35from .headers import merge_headers
36from .observationGroup import ObservationGroup
37from .observationInfo import ObservationInfo
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Key in an index structure holding the header content shared by all files.
COMMON_KEY = "__COMMON__"
# Key in an index/sidecar structure recording the content mode
# ("translated" or "metadata") that was used to build it.
CONTENT_KEY = "__CONTENT__"
def index_files(
    files: Sequence[str],
    root: str | None,
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO = sys.stdout,
    errstream: IO = sys.stderr,
) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str` or `None`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read (the primary header is always read as
        well). Forwarded unchanged to `read_file_info`.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. Unlike the index
        keys, these paths include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files for which `read_file_info` returned `None`. These
        paths also include ``root`` when one was supplied.

    Raises
    ------
    ValueError
        Raised if ``content`` is not ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        raise ValueError(f"Unrecognized mode {content}")

    failed: list[str] = []
    okay: list[str] = []

    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}  # Mapping of path to file content
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed
def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, str | MutableMapping[str, Any]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        Header content keyed by filename.
    content_mode : `str`
        Either ``metadata`` or ``translated``. It is only recorded in the
        result so that a reader knows how to interpret the index.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        Structure suitable for writing to an index file. Schematically
        (in YAML)::

            __CONTENT__: mode
            __COMMON__:
              KEY1: value1
              KEY2: value2
            FILE1:
              KEY3: value3a
            FILE2:
              KEY3: value3b

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    valid_modes = ("metadata", "translated")
    if content_mode not in valid_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Split the headers into the shared part plus one diff per file.
    common = merge_headers(list(headers.values()), mode="diff")

    # With a single file the merge result may be an LSST-style PropertyList
    # rather than a dict. JSON needs a dict; mypy does not know about
    # PropertyList, hence the ignore.
    if not isinstance(common, collections.abc.Mapping):
        common = dict(common)  # type: ignore

    # A single file produces no diff entry, so substitute one empty diff to
    # keep the output shape uniform.
    per_file_diffs = common.pop("__DIFF__", [{}])

    # Content mode and common headers come first, then one entry per file.
    index: MutableMapping[str, str | MutableMapping[str, Any]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: common,
    }
    index.update(zip(headers, per_file_diffs))

    return index
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


# The default here mirrors the implementation so that a call which omits
# ``force_dict`` matches an overload.
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False] = ...,
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict` [`str`, `dict` | `ObservationInfo`]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)
@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Expand the content of a JSON index file into per-file entries.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        If `True` the index is treated as plain ``metadata`` content
        regardless of the mode recorded in it, so simple dicts are returned
        even for an index of `ObservationInfo`.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        For ``metadata`` content, a `dict` keyed by file path whose values
        are the per-file entries with the common information merged in.
        Otherwise each entry is converted with
        `ObservationInfo.from_simple` and an `ObservationGroup` is returned,
        or, when ``force_dict`` is `True`, a `dict` of `ObservationInfo`
        keyed by file path.

    Raises
    ------
    ValueError
        Raised if ``content`` has no ``__COMMON__`` key.

    Notes
    -----
    File keys are returned exactly as stored in the index; they are
    expected to be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Operate on a private copy so the caller's structure stays untouched.
    per_file = deepcopy(content)

    stored_mode = per_file.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"
    else:
        mode = stored_mode

    # Fold the shared headers into every per-file entry.
    shared = per_file.pop(COMMON_KEY)
    for header in per_file.values():
        header.update(shared)

    if mode == "metadata":
        # Plain header dicts need no further conversion.
        return per_file

    obs_infos: list[ObservationInfo] = []
    for name, header in per_file.items():
        info = ObservationInfo.from_simple(header)
        info.filename = name
        obs_infos.append(info)

    if not force_dict:
        return ObservationGroup(obs_infos)

    # The values are really all ObservationInfo, but mypy needs the
    # annotation to look like the function return value.
    by_file: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = dict(
        zip(per_file, obs_infos)
    )
    return by_file
def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `Any`]
        The return content matches that returned by `process_sidecar_data`:
        if the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)
@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]:
    ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Interpret the content of a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        If `True` the sidecar is treated as plain ``metadata`` content
        regardless of the mode recorded in it, so a `dict` is always
        returned.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `Any`]
        A `dict` for ``metadata`` content, otherwise the result of
        `ObservationInfo.from_simple`. When no content mode is recorded the
        mode is guessed, and if the guessed conversion fails the `dict` is
        returned instead.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Operate on a private copy so the caller's structure stays untouched.
    sidecar = deepcopy(content)
    stored_mode = sidecar.pop(CONTENT_KEY, None)

    guessed = False
    if force_metadata:
        mode = "metadata"
    elif stored_mode is not None:
        mode = stored_mode
    else:
        # No recorded mode. Every ObservationInfo has observation_id and
        # instrument, so their presence is taken to mean translated content.
        guessed = True
        looks_translated = "observation_id" in sidecar and "instrument" in sidecar
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # Plain header dict needs no further conversion.
        return sidecar

    try:
        return ObservationInfo.from_simple(sidecar)
    except Exception as err:
        if not guessed:
            raise err
        # The mode was only a guess, so fall back to the raw dict.
        return sidecar