Coverage for python/astro_metadata_translator/indexing.py: 27%
136 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 10:39 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-20 10:39 +0000
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12"""Functions to support file indexing."""
14from __future__ import annotations
# Public API of this module: the names exported by a star-import.
# Every entry is a function defined in this file.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)
25import collections.abc
26import json
27import logging
28import os
29import sys
30from collections.abc import MutableMapping, Sequence
31from copy import deepcopy
32from typing import IO, Any, Literal, overload
34from .file_helpers import read_file_info
35from .headers import merge_headers
36from .observationGroup import ObservationGroup
37from .observationInfo import ObservationInfo
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Key in an index structure under which the header content shared by all
# indexed files is stored.
COMMON_KEY = "__COMMON__"
# Key under which the content mode ("translated" or "metadata") used to build
# the structure is recorded, so that readers know how to interpret it.
CONTENT_KEY = "__CONTENT__"
def index_files(
    files: Sequence[str],
    root: str | None,
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO = sys.stdout,
    errstream: IO = sys.stderr,
) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str` or `None`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read and
        merged with the header from this HDU (see ``read_file_info``, to
        which this value is forwarded unchanged).
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. Unlike the index
        keys, these paths include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed. Will be empty if
        ``print_trace`` is not `None`. These paths include ``root`` when one
        was supplied.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of ``translated`` or ``metadata``.
    """
    if content not in ("translated", "metadata"):
        # Report the offending value; the message previously contained a
        # literal, never-interpolated "{mode}" placeholder.
        raise ValueError(f"Unrecognized mode {content}")

    failed: list[str] = []
    okay: list[str] = []

    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}  # Mapping of path to file content
    for file in sorted(files):
        if root is not None:
            path = os.path.join(root, file)
        else:
            path = file
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            # A `None` result is treated as a failure to read this file.
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        # Keyed by the path as supplied by the caller, not the root-joined one.
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed
def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, str | MutableMapping[str, Any]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers, keyed by filename.
    content_mode : `str`
        The mode associated with these headers (``metadata`` or
        ``translated``). It is only recorded in the result so that the
        structure can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    if content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse every header into the shared content plus per-file differences.
    combined = merge_headers(list(headers.values()), mode="diff")

    # With a single file the merged result may be an LSST-style PropertyList
    # rather than a dict. JSON needs a dict, and mypy has no knowledge of
    # PropertyList, hence the ignored type error.
    if not isinstance(combined, collections.abc.Mapping):
        combined = dict(combined)  # type: ignore

    # Layout of the result (shown as YAML):
    #   __COMMON__:
    #     KEY1: value1
    #     KEY2: value2
    #   FILE1:
    #     KEY3: value3a
    #   FILE2:
    #     KEY3: value3b
    #
    # A single file produces no diff entry; substitute one empty diff so the
    # output has the same shape as the multi-file case.
    per_file_diffs = combined.pop("__DIFF__", [dict()])

    # The content mode and the shared headers lead the output; the mode is
    # what tells a reader how to interpret the file entries.
    index: MutableMapping[str, str | MutableMapping[str, Any]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: combined,
    }
    # Pair each filename with its diff, in header order.
    index.update(zip(headers, per_file_diffs))

    return index
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


# The default here mirrors the implementation so that a plain
# ``read_index(path)`` call matches an overload.
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False] = ...,
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [ `str`, `dict` | `.ObservationInfo` ]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding when decoding the file.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    # Valid JSON can be a list or scalar; only a mapping can be an index.
    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)
@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Expand the content of a JSON index file into per-file records.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `.ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `.ObservationInfo` this will return
        an `.ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `.ObservationInfo` (the values
        will be `.ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if ``content`` has no ``__COMMON__`` key.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    index = deepcopy(content)

    # Decide how the per-file entries are to be interpreted.
    stored_mode = index.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"
    else:
        mode = stored_mode

    # Fold the shared headers into every per-file header.
    shared = index.pop(COMMON_KEY)
    for per_file in index.values():
        per_file.update(shared)

    if mode == "metadata":
        # Plain dict content needs no further conversion.
        return index

    # The values here are really all ObservationInfo, but mypy needs the
    # annotation to look like the function's return type.
    by_file: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for name, simple in index.items():
        obs = ObservationInfo.from_simple(simple)
        obs.filename = name
        by_file[name] = obs

    if force_dict:
        return by_file
    # Keys are unique, so the values are exactly the converted entries in
    # their original order.
    return ObservationGroup(list(by_file.values()))
def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json`` or if the top-level
        JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding when decoding the file.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    # Valid JSON can be a list or scalar; only a mapping can be a sidecar.
    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)
@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]:
    ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]:
    ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Interpret the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `.ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter in which
        case a `dict` will always be returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    sidecar = deepcopy(content)

    stored_mode = sidecar.pop(CONTENT_KEY, None)
    guessed = False
    if force_metadata:
        mode = "metadata"
    elif stored_mode is not None:
        mode = stored_mode
    else:
        # No recorded mode. Every ObservationInfo carries observation_id and
        # instrument, so their presence is used as the hint.
        guessed = True
        looks_translated = "observation_id" in sidecar and "instrument" in sidecar
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # Plain dict content needs no further conversion.
        return sidecar

    try:
        return ObservationInfo.from_simple(sidecar)
    except Exception as err:
        if not guessed:
            raise err
        # The mode was only a guess, and evidently a wrong one: hand back
        # the plain dict instead of failing.
        return sidecar