Coverage for python/astro_metadata_translator/indexing.py: 27%
127 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-28 02:59 -0700
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-28 02:59 -0700
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12"""Functions to support file indexing."""
14from __future__ import annotations
# Public API of this module; these are the only names exported by a
# star-import.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)
25import collections.abc
26import json
27import logging
28import os
29from collections.abc import MutableMapping, Sequence
30from copy import deepcopy
31from typing import IO, Any, Literal, overload
33from .file_helpers import read_file_info
34from .headers import merge_headers
35from .observationGroup import ObservationGroup
36from .observationInfo import ObservationInfo
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Reserved top-level keys of the index data structure:
# COMMON_KEY holds the headers shared by every indexed file and
# CONTENT_KEY records the content mode used to build the index.
COMMON_KEY = "__COMMON__"
CONTENT_KEY = "__CONTENT__"
def index_files(
    files: Sequence[str],
    root: str | None,
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO | None = None,
) -> tuple[MutableMapping[str, str | MutableMapping[str, Any]], list[str], list[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str` or `None`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read and
        merged with the header from this HDU.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `None` which
        uses the default output stream.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. Unlike the index
        keys, these paths include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed, i.e. those for which
        reading the file information returned `None`. These paths include
        ``root`` when one was supplied.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of the supported modes.
    """
    if content not in ("translated", "metadata"):
        # Report the value that was actually supplied.
        raise ValueError(f"Unrecognized mode {content}")

    failed: list[str] = []
    okay: list[str] = []

    # Mapping of the path as supplied by the caller to the file content.
    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream)
        if simple is None:
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed
def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, str | MutableMapping[str, Any]]:
    """Calculate an index data structure from the supplied headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers indexed by filename.
    content_mode : `str`
        The mode associated with these headers. Not used other than to
        store the information in the data structure for later use on
        deserialization. Must be ``metadata`` or ``translated``.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    if content_mode not in ("metadata", "translated"):
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Merge all the information into a primary plus diff
    merged = merge_headers(list(headers.values()), mode="diff")

    # For a single file it is possible that the merged contents
    # are not a dict but are an LSST-style PropertyList. JSON needs
    # dict though. mypy can't know about PropertyList so we must ignore
    # the type error.
    if not isinstance(merged, collections.abc.Mapping):
        merged = dict(merged)  # type: ignore

    # The structure to write to file is intended to look like (in YAML):
    # __CONTENT__: mode
    # __COMMON__:
    #   KEY1: value1
    #   KEY2: value2
    # FILE1:
    #   KEY3: value3a
    # FILE2:
    #   KEY3: value3b

    # if there was only one file there will not be a diff but we
    # want it to look like there was.
    diff_dict = merged.pop("__DIFF__", [dict()])

    # Put the common headers first in the output.
    # Store the mode so that we can work out how to read the file in
    output: MutableMapping[str, str | MutableMapping[str, Any]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: merged,
    }
    # The per-file differences are paired with the filenames by position.
    for file, diff in zip(headers, diff_dict):
        output[file] = diff

    return output
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


# The default (``force_dict`` omitted) is covered by this overload so that a
# plain ``read_index(path)`` call type-checks.
@overload
def read_index(
    path: str,
    *,
    force_dict: Literal[False] = ...,
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def read_index(
    path: str, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `.ObservationGroup` or `dict` [ `str`, `dict` | `.ObservationInfo` ]
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if the path does not have a ``.json`` extension or if the
        top-level JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    # JSON permits any top-level value; an index has to be a mapping.
    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_index_data(content, force_dict=force_dict)
@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[True],
    force_dict: Literal[False],
) -> MutableMapping[str, Any]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any],
    *,
    force_metadata: Literal[False],
    force_dict: Literal[True],
) -> MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


@overload
def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]: ...


def process_index_data(
    content: MutableMapping[str, Any], *, force_metadata: bool = False, force_dict: bool = False
) -> ObservationGroup | MutableMapping[str, MutableMapping[str, Any] | ObservationInfo]:
    """Process the content read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is copied before processing and is not modified.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `.ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `.ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `.ObservationInfo` this will return
        an `.ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `.ObservationInfo` (the values
        will be `.ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the common-content key is missing from ``content``.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Copy the input structure so we can update in place
    unpacked = deepcopy(content)

    content_mode = unpacked.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        content_mode = "metadata"

    # The common headers will be copied into each header.
    # After the two reserved keys are popped only per-file entries remain.
    common = unpacked.pop(COMMON_KEY)
    for hdr in unpacked.values():
        hdr.update(common)

    if content_mode == "metadata":
        # nothing more to be done
        return unpacked

    obs_infos: list[ObservationInfo] = []
    # This type annotation is really MutableMapping[str, ObservationInfo]
    # but mypy needs it to look like the function return value.
    by_file: MutableMapping[str, MutableMapping[str, Any] | ObservationInfo] = {}
    for file, hdr in unpacked.items():
        info = ObservationInfo.from_simple(hdr)
        # Record which file this ObservationInfo came from.
        info.filename = file
        obs_infos.append(info)
        by_file[file] = info

    if force_dict:
        return by_file
    return ObservationGroup(obs_infos)
def read_sidecar(path: str) -> ObservationInfo | MutableMapping[str, Any]:
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if the path does not have a ``.json`` extension or if the
        top-level JSON value in the file is not a mapping.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(path, encoding="utf-8") as fd:
        content: MutableMapping[str, Any] = json.load(fd)

    # JSON permits any top-level value; a sidecar has to be a mapping.
    if not isinstance(content, MutableMapping):
        raise ValueError(f"The content of the JSON file is {type(content)} and not a dict.")

    return process_sidecar_data(content)
@overload
def process_sidecar_data(
    content: MutableMapping[str, Any],
) -> ObservationInfo | MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[True]
) -> MutableMapping[str, Any]: ...


@overload
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: Literal[False]
) -> ObservationInfo | MutableMapping[str, Any]: ...


def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> ObservationInfo | MutableMapping[str, Any]:
    """Process the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is copied before processing and is not modified.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `.ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `.ObservationInfo` or `dict` of [`str`, `~typing.Any`]
        If the sidecar file referred to `.ObservationInfo` this will return
        an `.ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter in which
        case a `dict` will always be returned.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Copy the input structure so we can update in place
    content = deepcopy(content)

    guessing = False
    content_mode = content.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # All ObservationInfo objects will have observation_id and instrument
        # so if they are there we can guess
        guessing = True
        if "observation_id" in content and "instrument" in content:
            content_mode = "translated"
        else:
            content_mode = "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)

    if content_mode == "metadata":
        # nothing more to be done
        return content

    try:
        info = ObservationInfo.from_simple(content)
    except Exception:
        if guessing:
            # We were guessing so seems like this is not ObservationInfo
            return content
        # The mode was stated explicitly, so the failure is a real error;
        # re-raise it unchanged.
        raise

    return info