Coverage for python/astro_metadata_translator/indexing.py: 14%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
"""Functions to support file indexing."""

# The string above is placed before every other statement so that Python
# treats it as the module docstring (``__doc__``).  A string literal that
# follows other statements is only a discarded expression.
from __future__ import annotations

# NOTE(review): ``read_sidecar`` and ``process_sidecar_data`` are defined in
# this module but are not listed here -- confirm whether that is intentional.
__all__ = ("read_index", "calculate_index", "index_files", "process_index_data")
18import collections.abc
19import json
20import logging
21import os
22import sys
23from copy import deepcopy
24from typing import IO, Any, List, MutableMapping, Optional, Sequence, Tuple, Union
26from .file_helpers import read_file_info
27from .headers import merge_headers
28from .observationGroup import ObservationGroup
29from .observationInfo import ObservationInfo
# Logger named after this module.
log = logging.getLogger(__name__)

# Key in an index structure under which the headers shared by all files
# are stored.
COMMON_KEY = "__COMMON__"
# Key in an index structure recording the content mode ("translated" or
# "metadata") that was used to build it.
CONTENT_KEY = "__CONTENT__"
def index_files(
    files: Sequence[str],
    root: Optional[str],
    hdrnum: int,
    print_trace: bool,
    content: str,
    outstream: IO = sys.stdout,
    errstream: IO = sys.stderr,
) -> Tuple[MutableMapping[str, Union[str, MutableMapping[str, Any]]], List[str], List[str]]:
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read; this
        value is passed unchanged to `read_file_info`, which determines
        how the two are combined.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. These paths do
        include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed, i.e. those for which
        `read_file_info` returned `None`. These paths include ``root`` when
        one was supplied. Presumably always empty when ``print_trace`` is
        `None`, since errors are then not caught -- confirm against
        `read_file_info`.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of the supported modes.
    """
    if content not in ("translated", "metadata"):
        # Report the offending value; the message is an f-string so that the
        # caller sees what was actually supplied.
        raise ValueError(f"Unrecognized mode {content}")

    failed: List[str] = []
    okay: List[str] = []

    # Mapping of path (as supplied, without root) to file content.
    content_by_file: MutableMapping[str, MutableMapping[str, Any]] = {}
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename within dir.
        # We may get a PropertyList here and can therefore not just
        # assert Mapping for mypy. We therefore assert that it's not the
        # other 2 options, which we were enforcing with the "simple" parameter
        # in the call to read_file_info.
        assert not isinstance(simple, (str, ObservationInfo))
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed
def calculate_index(
    headers: MutableMapping[str, MutableMapping[str, Any]], content_mode: str
) -> MutableMapping[str, Union[str, MutableMapping[str, Any]]]:
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers, keyed by filename.
    content_mode : `str`
        The mode these headers were read in (``metadata`` or
        ``translated``). It is only recorded in the result so that the
        index can be interpreted when it is read back.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    known_modes = ("metadata", "translated")
    if content_mode not in known_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Collapse every header into the shared content plus per-file diffs.
    combined = merge_headers(list(headers.values()), mode="diff")

    # With a single file the merge result may be an LSST-style PropertyList
    # rather than a dict, but JSON output needs a dict. mypy knows nothing
    # about PropertyList, hence the ignore.
    if not isinstance(combined, collections.abc.Mapping):
        combined = dict(combined)  # type: ignore

    # Layout of the result (shown as YAML):
    # __COMMON__:
    #   KEY1: value1
    #   KEY2: value2
    # FILE1:
    #   KEY3: value3a
    # FILE2:
    #   KEY3: value3b

    # A single file yields no diff; substitute one empty entry so the
    # output has the same shape as the multi-file case.
    per_file = combined.pop("__DIFF__", [dict()])

    # The content mode and the common headers lead the output; the mode is
    # kept so that a reader knows how to interpret the file.
    index: MutableMapping[str, Union[str, MutableMapping[str, Any]]] = {
        CONTENT_KEY: content_mode,
        COMMON_KEY: combined,
    }
    # Pair each filename with its diff, in iteration order.
    index.update(zip(headers, per_file))

    return index
def read_index(
    path: str, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[str, MutableMapping[str, Any], ObservationInfo]]]:
    """Load a JSON index file and convert it with `process_index_data`.

    Parameters
    ----------
    path : `str`
        Path to the index file. The name must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict[str, Union[dict, ObservationInfo]]`
        The return content matches that returned by `process_index_data`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json``.
    """
    is_json = path.endswith(".json")
    if not is_json:
        raise ValueError(f"Index files must be in .json format; got {path}")

    with open(path, "r") as stream:
        raw_index = json.load(stream)

    return process_index_data(raw_index, force_dict=force_dict)
def process_index_data(
    content: MutableMapping[str, Any], force_metadata: bool = False, force_dict: bool = False
) -> Union[ObservationGroup, MutableMapping[str, Union[str, MutableMapping[str, Any], ObservationInfo]]]:
    """Expand the content of a JSON index file into per-file entries.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `ObservationInfo` this will return
        an `ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `ObservationInfo` (the values
        will be `ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the common-content key is missing from ``content``.

    Notes
    -----
    File keys will be relative to the location of the index file.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Work on a private copy so the caller's structure is left untouched.
    entries = deepcopy(content)

    # Decide how the entries should be interpreted.
    stored_mode = entries.pop(CONTENT_KEY, None)
    if force_metadata:
        mode = "metadata"
    elif stored_mode is None:
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        mode = "metadata"
    else:
        mode = stored_mode

    # Fold the shared headers back into every per-file header.
    shared = entries.pop(COMMON_KEY)
    for header in entries.values():
        header.update(shared)

    if mode == "metadata":
        # Plain dict content needs no further conversion.
        return entries

    # The values here are really all ObservationInfo, but mypy needs the
    # annotation to look like the function return value.
    by_file: MutableMapping[str, Union[str, MutableMapping[str, Any], ObservationInfo]] = {}
    for filename, header in entries.items():
        obs_info = ObservationInfo.from_simple(header)
        obs_info.filename = filename
        by_file[filename] = obs_info

    if force_dict:
        return by_file
    return ObservationGroup(list(by_file.values()))
def read_sidecar(path: str) -> Union[ObservationInfo, MutableMapping[str, MutableMapping[str, Any]]]:
    """Load a JSON metadata sidecar file and convert it with
    `process_sidecar_data`.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file. The name must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned.

    Raises
    ------
    ValueError
        Raised if ``path`` does not end in ``.json``.
    """
    is_json = path.endswith(".json")
    if not is_json:
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    with open(path, "r") as stream:
        raw_sidecar = json.load(stream)

    return process_sidecar_data(raw_sidecar)
def process_sidecar_data(
    content: MutableMapping[str, Any], force_metadata: bool = False
) -> Union[ObservationInfo, MutableMapping[str, MutableMapping[str, Any]]]:
    """Interpret the content of a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Work on a private copy so the caller's structure is left untouched.
    sidecar = deepcopy(content)

    stored_mode = sidecar.pop(CONTENT_KEY, None)
    guessed = False
    if force_metadata:
        mode = "metadata"
    elif stored_mode is not None:
        mode = stored_mode
    else:
        # No mode was recorded. Every ObservationInfo has observation_id and
        # instrument, so their presence is used to guess the content type.
        guessed = True
        looks_translated = "observation_id" in sidecar and "instrument" in sidecar
        mode = "translated" if looks_translated else "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, mode)

    if mode == "metadata":
        # Plain dict content needs no further conversion.
        return sidecar

    try:
        return ObservationInfo.from_simple(sidecar)
    except Exception:
        if guessed:
            # The guess was wrong: this does not seem to be ObservationInfo
            # content, so hand back the plain dict instead.
            return sidecar
        raise