Coverage for python/astro_metadata_translator/indexing.py : 13%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of astro_metadata_translator.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the LICENSE file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
# Public API of this module. The sidecar helpers are public (no leading
# underscore) and are therefore exported alongside the index helpers.
__all__ = (
    "read_index",
    "read_sidecar",
    "calculate_index",
    "index_files",
    "process_index_data",
    "process_sidecar_data",
)
14"""Functions to support file indexing."""
16import json
17import logging
18import os
19import sys
20from copy import deepcopy
22from .observationInfo import ObservationInfo
23from .observationGroup import ObservationGroup
24from .headers import merge_headers
25from .file_helpers import read_file_info
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Key in an index data structure under which the headers shared by all
# files are stored.
COMMON_KEY = "__COMMON__"
# Key in an index (or sidecar) data structure recording the content mode
# ("translated" or "metadata") that was used to build it.
CONTENT_KEY = "__CONTENT__"
def index_files(files, root, hdrnum, print_trace, content, outstream=sys.stdout, errstream=sys.stderr):
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index. The files are processed in sorted order.
    root : `str`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. Passed unchanged to `read_file_info`.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` prints
        a one line summary of the error condition. If `None` the exception
        will be allowed.
    content : `str`
        Form of data to write in index file. Options are:
        ``translated`` to write ObservationInfo to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully. These paths do
        include ``root`` when one was supplied.
    failed : `list` of `str`
        All the files that could not be processed (``root`` included when
        supplied). Presumably always empty when ``print_trace`` is `None`,
        since in that mode read errors are expected to propagate as
        exceptions rather than being recorded -- confirm against
        `read_file_info`.

    Raises
    ------
    ValueError
        Raised if ``content`` is not one of the supported modes.
    """
    if content not in ("translated", "metadata"):
        raise ValueError(f"Unrecognized mode {content}")

    failed = []
    okay = []

    content_by_file = {}  # Mapping of path to file content
    for file in sorted(files):
        path = file if root is None else os.path.join(root, file)
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            # A None result is the reader's signal that this file failed.
            failed.append(path)
            continue
        okay.append(path)

        # Store the information indexed by the filename as supplied by the
        # caller (without root) so the index is relative to the directory.
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed
def calculate_index(headers, content_mode):
    """Build the index data structure for a set of per-file headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        Header content keyed by filename.
    content_mode : `str`
        Mode that produced these headers (``metadata`` or ``translated``).
        It is only recorded in the output so that a later reader knows how
        to deserialize the content.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        Structure suitable for writing to an index file.

    Raises
    ------
    ValueError
        Raised if ``content_mode`` is not a recognized mode.
    """
    valid_modes = ("metadata", "translated")
    if content_mode not in valid_modes:
        raise ValueError(f"Unrecognized mode for index creation: {content_mode}")

    # Reduce the headers to the shared part plus one diff per input header.
    common = merge_headers(headers.values(), mode="diff")

    # Layout of the result (shown as YAML):
    #   __COMMON__:
    #     KEY1: value1
    #     KEY2: value2
    #   FILE1:
    #     KEY3: value3a
    #   FILE2:
    #     KEY3: value3b
    #
    # A single input file yields no diff entry; substitute one empty diff
    # so that the file still appears in the output.
    per_file = common.pop("__DIFF__", [dict()])

    # Content mode and common headers lead the output, followed by one
    # entry per file in the order the headers were supplied.
    index = {CONTENT_KEY: content_mode, COMMON_KEY: common}
    index.update(zip(headers, per_file))
    return index
def read_index(path, force_dict=False):
    """Load a JSON index file and convert it to its in-memory form.

    Parameters
    ----------
    path : `str`
        Location of the index file. Must end in ``.json``.
    force_dict : `bool`, optional
        If `True` the result is always a `dict` keyed by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict[str, Union[dict, ObservationInfo]]`
        Whatever `process_index_data` returns for the file content.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Index files must be in .json format; got {path}")

    with open(path, "r") as handle:
        raw_text = handle.read()
    index_data = json.loads(raw_text)

    return process_index_data(index_data, force_dict=force_dict)
def process_index_data(content, force_metadata=False, force_dict=False):
    """Process the content read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON index file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `ObservationInfo` this will return
        an `ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `ObservationInfo` (the values
        will be `ObservationInfo` unless ``force_metadata`` is `True`).

    Raises
    ------
    ValueError
        Raised if the ``__COMMON__`` key is missing from ``content``.

    Notes
    -----
    File keys will be relative to the location of the index file.

    If the ``__CONTENT__`` key is missing (and ``force_metadata`` is not
    set) a warning is logged and the content is treated as ``metadata``.
    """
    if COMMON_KEY not in content:
        raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")

    # Copy the input structure so we can update in place
    unpacked = deepcopy(content)

    content_mode = unpacked.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # The mode was not recorded in the index, so fall back to the
        # simplest interpretation rather than attempting a translation.
        log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
        content_mode = "metadata"

    # The common headers will be copied into each header
    common = unpacked.pop(COMMON_KEY)

    for hdr in unpacked.values():
        hdr.update(common)

    if content_mode == "metadata":
        # nothing more to be done
        return unpacked

    # Translated content: rebuild an ObservationInfo for every file and
    # remember which file each one came from.
    obs_infos = []
    by_file = {}
    for file, hdr in unpacked.items():
        info = ObservationInfo.from_simple(hdr)
        info.filename = file
        obs_infos.append(info)
        by_file[file] = info

    if force_dict:
        return by_file
    return ObservationGroup(obs_infos)
def read_sidecar(path):
    """Load a JSON metadata sidecar file and convert it to in-memory form.

    Parameters
    ----------
    path : `str`
        Location of the sidecar file. Must end in ``.json``.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        Whatever `process_sidecar_data` returns for the file content: an
        `ObservationInfo` if the sidecar holds translated content,
        otherwise a `dict`.

    Raises
    ------
    ValueError
        Raised if ``path`` does not have a ``.json`` extension.
    """
    if not path.endswith(".json"):
        raise ValueError(f"Sidecar files must be in .json format; got {path}")

    with open(path, "r") as handle:
        raw_text = handle.read()
    sidecar_data = json.loads(raw_text)

    return process_sidecar_data(sidecar_data)
def process_sidecar_data(content, force_metadata=False):
    """Process the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in JSON sidecar file converted to simple python
        form. It is not modified; a deep copy is processed instead.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter.

    Raises
    ------
    TypeError
        Raised if ``content`` is not a `dict`.

    Notes
    -----
    If the ``__CONTENT__`` key is missing (and ``force_metadata`` is not
    set) the mode is guessed: content containing both ``observation_id``
    and ``instrument`` is assumed to be ``translated``, anything else is
    assumed to be ``metadata``. A warning is logged when guessing, and if
    a guessed ``translated`` conversion fails the plain `dict` is returned
    instead of raising.
    """
    if not isinstance(content, dict):
        raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")

    # Copy the input structure so we can update in place
    content = deepcopy(content)

    guessing = False
    content_mode = content.pop(CONTENT_KEY, None)
    if force_metadata:
        content_mode = "metadata"
    elif content_mode is None:
        # All ObservationInfo objects will have observation_id and instrument
        # so if they are there we can guess
        guessing = True
        if "observation_id" in content and "instrument" in content:
            content_mode = "translated"
        else:
            content_mode = "metadata"
        log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)

    if content_mode == "metadata":
        # nothing more to be done
        return content

    try:
        info = ObservationInfo.from_simple(content)
    except Exception:
        if guessing:
            # We were guessing so seems like this is not ObservationInfo
            return content
        # The mode was explicit, so a failure here is a genuine error.
        raise

    return info