Coverage for python/astro_metadata_translator/indexing.py : 14%

# This file is part of astro_metadata_translator.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the LICENSE file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

"""Functions to support file indexing."""

__all__ = ("read_index", "read_sidecar", "calculate_index", "index_files",
           "process_index_data", "process_sidecar_data")

import collections.abc
import json
import logging
import os
import sys
from copy import deepcopy

from .observationInfo import ObservationInfo
from .observationGroup import ObservationGroup
from .headers import merge_headers
from .file_helpers import read_file_info

log = logging.getLogger(__name__)

COMMON_KEY = "__COMMON__"
CONTENT_KEY = "__CONTENT__"


def index_files(files, root, hdrnum, print_trace, content, outstream=sys.stdout, errstream=sys.stderr):
    """Create an index from the supplied files.

    No file is written. The Python structure returned is suitable
    for writing.

    Parameters
    ----------
    files : iterable of `str`
        Paths to the files to be indexed. They do not have to all be
        in a single directory but all content will be indexed into a single
        index.
    root : `str`
        Directory root that can be combined with each file (if the supplied
        file is relative). Will be ignored if `None`.
    hdrnum : `int`
        The HDU number to read. The primary header is always read and
        merged with the header from this HDU.
    print_trace : `bool`
        If there is an error reading the file and this parameter is `True`,
        a full traceback of the exception will be reported. If `False` a
        one-line summary of the error condition is printed. If `None` the
        exception will be allowed to propagate.
    content : `str`
        Form of data to write in the index file. Options are:
        ``translated`` (default) to write `ObservationInfo` to the index;
        ``metadata`` to write native metadata headers to the index.
        The index file is called ``{mode}_index.json``.
    outstream : `io.StringIO`, optional
        Output stream to use for standard messages. Defaults to `sys.stdout`.
    errstream : `io.StringIO`, optional
        Stream to send messages that would normally be sent to standard
        error. Defaults to `sys.stderr`.

    Returns
    -------
    file_index : `dict` of [`str`, `dict`]
        The headers in a form suitable for writing to an index. The keys will
        be ``__COMMON__`` for shared content, ``__CONTENT__`` to record the
        content mode used to construct the index, and paths to the files. The
        paths will be the supplied paths and will not include any supplied
        ``root``.
    okay : `list` of `str`
        All the files that were processed successfully.
    failed : `list` of `str`
        All the files that could not be processed. Will be empty if
        ``print_trace`` is not `None`.
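
    Examples
    --------
    A sketch of typical usage; the file names, ``root``, and ``hdrnum``
    values below are illustrative only. The returned index dict can be
    serialized with `json.dump`.

    >>> index, okay, failed = index_files(
    ...     ["file1.fits", "file2.fits"], root="/data/raw", hdrnum=1,
    ...     print_trace=False, content="metadata",
    ... )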
80 """
81 if content not in ("translated", "metadata"):
82 raise ValueError("Unrecognized mode {mode}")

    failed = []
    okay = []

    content_by_file = {}  # Mapping of path to file content
    for file in sorted(files):
        if root is not None:
            path = os.path.join(root, file)
        else:
            path = file
        simple = read_file_info(path, hdrnum, print_trace, content, "simple", outstream, errstream)
        if simple is None:
            failed.append(path)
            continue
        else:
            okay.append(path)

        # Store the information indexed by the filename within dir
        content_by_file[file] = simple

    output = calculate_index(content_by_file, content)

    return output, okay, failed


def calculate_index(headers, content_mode):
    """Calculate an index data structure from the supplied headers.

    Parameters
    ----------
    headers : `dict` of [`str`, `dict`]
        The headers indexed by filename.
    content_mode : `str`
        The mode associated with these headers. Not used other than to
        store the information in the data structure for later use on
        deserialization.

    Returns
    -------
    index_ : `dict` of [`str`, `dict`]
        The headers in a form suitable for writing to an index.
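
    Examples
    --------
    An illustrative sketch; the file names and header values are invented.
    The result maps the ``__COMMON__`` and ``__CONTENT__`` keys plus one
    key per file.

    >>> index = calculate_index(
    ...     {"file1.fits": {"INSTRUME": "cam", "EXPTIME": 30},
    ...      "file2.fits": {"INSTRUME": "cam", "EXPTIME": 60}},
    ...     "metadata",
    ... )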
124 """
125 if content_mode not in ("metadata", "translated"):
126 raise ValueError(f"Unrecognized mode for index creation: {content_mode}")
128 # Merge all the information into a primary plus diff
129 merged = merge_headers(headers.values(), mode="diff")
131 # For a single file it is possible that the merged contents
132 # are not a dict but are an LSST-style PropertyList. JSON needs
133 # dict though.
134 if not isinstance(merged, collections.abc.Mapping):
135 merged = dict(merged)
137 # The structure to write to file is intended to look like (in YAML):
138 # __COMMON__:
139 # KEY1: value1
140 # KEY2: value2
141 # FILE1:
142 # KEY3: value3a
143 # FILE2:
144 # KEY3: value3b
146 # if there was only one file there will not be a diff but we
147 # want it to look like there was.
148 diff_dict = merged.pop("__DIFF__", [dict()])
150 # Put the common headers first in the output.
151 # Store the mode so that we can work out how to read the file in
152 output = {CONTENT_KEY: content_mode, COMMON_KEY: merged}
153 for file, diff in zip(headers, diff_dict):
154 output[file] = diff
156 return output


def read_index(path, force_dict=False):
    """Read an index file.

    Parameters
    ----------
    path : `str`
        Path to the index file.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index_ : `ObservationGroup` or `dict` of [`str`, `dict` or `ObservationInfo`]
        The returned content matches that returned by `process_index_data`.
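
    Examples
    --------
    An illustrative sketch; the index file name and ``INSTRUME`` key are
    hypothetical and the index is assumed to hold ``metadata`` content,
    so a dict keyed by filename is returned.

    >>> index = read_index("metadata_index.json")
    >>> for file, header in index.items():
    ...     print(file, header.get("INSTRUME"))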
174 """
175 if not path.endswith(".json"):
176 raise ValueError(f"Index files must be in .json format; got {path}")
178 with open(path, "r") as fd:
179 content = json.loads(fd.read())
181 return process_index_data(content, force_dict=force_dict)


def process_index_data(content, force_metadata=False, force_dict=False):
    """Process the content read from a JSON index file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in the JSON index file converted to simple
        Python form.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the index. If this parameter is `True` an index of
        `ObservationInfo` will be returned as if it was simple dict content.
    force_dict : `bool`, optional
        If `True` the structure returned will always be a dict keyed
        by filename.

    Returns
    -------
    index : `ObservationGroup` or `dict` of [`str`, `dict`]
        If the index file referred to `ObservationInfo` this will return
        an `ObservationGroup`, otherwise a `dict` will be returned with the
        keys being paths to files and the values being the keys and values
        stored in the index (with common information merged in). This
        can be overridden using the ``force_metadata`` parameter. If
        ``force_dict`` is `True` a `dict` will be returned with filename
        keys even if the index file refers to `ObservationInfo` (the values
        will be `ObservationInfo` unless ``force_metadata`` is `True`).

    Notes
    -----
    File keys will be relative to the location of the index file.
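
    Examples
    --------
    An illustrative sketch; the index content shown is invented and
    declares ``metadata`` mode, so the common headers are merged back
    into each per-file dict.

    >>> data = {"__CONTENT__": "metadata",
    ...         "__COMMON__": {"INSTRUME": "cam"},
    ...         "file1.fits": {"EXPTIME": 30}}
    >>> unpacked = process_index_data(data)
    >>> unpacked["file1.fits"]["INSTRUME"]
    'cam'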
215 """
217 if COMMON_KEY not in content:
218 raise ValueError(f"No '{COMMON_KEY}' key found in dict. Does not look like an index data structure.")
220 # Copy the input structure so we can update in place
221 unpacked = deepcopy(content)
223 content_mode = unpacked.pop(CONTENT_KEY, None)
224 if force_metadata:
225 content_mode = "metadata"
226 elif content is None:
227 log.warning("No '%s' key in data structure, assuming 'metadata'", CONTENT_KEY)
228 content_mode = "metadata"
230 # The common headers will be copied into each header
231 common = unpacked.pop(COMMON_KEY)
233 for file in unpacked:
234 unpacked[file].update(common)
236 if content_mode == "metadata":
237 # nothing more to be done
238 return unpacked
240 obs_infos = []
241 by_file = {}
242 for file, hdr in unpacked.items():
243 info = ObservationInfo.from_simple(hdr)
244 info.filename = file
245 obs_infos.append(info)
246 by_file[file] = info
248 if force_dict:
249 return by_file
250 return ObservationGroup(obs_infos)


def read_sidecar(path):
    """Read a metadata sidecar file.

    Parameters
    ----------
    path : `str`
        Path to the sidecar file.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned.
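
    Examples
    --------
    An illustrative sketch; the sidecar file name is hypothetical.

    >>> info = read_sidecar("file1.json")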
266 """
267 if not path.endswith(".json"):
268 raise ValueError(f"Sidecar files must be in .json format; got {path}")
270 with open(path, "r") as fd:
271 content = json.loads(fd.read())
273 return process_sidecar_data(content)


def process_sidecar_data(content, force_metadata=False):
    """Process the content read from a JSON sidecar file.

    Parameters
    ----------
    content : `dict`
        Data structure stored in the JSON sidecar file converted to simple
        Python form.
    force_metadata : `bool`, optional
        By default the content returned will match the original form that
        was used for the sidecar. If this parameter is `True` a sidecar of
        `ObservationInfo` will be returned as if it was simple dict content.

    Returns
    -------
    info : `ObservationInfo` or `dict` of [`str`, `dict`]
        If the sidecar file referred to `ObservationInfo` this will return
        an `ObservationInfo`, otherwise a `dict` will be returned. This
        can be overridden using the ``force_metadata`` parameter.
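
    Examples
    --------
    An illustrative sketch; the sidecar content shown is invented and
    declares ``metadata`` mode, so the headers are returned unchanged.

    >>> data = {"__CONTENT__": "metadata", "INSTRUME": "cam", "EXPTIME": 30}
    >>> header = process_sidecar_data(data)
    >>> header["EXPTIME"]
    30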
295 """
297 if not isinstance(content, dict):
298 raise TypeError(f"Content of sidecar must be a dict, not {type(content)}")
300 # Copy the input structure so we can update in place
301 content = deepcopy(content)
303 guessing = False
304 content_mode = content.pop(CONTENT_KEY, None)
305 if force_metadata:
306 content_mode = "metadata"
307 elif content is None:
308 # All ObservationInfo objects will have observation_id and instrument
309 # so if they are there we can guess
310 guessing = True
311 if "observation_id" in content and "instrument" in content_mode:
312 content_mode = "translated"
313 else:
314 content_mode = "metadata"
315 log.warning("No '%s' key in data structure, assuming '%s'", CONTENT_KEY, content_mode)
317 if content_mode == "metadata":
318 # nothing more to be done
319 return content
321 try:
322 info = ObservationInfo.from_simple(content)
323 except Exception as e:
324 if guessing:
325 # We were guessing so seems like this is not ObservationInfo
326 return content
327 raise e
329 return info