Coverage for python/lsst/resources/_resourceHandles/_httpResourceHandle.py: 24%
125 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-01 11:14 +0000
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-01 11:14 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpReadResourceHandle",)
16import io
17import re
18from collections.abc import Callable, Iterable
19from logging import Logger
20from typing import AnyStr, NamedTuple
22import requests
23from lsst.utils.timer import time_this
25from ._baseResourceHandle import BaseResourceHandle, CloseStatus
class HttpReadResourceHandle(BaseResourceHandle[bytes]):
    """HTTP-based specialization of `.BaseResourceHandle`.

    The handle is read-only. Data are fetched lazily with HTTP ``GET``
    requests: a request for the entire file from position zero downloads
    and caches the whole body, anything else is served with a ``Range``
    request.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    session : `requests.Session`
        The session to use for this handle.
    url : `str`
        URL of remote resource.
    timeout : `tuple` [`float`, `float`]
        Timeout to use for connections: connection timeout and read timeout
        in a tuple.
    newline : `str` or `None`, optional
        When doing multiline operations, break the stream on given character.
        Defaults to newline. If a file is opened in binary mode, this argument
        is not used, as binary files will only split lines on the binary
        newline representation.

    Raises
    ------
    ValueError
        Raised if ``url``, ``session`` or ``timeout`` is `None`.
    """

    def __init__(
        self,
        mode: str,
        log: Logger,
        *,
        session: requests.Session | None = None,
        url: str | None = None,
        timeout: tuple[float, float] | None = None,
        newline: AnyStr | None = None,
    ) -> None:
        super().__init__(mode, log, newline=newline)
        if url is None:
            raise ValueError("Url must be specified when constructing this object")
        self._url = url
        if session is None:
            raise ValueError("Session must be specified when constructing this object")
        self._session = session

        if timeout is None:
            raise ValueError("timeout must be specified when constructing this object")
        self._timeout = timeout

        # Holds the full file contents once they have been downloaded.
        self._completeBuffer: io.BytesIO | None = None

        self._closed = CloseStatus.OPEN
        self._current_position = 0
        self._eof = False

    def close(self) -> None:
        """Mark the handle closed and drop any cached file contents."""
        self._closed = CloseStatus.CLOSED
        self._completeBuffer = None
        self._eof = True

    @property
    def closed(self) -> bool:
        """Whether `close` has been called on this handle (`bool`)."""
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        """Raise, since a remote resource has no file descriptor."""
        raise io.UnsupportedOperation("HttpReadResourceHandle does not have a file number")

    def flush(self) -> None:
        """Do nothing for read modes; raise if a write mode was requested."""
        modes = set(self._mode)
        if {"w", "x", "a", "+"} & modes:
            raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    @property
    def isatty(self) -> bool | Callable[[], bool]:
        """Always `False`; exposed as a property rather than a method."""
        return False

    def readable(self) -> bool:
        """Return `True`; this handle always supports reading."""
        return True

    def readline(self, size: int = -1) -> bytes:
        """Raise, since line-oriented reads are not supported."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def readlines(self, size: int = -1) -> Iterable[bytes]:
        """Raise, since line-oriented reads are not supported."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the read position.

        Parameters
        ----------
        offset : `int`
            Offset in bytes, interpreted according to ``whence``.
        whence : `int`, optional
            Either `io.SEEK_SET` (absolute) or `io.SEEK_CUR` (relative to
            the current position). `io.SEEK_END` is not supported.

        Returns
        -------
        position : `int`
            The new absolute position.

        Raises
        ------
        io.UnsupportedOperation
            Raised if ``whence`` is unsupported or the resulting position
            would be negative.
        """
        if whence == io.SEEK_SET:
            target = offset
        elif whence == io.SEEK_CUR:
            target = self._current_position + offset
        else:
            raise io.UnsupportedOperation("Seek value is incorrect, or whence mode is unsupported")
        if target < 0:
            raise io.UnsupportedOperation("Seek value is incorrect, or whence mode is unsupported")

        # Only touch the handle state once the request is known to be valid.
        self._current_position = target
        self._eof = False

        # Keep the cached copy of the file (if any) in step. ``target`` is
        # already absolute, so it must be applied with the default SEEK_SET
        # and not with the caller's ``whence``.
        if self._completeBuffer is not None:
            self._completeBuffer.seek(target)
        return self._current_position

    def seekable(self) -> bool:
        """Return `True`; `seek` is supported."""
        return True

    def tell(self) -> int:
        """Return the current absolute read position."""
        return self._current_position

    def truncate(self, size: int | None = None) -> int:
        """Raise, since this handle is read only."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support truncation")

    def writable(self) -> bool:
        """Return `False`; this handle never supports writing."""
        return False

    def write(self, b: bytes, /) -> int:
        """Raise, since this handle is read only."""
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def writelines(self, b: Iterable[bytes], /) -> None:
        """Raise, since this handle is read only."""
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def read(self, size: int = -1) -> bytes:
        """Read bytes starting at the current position.

        Parameters
        ----------
        size : `int`, optional
            Maximum number of bytes to read. A negative value reads to the
            end of the file.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty once the end of the file has been reached.

        Raises
        ------
        FileNotFoundError
            Raised if the server responds with an unexpected status code.
        ValueError
            Raised if the server sends a ``Content-Range`` header that
            cannot be parsed.
        """
        if self._eof:
            # At EOF so always return an empty byte string.
            return b""

        # The whole file has been downloaded before; serve from the cache.
        if self._completeBuffer is not None:
            result = self._completeBuffer.read(size)
            self._current_position += len(result)
            return result

        # A zero-length read never needs the network. Issuing one would
        # produce the invalid range "N-(N-1)".
        if size == 0:
            return b""

        read_to_end = size < 0

        if read_to_end and self._current_position == 0:
            # The whole file has been requested, read it into a buffer and
            # return the result.
            with time_this(self._log, msg="Read from remote resource %s", args=(self._url,)):
                resp = self._session.get(self._url, stream=False, timeout=self._timeout)
            if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
                raise FileNotFoundError(f"Unable to read resource {self._url}; status code: {code}")
            # Only install the cache after a successful download so that a
            # failure does not leave an empty buffer posing as the file.
            whole_file = io.BytesIO()
            whole_file.write(resp.content)
            self._completeBuffer = whole_file
            self._current_position = whole_file.tell()
            return resp.content

        # A partial read is required, either because a size has been specified,
        # or a read has previously been done. Any time we specify a byte range
        # we must disable the gzip compression on the server since we want
        # to address ranges in the uncompressed file. If we send ranges that
        # are interpreted by the server as offsets into the compressed file
        # then that is at least confusing and also there is no guarantee that
        # the bytes can be uncompressed.
        end_pos: int | str = "" if read_to_end else self._current_position + (size - 1)
        headers = {"Range": f"bytes={self._current_position}-{end_pos}", "Accept-Encoding": "identity"}

        with time_this(
            self._log, msg="Read from remote resource %s using headers %s", args=(self._url, headers)
        ):
            resp = self._session.get(self._url, stream=False, timeout=self._timeout, headers=headers)

        if resp.status_code == requests.codes.range_not_satisfiable:
            # Must have run off the end of the file. A standard file handle
            # will treat this as EOF so be consistent with that. Do not change
            # the current position.
            self._eof = True
            return b""

        if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
            raise FileNotFoundError(
                f"Unable to read resource {self._url}, or bytes are out of range; status code: {code}"
            )

        content = resp.content
        len_content = len(content)

        # Verify this is not actually the whole file and the server did not lie
        # about supporting ranges. The length test is only meaningful for a
        # bounded request: an open-ended request answered with 206 is a
        # legitimate tail of the file and must not be cached as the full file.
        if code != requests.codes.partial or (not read_to_end and len_content > size):
            whole_file = io.BytesIO(content)
            # The body starts at byte zero of the file, so continue from the
            # position the caller is actually at.
            whole_file.seek(self._current_position)
            self._completeBuffer = whole_file
            result = whole_file.read(size)
            self._current_position += len(result)
            return result

        # The response header should tell us the total number of bytes
        # in the file and also the current position we have got to in the
        # server.
        if "Content-Range" in resp.headers:
            content_range = parse_content_range_header(resp.headers["Content-Range"])
            if (
                content_range.total is not None
                and content_range.range_end is not None
                and content_range.range_end >= content_range.total - 1
            ):
                self._eof = True

        # An open-ended request always reaches the end. For a bounded request
        # try to guess that we overran the end. This will not help if we
        # read exactly the number of bytes to get us to the end and so we
        # will need to do one more read and get a 416.
        if read_to_end or len_content < size:
            self._eof = True

        self._current_position += len_content
        return content
class ContentRange(NamedTuple):
    """Parsed contents of an HTTP ``Content-Range`` response header.

    All byte offsets are zero-indexed and inclusive. Any field is `None`
    when the header did not carry that piece of information.
    """

    range_start: int | None
    """Offset of the first byte contained in the response, or `None` if the
    header did not include a range.
    """

    range_end: int | None
    """Offset of the last byte contained in the response, or `None` if the
    header did not include a range.
    """

    total: int | None
    """Size of the complete file in bytes, or `None` if the header did not
    include the size.
    """
def parse_content_range_header(header: str) -> ContentRange:
    """Decode the value of an HTTP ``Content-Range`` header.

    Parameters
    ----------
    header : `str`
        The header value to decode, e.g. ``"bytes 0-99/1234"``.

    Returns
    -------
    content_range : `ContentRange`
        The byte range covered by the response and the total file size;
        parts that the header does not provide are `None`.

    Raises
    ------
    ValueError
        Raised if the header matches none of the recognized layouts.
    """
    # Every layout begins with optional whitespace followed by the unit,
    # which is expected to be "bytes" here.
    unit = r"^\s*bytes\s+"

    # Layout 1 -- "<unit> <range-start>-<range-end>/<size>"
    with_size = re.match(unit + r"(\d+)-(\d+)/(\d+)", header)
    if with_size is not None:
        first, last, size = (int(text) for text in with_size.groups())
        return ContentRange(range_start=first, range_end=last, total=size)

    # Layout 2 -- "<unit> <range-start>-<range-end>/*"
    without_size = re.match(unit + r"(\d+)-(\d+)/\*", header)
    if without_size is not None:
        first, last = (int(text) for text in without_size.groups())
        return ContentRange(range_start=first, range_end=last, total=None)

    # Layout 3 -- "<unit> */<size>"
    size_only = re.match(unit + r"\*/(\d+)", header)
    if size_only is not None:
        return ContentRange(range_start=None, range_end=None, total=int(size_only.group(1)))

    raise ValueError(f"Content-Range header in unexpected format: '{header}'")