Coverage for python/lsst/resources/_resourceHandles/_httpResourceHandle.py: 20%
115 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-09 11:30 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-09 11:30 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpReadResourceHandle",)
16import io
17from collections.abc import Callable, Iterable
18from logging import Logger
19from typing import AnyStr
21import requests
22from lsst.utils.timer import time_this
24from ._baseResourceHandle import BaseResourceHandle, CloseStatus
class HttpReadResourceHandle(BaseResourceHandle[bytes]):
    """HTTP-based specialization of `.BaseResourceHandle`.

    The handle is read only. Data are fetched with HTTP ``GET`` requests,
    using ``Range`` headers for partial reads. Once the complete resource
    has been downloaded it is cached in memory and later reads and seeks
    are served from that cache.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    session : `requests.Session`
        The session to use for this handle.
    url : `str`
        URL of remote resource.
    timeout : `tuple` [`float`, `float`]
        Timeout to use for connections: connection timeout and read timeout
        in a tuple.
    newline : `str` or `None`, optional
        When doing multiline operations, break the stream on given character.
        Defaults to newline. If a file is opened in binary mode, this argument
        is not used, as binary files will only split lines on the binary
        newline representation.

    Raises
    ------
    ValueError
        Raised if ``url``, ``session`` or ``timeout`` is `None`.
    """

    def __init__(
        self,
        mode: str,
        log: Logger,
        *,
        session: requests.Session | None = None,
        url: str | None = None,
        timeout: tuple[float, float] | None = None,
        newline: AnyStr | None = None,
    ) -> None:
        super().__init__(mode, log, newline=newline)
        if url is None:
            raise ValueError("Url must be specified when constructing this object")
        self._url = url
        if session is None:
            raise ValueError("Session must be specified when constructing this object")
        self._session = session

        if timeout is None:
            raise ValueError("timeout must be specified when constructing this object")
        self._timeout = timeout

        # Holds the full content of the resource once it has been downloaded.
        self._completeBuffer: io.BytesIO | None = None

        self._closed = CloseStatus.OPEN
        # Absolute byte offset of the next read.
        self._current_position = 0
        # Set once a read has reached (or run past) the end of the resource.
        self._eof = False

    def close(self) -> None:
        """Mark the handle as closed and drop any cached content."""
        self._closed = CloseStatus.CLOSED
        self._completeBuffer = None
        self._eof = True

    @property
    def closed(self) -> bool:
        """Whether `close` has been called on this handle (`bool`)."""
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        """Not supported; there is no underlying file descriptor.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandle does not have a file number")

    def flush(self) -> None:
        """Do nothing for read modes; reject any mode that implies writing.

        Raises
        ------
        io.UnsupportedOperation
            Raised if the mode string contains ``w``, ``x``, ``a`` or ``+``.
        """
        modes = set(self._mode)
        if {"w", "x", "a", "+"} & modes:
            raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    @property
    def isatty(self) -> bool | Callable[[], bool]:
        """Always `False`; exposed as a property rather than a method."""
        return False

    def readable(self) -> bool:
        """Return `True`; this handle supports reading."""
        return True

    def readline(self, size: int = -1) -> bytes:
        """Not supported.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def readlines(self, size: int = -1) -> Iterable[bytes]:
        """Not supported.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the read position.

        Parameters
        ----------
        offset : `int`
            Offset relative to the position selected by ``whence``.
        whence : `int`, optional
            Either `io.SEEK_SET` (default) or `io.SEEK_CUR`. Seeking relative
            to the end is not supported.

        Returns
        -------
        position : `int`
            The new absolute position.

        Raises
        ------
        io.UnsupportedOperation
            Raised if the resulting position would be negative or ``whence``
            is not supported. The handle state is left unchanged.
        """
        if whence == io.SEEK_CUR:
            target = self._current_position + offset
        elif whence == io.SEEK_SET:
            target = offset
        else:
            target = -1
        if target < 0:
            raise io.UnsupportedOperation("Seek value is incorrect, or whence mode is unsupported")

        # Only touch state once the request is known to be valid.
        self._current_position = target
        self._eof = False

        # Handle the case where the complete file has been read already.
        # ``target`` is absolute, so the cached buffer must always be
        # positioned with SEEK_SET regardless of the caller's ``whence``.
        if self._completeBuffer is not None:
            self._completeBuffer.seek(target, io.SEEK_SET)
        return self._current_position

    def seekable(self) -> bool:
        """Return `True`; this handle supports `seek`."""
        return True

    def tell(self) -> int:
        """Return the current absolute read position."""
        return self._current_position

    def truncate(self, size: int | None = None) -> int:
        """Not supported.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support truncation")

    def writable(self) -> bool:
        """Return `False`; this handle is read only."""
        return False

    def write(self, b: bytes, /) -> int:
        """Not supported.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def writelines(self, b: Iterable[bytes], /) -> None:
        """Not supported.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def read(self, size: int = -1) -> bytes:
        """Read bytes from the remote resource.

        Parameters
        ----------
        size : `int`, optional
            Maximum number of bytes to read. A negative value or `None`
            reads from the current position to the end of the resource.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty if the handle is at the end of the
            resource or ``size`` is zero.

        Raises
        ------
        FileNotFoundError
            Raised if the server responds with an unexpected status code.
        """
        if self._eof:
            # At EOF so always return an empty byte string.
            return b""

        # Treat None and every negative value as "read to the end".
        if size is None or size < 0:
            size = -1

        # Branch for if the complete file has been read before.
        if self._completeBuffer is not None:
            result = self._completeBuffer.read(size)
            self._current_position += len(result)
            return result

        if size == 0:
            # Nothing requested; avoid sending an invalid byte range.
            return b""

        if size == -1 and self._current_position == 0:
            # The whole file has been requested, read it into a buffer and
            # return the result.
            with time_this(self._log, msg="Read from remote resource %s", args=(self._url,)):
                resp = self._session.get(self._url, stream=False, timeout=self._timeout)
            if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
                raise FileNotFoundError(f"Unable to read resource {self._url}; status code: {code}")
            content = resp.content
            # Only cache once the download succeeded, so a failed request
            # does not leave an empty buffer posing as the full file.
            self._completeBuffer = io.BytesIO(content)
            self._completeBuffer.seek(0, io.SEEK_END)
            self._current_position = len(content)
            return content

        # A partial read is required, either because a size has been specified,
        # or a read has previously been done. Any time we specify a byte range
        # we must disable the gzip compression on the server since we want
        # to address ranges in the uncompressed file. If we send ranges that
        # are interpreted by the server as offsets into the compressed file
        # then that is at least confusing and also there is no guarantee that
        # the bytes can be uncompressed.
        start = self._current_position
        end_pos = start + (size - 1) if size >= 0 else ""
        headers = {"Range": f"bytes={start}-{end_pos}", "Accept-Encoding": "identity"}

        with time_this(
            self._log, msg="Read from remote resource %s using headers %s", args=(self._url, headers)
        ):
            resp = self._session.get(self._url, stream=False, timeout=self._timeout, headers=headers)

        code = resp.status_code
        if code == requests.codes.range_not_satisfiable:
            # Must have run off the end of the file. A standard file handle
            # will treat this as EOF so be consistent with that. Do not change
            # the current position.
            self._eof = True
            return b""

        if code not in (requests.codes.ok, requests.codes.partial):
            raise FileNotFoundError(
                f"Unable to read resource {self._url}, or bytes are out of range; status code: {code}"
            )

        content = resp.content
        len_content = len(content)

        # Verify this is not actually the whole file and the server did not lie
        # about supporting ranges. The length check only makes sense for a
        # bounded request; an open-ended range legitimately returns any
        # number of bytes.
        if code != requests.codes.partial or (size >= 0 and len_content > size):
            self._completeBuffer = io.BytesIO(content)
            # The body is taken to be the full file, so continue from the
            # position the caller had reached rather than from the start.
            self._completeBuffer.seek(start, io.SEEK_SET)
            return self.read(size=size)

        # The response header should tell us the total number of bytes
        # in the file and also the current position we have got to in the
        # server.
        if "Content-Range" in resp.headers:
            content_range = resp.headers["Content-Range"]
            units, _, range_spec = content_range.partition(" ")
            if units == "bytes":
                byte_range, _, total = range_spec.partition("/")
                _, dash, last = byte_range.partition("-")
                if dash and total != "*":
                    try:
                        if int(last) >= int(total) - 1:
                            self._eof = True
                    except ValueError:
                        # Malformed header; fall back to the length heuristic.
                        self._log.warning(
                            "Unable to parse Content-Range header from server: %s", content_range
                        )
            else:
                self._log.warning("Requested byte range from server but instead got: %s", content_range)

        # Try to guess that we overran the end. This will not help if we
        # read exactly the number of bytes to get us to the end and so we
        # will need to do one more read and get a 416.
        if len_content < size:
            self._eof = True

        self._current_position += len_content
        return content