Coverage for python/lsst/resources/_resourceHandles/_httpResourceHandle.py: 20%

115 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-17 10:49 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpReadResourceHandle",) 

15 

16import io 

17from collections.abc import Callable, Iterable 

18from logging import Logger 

19from typing import AnyStr 

20 

21import requests 

22from lsst.utils.timer import time_this 

23 

24from ._baseResourceHandle import BaseResourceHandle, CloseStatus 

25 

26 

class HttpReadResourceHandle(BaseResourceHandle[bytes]):
    """HTTP-based specialization of `.BaseResourceHandle`.

    The handle is read only. Data are fetched lazily with HTTP GET requests:
    either the complete resource in one request, or individual byte ranges
    using the ``Range`` request header.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    session : `requests.Session`
        The session to use for this handle.
    url : `str`
        URL of remote resource.
    timeout : `tuple` [`float`, `float`]
        Timeout to use for connections: connection timeout and read timeout
        in a tuple.
    newline : `str` or `None`, optional
        When doing multiline operations, break the stream on given character.
        Defaults to newline. If a file is opened in binary mode, this argument
        is not used, as binary files will only split lines on the binary
        newline representation.

    Raises
    ------
    ValueError
        Raised if any of ``url``, ``session`` or ``timeout`` is `None`.
    """

    def __init__(
        self,
        mode: str,
        log: Logger,
        *,
        session: requests.Session | None = None,
        url: str | None = None,
        timeout: tuple[float, float] | None = None,
        newline: AnyStr | None = None,
    ) -> None:
        super().__init__(mode, log, newline=newline)
        if url is None:
            raise ValueError("Url must be specified when constructing this object")
        self._url = url
        if session is None:
            raise ValueError("Session must be specified when constructing this object")
        self._session = session

        if timeout is None:
            raise ValueError("timeout must be specified when constructing this object")
        self._timeout = timeout

        # Holds the entire resource once it has been downloaded in full;
        # `None` until then.
        self._completeBuffer: io.BytesIO | None = None

        self._closed = CloseStatus.OPEN
        # Absolute offset, in bytes, of the next byte to be read.
        self._current_position = 0
        # Set once a read has reached the end of the remote resource.
        self._eof = False

    def close(self) -> None:
        """Mark the handle closed and release any cached content."""
        self._closed = CloseStatus.CLOSED
        self._completeBuffer = None
        self._eof = True

    @property
    def closed(self) -> bool:
        """Whether `close` has been called on this handle (`bool`)."""
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        """Not supported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandle does not have a file number")

    def flush(self) -> None:
        """Do nothing for read modes; raise if a write mode was requested.

        Raises
        ------
        io.UnsupportedOperation
            Raised if the handle mode contains any of ``w``, ``x``, ``a``
            or ``+``.
        """
        modes = set(self._mode)
        if {"w", "x", "a", "+"} & modes:
            raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    @property
    def isatty(self) -> bool | Callable[[], bool]:
        """Always `False`; exposed as a property, not as a method."""
        return False

    def readable(self) -> bool:
        """Return `True`; this handle supports reading."""
        return True

    def readline(self, size: int = -1) -> bytes:
        """Not supported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def readlines(self, size: int = -1) -> Iterable[bytes]:
        """Not supported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the read position.

        Parameters
        ----------
        offset : `int`
            Offset in bytes, interpreted according to ``whence``.
        whence : `int`, optional
            Either `io.SEEK_SET` (absolute) or `io.SEEK_CUR` (relative to
            the current position). `io.SEEK_END` is not supported.

        Returns
        -------
        position : `int`
            The new absolute position.

        Raises
        ------
        io.UnsupportedOperation
            Raised if ``whence`` is unsupported or the resulting position
            would be negative. The handle state is left untouched.
        """
        if whence == io.SEEK_CUR:
            new_position = self._current_position + offset
        elif whence == io.SEEK_SET:
            new_position = offset
        else:
            new_position = -1
        if new_position < 0:
            raise io.UnsupportedOperation("Seek value is incorrect, or whence mode is unsupported")

        # Only modify state once the request is known to be valid.
        self._eof = False
        self._current_position = new_position

        # Handle the case where the complete file has been read already.
        # The position is absolute at this point, so the cached buffer must
        # always be positioned relative to its start regardless of the
        # ``whence`` the caller used.
        if self._completeBuffer is not None:
            self._completeBuffer.seek(new_position, io.SEEK_SET)
        return self._current_position

    def seekable(self) -> bool:
        """Return `True`; this handle supports `seek`."""
        return True

    def tell(self) -> int:
        """Return the current absolute read position in bytes."""
        return self._current_position

    def truncate(self, size: int | None = None) -> int:
        """Not supported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support truncation")

    def writable(self) -> bool:
        """Return `False`; this handle is read only."""
        return False

    def write(self, b: bytes, /) -> int:
        """Not supported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def writelines(self, b: Iterable[bytes], /) -> None:
        """Not supported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def _update_eof_from_content_range(self, content_range: str) -> None:
        """Set the EOF flag if a ``Content-Range`` value ends at the last byte.

        Parameters
        ----------
        content_range : `str`
            Value of the ``Content-Range`` response header, expected to look
            like ``bytes <first>-<last>/<total>``.
        """
        units, _, range_string = content_range.strip().partition(" ")
        if units != "bytes":
            self._log.warning("Requested byte range from server but instead got: %s", content_range)
            return

        byte_range, _, total = range_string.strip().partition("/")
        _, dash, last = byte_range.partition("-")
        if not dash or total in ("", "*"):
            # Either no explicit range or the total size is unknown, so the
            # header cannot tell us whether the end has been reached.
            return
        try:
            reached_end = int(last) >= int(total) - 1
        except ValueError:
            self._log.warning("Unable to parse Content-Range header from server: %s", content_range)
            return
        if reached_end:
            self._eof = True

    def read(self, size: int = -1) -> bytes:
        """Read bytes from the remote resource.

        Parameters
        ----------
        size : `int`, optional
            Maximum number of bytes to read. A negative value (the default)
            or `None` reads everything from the current position to the end
            of the resource.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty if the end of the resource has been
            reached or if ``size`` is zero.

        Raises
        ------
        FileNotFoundError
            Raised if the server responds with an unexpected status code.
        """
        if size is None or size < 0:
            size = -1

        if self._eof:
            # At EOF so always return an empty byte string.
            return b""

        if size == 0:
            # Nothing was asked for. Sending "bytes=N-(N-1)" would be an
            # invalid range and could wrongly be interpreted as EOF.
            return b""

        # Branch for when the complete file has been read before.
        if self._completeBuffer is not None:
            result = self._completeBuffer.read(size)
            self._current_position += len(result)
            return result

        if size < 0 and self._current_position == 0:
            # The whole file has been requested, read it into a buffer and
            # return the result. The buffer is only attached to the handle
            # after a successful download so that a failed request does not
            # leave an empty buffer masquerading as the file content.
            with time_this(self._log, msg="Read from remote resource %s", args=(self._url,)):
                resp = self._session.get(self._url, stream=False, timeout=self._timeout)
            if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
                raise FileNotFoundError(f"Unable to read resource {self._url}; status code: {code}")

            content = resp.content
            buffer = io.BytesIO(content)
            buffer.seek(0, io.SEEK_END)
            self._completeBuffer = buffer
            self._current_position = len(content)
            return content

        # A partial read is required, either because a size has been specified,
        # or a read has previously been done. Any time we specify a byte range
        # we must disable the gzip compression on the server since we want
        # to address ranges in the uncompressed file. If we send ranges that
        # are interpreted by the server as offsets into the compressed file
        # then that is at least confusing and also there is no guarantee that
        # the bytes can be uncompressed.
        start_pos = self._current_position
        end_pos = start_pos + (size - 1) if size > 0 else ""
        headers = {"Range": f"bytes={start_pos}-{end_pos}", "Accept-Encoding": "identity"}

        with time_this(
            self._log, msg="Read from remote resource %s using headers %s", args=(self._url, headers)
        ):
            resp = self._session.get(self._url, stream=False, timeout=self._timeout, headers=headers)

        code = resp.status_code
        if code == requests.codes.range_not_satisfiable:
            # Must have run off the end of the file. A standard file handle
            # will treat this as EOF so be consistent with that. Do not change
            # the current position.
            self._eof = True
            return b""

        if code not in (requests.codes.ok, requests.codes.partial):
            raise FileNotFoundError(
                f"Unable to read resource {self._url}, or bytes are out of range; status code: {code}"
            )

        content = resp.content

        if code != requests.codes.partial:
            # The server ignored the range request and sent the whole file.
            # Cache it and serve the request from the position the caller
            # is actually at, not from the start of the file.
            buffer = io.BytesIO(content)
            buffer.seek(start_pos, io.SEEK_SET)
            self._completeBuffer = buffer
            result = buffer.read(size)
            self._current_position += len(result)
            return result

        if size > 0 and len(content) > size:
            # A partial response that is longer than requested. Hand back
            # only what was asked for and do not trust the range end in the
            # response header for EOF detection since we discarded bytes.
            self._log.warning(
                "Server returned %d bytes from %s but only %d were requested",
                len(content),
                self._url,
                size,
            )
            content = content[:size]
        else:
            # The response header should tell us the total number of bytes
            # in the file and also the current position we have got to in the
            # server.
            if "Content-Range" in resp.headers:
                self._update_eof_from_content_range(resp.headers["Content-Range"])

            # An open-ended request always runs to the end of the file.
            # Otherwise try to guess that we overran the end. This will not
            # help if we read exactly the number of bytes to get us to the
            # end and so we will need to do one more read and get a 416.
            if size < 0 or len(content) < size:
                self._eof = True

        self._current_position += len(content)
        return content