Coverage for python/lsst/resources/_resourceHandles/_httpResourceHandle.py: 24%

125 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-19 11:17 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpReadResourceHandle",) 

15 

16import io 

17import re 

18from collections.abc import Callable, Iterable 

19from logging import Logger 

20from typing import AnyStr, NamedTuple 

21 

22import requests 

23from lsst.utils.timer import time_this 

24 

25from ._baseResourceHandle import BaseResourceHandle, CloseStatus 

26 

27 

class HttpReadResourceHandle(BaseResourceHandle[bytes]):
    """HTTP-based specialization of `.BaseResourceHandle`.

    The handle is read-only. Data is fetched with HTTP GET requests: a
    request for the whole resource from position zero downloads it once and
    keeps it in an in-memory buffer that serves all later reads and seeks;
    any other read is issued as a byte-range request.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    session : `requests.Session`
        The session to use for this handle.
    url : `str`
        URL of remote resource.
    timeout : `tuple` [`float`, `float`]
        Timeout to use for connections: connection timeout and read timeout
        in a tuple.
    newline : `str` or `None`, optional
        When doing multiline operations, break the stream on given character.
        Defaults to newline. If a file is opened in binary mode, this argument
        is not used, as binary files will only split lines on the binary
        newline representation.

    Raises
    ------
    ValueError
        Raised if ``url``, ``session`` or ``timeout`` is `None`. They are
        keyword arguments with a `None` default but are all mandatory.
    """

    def __init__(
        self,
        mode: str,
        log: Logger,
        *,
        session: requests.Session | None = None,
        url: str | None = None,
        timeout: tuple[float, float] | None = None,
        newline: AnyStr | None = None,
    ) -> None:
        super().__init__(mode, log, newline=newline)
        if url is None:
            raise ValueError("Url must be specified when constructing this object")
        self._url = url
        if session is None:
            raise ValueError("Session must be specified when constructing this object")
        self._session = session

        if timeout is None:
            raise ValueError("timeout must be specified when constructing this object")
        self._timeout = timeout

        # Holds the entire remote resource once it has been downloaded;
        # `None` until then.
        self._completeBuffer: io.BytesIO | None = None

        self._closed = CloseStatus.OPEN
        # Absolute offset, in bytes, of the next byte to be read.
        self._current_position = 0
        self._eof = False

    def close(self) -> None:
        """Mark the handle closed and release any cached content."""
        self._closed = CloseStatus.CLOSED
        self._completeBuffer = None
        # Setting EOF makes subsequent reads return an empty byte string.
        self._eof = True

    @property
    def closed(self) -> bool:
        """Whether `close` has been called on this handle (`bool`)."""
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        """Always raise: there is no file descriptor behind this handle.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandle does not have a file number")

    def flush(self) -> None:
        """Do nothing for a read mode; raise if the mode implies writing.

        Raises
        ------
        io.UnsupportedOperation
            Raised if the handle mode contains any of ``w``, ``x``, ``a``
            or ``+``.
        """
        modes = set(self._mode)
        if {"w", "x", "a", "+"} & modes:
            raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    # NOTE(review): exposed as a property, unlike `io.IOBase.isatty` which is
    # a method; presumably this mirrors the base class declaration - confirm.
    @property
    def isatty(self) -> bool | Callable[[], bool]:
        """Always `False`; this handle is never a terminal."""
        return False

    def readable(self) -> bool:
        """Return `True`; the handle supports reading."""
        return True

    def readline(self, size: int = -1) -> bytes:
        """Always raise: line-oriented reading is not supported.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def readlines(self, size: int = -1) -> Iterable[bytes]:
        """Always raise: line-oriented reading is not supported.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the read position.

        Parameters
        ----------
        offset : `int`
            Offset in bytes, interpreted according to ``whence``.
        whence : `int`, optional
            `io.SEEK_SET` (absolute, the default) or `io.SEEK_CUR` (relative
            to the current position). Seeking relative to the end is not
            supported.

        Returns
        -------
        position : `int`
            The new absolute position.

        Raises
        ------
        io.UnsupportedOperation
            Raised if ``whence`` is not supported or the resulting position
            would be negative.
        """
        # The EOF flag is cleared before validation, so even a rejected seek
        # re-enables reading.
        self._eof = False

        if whence == io.SEEK_CUR:
            target = self._current_position + offset
        elif whence == io.SEEK_SET:
            target = offset
        else:
            target = -1
        if target < 0:
            raise io.UnsupportedOperation("Seek value is incorrect, or whence mode is unsupported")
        self._current_position = target

        # Keep the cached copy of the file, if any, in step. The position
        # computed above is absolute, so it must be applied as an absolute
        # seek whatever ``whence`` the caller used.
        if self._completeBuffer is not None:
            self._completeBuffer.seek(self._current_position)
        return self._current_position

    def seekable(self) -> bool:
        """Return `True`; the handle supports `seek` and `tell`."""
        return True

    def tell(self) -> int:
        """Return the current absolute read position in bytes."""
        return self._current_position

    def truncate(self, size: int | None = None) -> int:
        """Always raise: the handle is read only.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support truncation")

    def writable(self) -> bool:
        """Return `False`; the handle does not support writing."""
        return False

    def write(self, b: bytes, /) -> int:
        """Always raise: the handle is read only.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def writelines(self, b: Iterable[bytes], /) -> None:
        """Always raise: the handle is read only.

        Raises
        ------
        io.UnsupportedOperation
            Always raised.
        """
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def read(self, size: int = -1) -> bytes:
        """Read bytes from the remote resource at the current position.

        Parameters
        ----------
        size : `int`, optional
            Maximum number of bytes to read. A negative value (the default)
            or `None` reads everything up to the end of the resource.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty once the end of the resource has been
            reached, after `close`, or if ``size`` is zero.

        Raises
        ------
        FileNotFoundError
            Raised if the server answers with a status other than 200 or
            206. A 416 answer to a range request is treated as end of file
            rather than as an error.
        ValueError
            Raised if the server sends a ``Content-Range`` header that can
            not be parsed.
        """
        if self._eof:
            # At EOF so always return an empty byte string.
            return b""

        # Treat every "read to the end" spelling alike.
        if size is None or size < 0:
            size = -1

        if size == 0:
            # Nothing requested. Do not contact the server: a zero-length
            # range can not be expressed in a Range header.
            return b""

        # Branch for if the complete file has been read before.
        if self._completeBuffer is not None:
            result = self._completeBuffer.read(size)
            self._current_position += len(result)
            return result

        if size < 0 and self._current_position == 0:
            # The whole file has been requested, read it into a buffer and
            # return the result.
            with time_this(self._log, msg="Read from remote resource %s", args=(self._url,)):
                resp = self._session.get(self._url, stream=False, timeout=self._timeout)
            if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
                raise FileNotFoundError(f"Unable to read resource {self._url}; status code: {code}")

            # Only install the cache once the download is known to be good,
            # otherwise a failed request would leave an empty buffer behind
            # that later reads would mistake for the file content.
            content = resp.content
            buffer = io.BytesIO()
            buffer.write(content)
            self._completeBuffer = buffer
            self._current_position = buffer.tell()
            return content

        # A partial read is required, either because a size has been specified,
        # or a read has previously been done. Any time we specify a byte range
        # we must disable the gzip compression on the server since we want
        # to address ranges in the uncompressed file. If we send ranges that
        # are interpreted by the server as offsets into the compressed file
        # then that is at least confusing and also there is no guarantee that
        # the bytes can be uncompressed.

        # Range end is inclusive; leave it empty to read through to the end.
        end_pos = self._current_position + (size - 1) if size >= 0 else ""
        headers = {"Range": f"bytes={self._current_position}-{end_pos}", "Accept-Encoding": "identity"}

        with time_this(
            self._log, msg="Read from remote resource %s using headers %s", args=(self._url, headers)
        ):
            resp = self._session.get(self._url, stream=False, timeout=self._timeout, headers=headers)

        if resp.status_code == requests.codes.range_not_satisfiable:
            # Must have run off the end of the file. A standard file handle
            # will treat this as EOF so be consistent with that. Do not change
            # the current position.
            self._eof = True
            return b""

        if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
            raise FileNotFoundError(
                f"Unable to read resource {self._url}, or bytes are out of range; status code: {code}"
            )

        content = resp.content
        len_content = len(content)

        # Verify this is not actually the whole file and the server did not lie
        # about supporting ranges. The length test only makes sense for a
        # bounded request; an open-ended one legitimately returns any amount.
        if code != requests.codes.partial or (size >= 0 and len_content > size):
            buffer = io.BytesIO(content)
            # The body is the complete file, so resume from where this handle
            # currently is rather than from the start.
            buffer.seek(self._current_position)
            self._completeBuffer = buffer
            return self.read(size=size)

        # The response header should tell us the total number of bytes
        # in the file and also the current position we have got to in the
        # server.
        if "Content-Range" in resp.headers:
            content_range = parse_content_range_header(resp.headers["Content-Range"])
            if (
                content_range.total is not None
                and content_range.range_end is not None
                and content_range.range_end >= content_range.total - 1
            ):
                self._eof = True

        # An open-ended request runs to the end of the file. For a bounded
        # request try to guess that we overran the end. This will not help if
        # we read exactly the number of bytes to get us to the end and so we
        # will need to do one more read and get a 416.
        if size < 0 or len_content < size:
            self._eof = True

        self._current_position += len_content
        return content

223 

224 

class ContentRange(NamedTuple):
    """Values carried by an HTTP ``Content-Range`` response header.

    Any component that the header did not provide is `None`.
    """

    range_start: int | None
    """Zero-based offset of the first byte contained in the response; the
    range is inclusive. `None` when the header carried no byte range.
    """

    range_end: int | None
    """Zero-based offset of the last byte contained in the response; the
    range is inclusive. `None` when the header carried no byte range.
    """

    total: int | None
    """Size of the complete file in bytes. `None` when the header did not
    report the file size.
    """

240 

241 

def parse_content_range_header(header: str) -> ContentRange:
    """Parse an HTTP 'Content-Range' header.

    Three layouts are recognized, all using the ``bytes`` unit::

        bytes <range-start>-<range-end>/<size>
        bytes <range-start>-<range-end>/*
        bytes */<size>

    The unit name is matched case-insensitively and whitespace around the
    value is ignored. The entire value has to match: trailing text after
    the size is rejected rather than silently dropped.

    Parameters
    ----------
    header : `str`
        Value of an HTTP Content-Range header to be parsed.

    Returns
    -------
    content_range : `ContentRange`
        The byte range included in the response and the total file size.
        Components that the header gives as ``*`` are returned as `None`.

    Raises
    ------
    ValueError
        If the header was not in the expected format.
    """
    # One pattern covers all three layouts: the range is either
    # "<start>-<end>" or "*", and the size is either a number or "*".
    match = re.fullmatch(
        r"\s*bytes\s+(?:(?P<start>\d+)-(?P<end>\d+)|\*)/(?:(?P<total>\d+)|\*)\s*",
        header,
        flags=re.IGNORECASE,
    )

    # The pattern also lets "*/*" through, which carries no information and
    # is not one of the accepted layouts.
    if match is None or (match["start"] is None and match["total"] is None):
        raise ValueError(f"Content-Range header in unexpected format: '{header}'")

    start, end, total = match["start"], match["end"], match["total"]
    return ContentRange(
        range_start=None if start is None else int(start),
        range_end=None if end is None else int(end),
        total=None if total is None else int(total),
    )