Coverage for python/lsst/resources/_resourceHandles/_httpResourceHandle.py: 18%

115 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-03 02:26 -0700

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpReadResourceHandle",) 

15 

16import io 

17from logging import Logger 

18from typing import AnyStr, Callable, Iterable, Optional, Union 

19 

20import requests 

21from lsst.utils.timer import time_this 

22 

23from ._baseResourceHandle import BaseResourceHandle, CloseStatus 

24 

25 

class HttpReadResourceHandle(BaseResourceHandle[bytes]):
    """Read-only, seekable, file-like handle for a resource served over HTTP.

    Data is fetched with GET requests issued through the supplied session.
    Reading the whole resource from position zero downloads it once and keeps
    it in an in-memory buffer that serves all later reads and seeks. Any
    other read is sent as an HTTP ``Range`` request; if the server ignores
    the range and returns the full body, that body is cached and used
    instead.

    Parameters
    ----------
    mode : `str`
        Mode string, forwarded to the base class. Within this class it is
        only inspected by `flush`, which rejects write-capable modes.
    log : `~logging.Logger`
        Logger, forwarded to the base class. This class reads it back as
        ``self._log`` (presumably stored there by the base class) for
        timing messages and warnings.
    session : `requests.Session`
        Session used to issue the GET requests. Required, despite the
        `None` default.
    url : `str`
        URL of the remote resource. Required, despite the `None` default.
    timeout : `tuple` [`float`, `float`]
        Passed unchanged as the ``timeout`` argument of ``session.get``.
        Required, despite the `None` default.
    newline : `str` or `bytes`, optional
        Forwarded to the base class; not otherwise used by this class.

    Raises
    ------
    ValueError
        Raised if ``url``, ``session`` or ``timeout`` is `None`.
    """

    def __init__(
        self,
        mode: str,
        log: Logger,
        *,
        session: Optional[requests.Session] = None,
        url: Optional[str] = None,
        timeout: Optional[tuple[float, float]] = None,
        newline: Optional[AnyStr] = None,
    ) -> None:
        super().__init__(mode, log, newline=newline)
        if url is None:
            raise ValueError("Url must be specified when constructing this object")
        self._url = url
        if session is None:
            raise ValueError("Session must be specified when constructing this object")
        self._session = session

        if timeout is None:
            raise ValueError("timeout must be specified when constructing this object")
        self._timeout = timeout

        # Holds the entire resource once it has been downloaded in one go;
        # `None` until then. Its internal position mirrors _current_position.
        self._completeBuffer: Optional[io.BytesIO] = None

        self._closed = CloseStatus.OPEN
        # Absolute byte offset of the next read.
        self._current_position = 0
        # Set once a read has hit the end of the resource; cleared by seek().
        self._eof = False

    def close(self) -> None:
        """Mark the handle closed and drop any cached content.

        EOF is also flagged, so later `read` calls return ``b""``.
        """
        self._closed = CloseStatus.CLOSED
        self._completeBuffer = None
        self._eof = True

    @property
    def closed(self) -> bool:
        """Whether `close` has been called (`bool`)."""
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        """Unsupported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandle does not have a file number")

    def flush(self) -> None:
        """Do nothing for read modes; reject write-capable modes.

        Raises
        ------
        io.UnsupportedOperation
            Raised if the mode string contains any of ``w``, ``x``, ``a``
            or ``+``.
        """
        modes = set(self._mode)
        if {"w", "x", "a", "+"} & modes:
            raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    @property
    def isatty(self) -> Union[bool, Callable[[], bool]]:
        """Always `False`.

        This is a property, so it must be accessed without calling it.
        """
        return False

    def readable(self) -> bool:
        """Return `True`; the handle always supports reading."""
        return True

    def readline(self, size: int = -1) -> AnyStr:
        """Unsupported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def readlines(self, size: int = -1) -> Iterable[bytes]:
        """Unsupported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support line by line reading")

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the read position.

        Parameters
        ----------
        offset : `int`
            Byte offset, interpreted according to ``whence``.
        whence : `int`, optional
            `io.SEEK_SET` (absolute, the default) or `io.SEEK_CUR` (relative
            to the current position). `io.SEEK_END` is not supported.

        Returns
        -------
        position : `int`
            The new absolute position.

        Raises
        ------
        io.UnsupportedOperation
            Raised if the resulting position would be negative or ``whence``
            is not supported. The handle state is left untouched.
        """
        if whence == io.SEEK_CUR and (self._current_position + offset) >= 0:
            new_position = self._current_position + offset
        elif whence == io.SEEK_SET and offset >= 0:
            new_position = offset
        else:
            raise io.UnsupportedOperation("Seek value is incorrect, or whence mode is unsupported")

        # Only a successful seek may clear EOF.
        self._current_position = new_position
        self._eof = False

        # Handle the case where the complete file has been read already.
        # new_position is already absolute, so the buffer must always be
        # positioned with SEEK_SET; reusing the caller's ``whence`` would
        # apply a relative offset twice.
        if self._completeBuffer is not None:
            self._completeBuffer.seek(new_position, io.SEEK_SET)
        return self._current_position

    def seekable(self) -> bool:
        """Return `True`; the handle supports `seek`."""
        return True

    def tell(self) -> int:
        """Return the current absolute read position."""
        return self._current_position

    def truncate(self, size: Optional[int] = None) -> int:
        """Unsupported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles Do not support truncation")

    def writable(self) -> bool:
        """Return `False`; the handle is read only."""
        return False

    def write(self, b: bytes, /) -> int:
        """Unsupported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def writelines(self, b: Iterable[bytes], /) -> None:
        """Unsupported; always raises `io.UnsupportedOperation`."""
        raise io.UnsupportedOperation("HttpReadResourceHandles are read only")

    def read(self, size: int = -1) -> bytes:
        """Read bytes starting at the current position.

        Parameters
        ----------
        size : `int`, optional
            Maximum number of bytes to return. `None` or any negative value
            means "everything from the current position onwards".

        Returns
        -------
        data : `bytes`
            The bytes read. Empty once EOF has been flagged, or when
            ``size`` is zero.

        Raises
        ------
        FileNotFoundError
            Raised if the server answers with a status other than 200, 206
            or (for ranged reads) 416.
        """
        if self._eof:
            # At EOF so always return an empty byte string.
            return b""

        # Follow the `io` convention: None or any negative size reads all.
        if size is None or size < 0:
            size = -1

        if size == 0:
            # Nothing requested. Do not contact the server: a zero-length
            # range cannot be expressed and must not disturb the EOF state.
            return b""

        # Branch for when the complete file has been read before.
        if self._completeBuffer is not None:
            result = self._completeBuffer.read(size)
            self._current_position += len(result)
            return result

        if size < 0 and self._current_position == 0:
            # The whole file has been requested: fetch it, cache it and
            # return it. The cache is only installed after a successful
            # response so that a failed request cannot leave an empty buffer
            # behind that would masquerade as an empty file.
            with time_this(self._log, msg="Read from remote resource %s", args=(self._url,)):
                resp = self._session.get(self._url, stream=False, timeout=self._timeout)
            if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
                raise FileNotFoundError(f"Unable to read resource {self._url}; status code: {code}")
            self._cache_complete_content(resp.content)
            return self.read(size=size)

        # A partial read is required, either because a size has been
        # specified, or a read has previously been done. Any time we specify
        # a byte range we must disable the gzip compression on the server
        # since we want to address ranges in the uncompressed file. If we
        # send ranges that are interpreted by the server as offsets into the
        # compressed file then that is at least confusing and also there is
        # no guarantee that the bytes can be uncompressed.
        if size > 0:
            byte_range = f"bytes={self._current_position}-{self._current_position + size - 1}"
        else:
            # Open-ended range: everything from the current position.
            byte_range = f"bytes={self._current_position}-"
        headers = {"Range": byte_range, "Accept-Encoding": "identity"}

        with time_this(
            self._log, msg="Read from remote resource %s using headers %s", args=(self._url, headers)
        ):
            resp = self._session.get(self._url, stream=False, timeout=self._timeout, headers=headers)

        if resp.status_code == requests.codes.range_not_satisfiable:
            # Must have run off the end of the file. A standard file handle
            # will treat this as EOF so be consistent with that. Do not
            # change the current position.
            self._eof = True
            return b""

        if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
            raise FileNotFoundError(
                f"Unable to read resource {self._url}, or bytes are out of range; status code: {code}"
            )

        content = resp.content
        len_content = len(content)

        # Verify this is not actually the whole file and the server did not
        # lie about supporting ranges. A non-206 reply means the range was
        # ignored. A 206 that is longer than a bounded request is likewise
        # assumed to be the whole file (NOTE(review): that assumption is
        # inherited from the previous implementation). An open-ended request
        # can never be "too long", so it is not tested here; otherwise the
        # tail of the file would be cached as if it were the whole file.
        if code != requests.codes.partial or (size > 0 and len_content > size):
            self._cache_complete_content(content)
            return self.read(size=size)

        # The response header should tell us the total number of bytes in
        # the file and also the current position we have got to in the
        # server.
        content_range = resp.headers.get("Content-Range")
        if content_range is not None:
            self._flag_eof_from_content_range(content_range)

        # Try to guess that we overran the end. This will not help if we
        # read exactly the number of bytes to get us to the end and so we
        # will need to do one more read and get a 416.
        if size > 0 and len_content < size:
            self._eof = True

        self._current_position += len_content
        return content

    def _cache_complete_content(self, content: bytes) -> None:
        """Store the full resource body, positioned at the current offset."""
        buffer = io.BytesIO(content)
        # The cache holds the file from byte zero, so align it with the
        # absolute position rather than rewinding to the start.
        buffer.seek(self._current_position, io.SEEK_SET)
        self._completeBuffer = buffer

    def _flag_eof_from_content_range(self, content_range: str) -> None:
        """Set the EOF flag if a ``Content-Range`` value shows the end was reached.

        Values that cannot be interpreted are logged and otherwise ignored;
        the caller still has the short-read and 416 fallbacks.
        """
        units, _, range_spec = content_range.strip().partition(" ")
        if units != "bytes":
            self._log.warning("Requested byte range from server but instead got: %s", content_range)
            return

        byte_span, _, total = range_spec.strip().partition("/")
        _, dash, last = byte_span.partition("-")
        total = total.strip()
        if not dash or total == "*":
            # Either no explicit range or an unknown total length: nothing
            # can be concluded about EOF.
            return

        try:
            reached_end = int(last) >= int(total) - 1
        except ValueError:
            self._log.warning("Requested byte range from server but instead got: %s", content_range)
            return
        if reached_end:
            self._eof = True