Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 83%

160 statements  

coverage.py v7.2.7, created at 2023-07-25 09:29 +0000
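
Editorial note: annotations of the form "coverage: N ↛ M, condition was never true/false" below are coverage.py partial-branch markers, meaning the jump from line N to line M was never taken during the measured test run.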

  1  # This file is part of lsst-resources.
  2  #
  3  # Developed for the LSST Data Management System.
  4  # This product includes software developed by the LSST Project
  5  # (https://www.lsst.org).
  6  # See the COPYRIGHT file at the top-level directory of this distribution
  7  # for details of code ownership.
  8  #
  9  # Use of this source code is governed by a 3-clause BSD-style
 10  # license that can be found in the LICENSE file.
 11
 12  from __future__ import annotations
 13
 14  __all__ = ("S3ResourceHandle",)
 15
 16  import warnings
 17  from collections.abc import Iterable, Mapping
 18  from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation
 19  from logging import Logger
 20  from typing import TYPE_CHECKING
 21
 22  from botocore.exceptions import ClientError
 23  from lsst.utils.timer import time_this
 24
 25  from ..s3utils import all_retryable_errors, backoff, max_retry_time
 26  from ._baseResourceHandle import BaseResourceHandle, CloseStatus
 27
 28  if TYPE_CHECKING:
 29      import boto3
 30
 31
 32  class S3ResourceHandle(BaseResourceHandle[bytes]):
 33      """S3 specialization of `.BaseResourceHandle`.
 34
 35      Parameters
 36      ----------
 37      mode : `str`
 38          Handle modes as described in the Python `io` module.
 39      log : `~logging.Logger`
 40          Logger to use when writing messages.
 41      client : `boto3.client`
 42          An existing boto3 client that will be used for interacting with the
 43          remote s3 server.
 44      bucket : `str`
 45          The name of the s3 bucket of this resource.
 46      key : `str`
 47          The identifier of the resource within the specified bucket.
 48      newline : `bytes`
 49          When doing multiline operations, break the stream on the given
 50          character. Defaults to newline.
 51
 52      Note
 53      ----
 54      It is only possible to incrementally flush this object if each chunk
 55      that is flushed is at least 5 MiB in size. The flush command is ignored
 56      until the internal buffer reaches this size, or until close is called,
 57      whichever comes first.
 58
 59      Once an instance in write mode is flushed, it is not possible to seek
 60      back to a position in the byte stream before the flush was executed.
 61
 62      When opening a resource in read-write mode ("r+" or "w+"), no flushing
 63      is possible; all data is buffered until the resource is closed, at
 64      which point the buffered data is written. Additionally, the entire
 65      contents of the resource are loaded into memory upon opening.
 66
 67      For documentation on the methods of this class, refer to the
 68      corresponding methods in the `io` module.
 69
 70      S3 handles only support operations in binary mode. To get other modes
 71      of reading and writing, wrap this handle inside an `io.TextIOWrapper`
 72      context manager. An example of this can be found in `S3ResourcePath`.
 73      """
 74
 75      def __init__(
 76          self, mode: str, log: Logger, client: boto3.client, bucket: str, key: str, newline: bytes = b"\n"
 77      ):
 78          super().__init__(mode, log, newline=newline)
 79          self._client = client
 80          self._bucket = bucket
 81          self._key = key
 82          self._buffer = BytesIO()
 83          self._position = 0
 84          self._writable = False
 85          self._last_flush_position: int | None = None
 86          self._warned = False
 87          self._readable = bool({"r", "+"} & set(self._mode))
 88          self._max_size: int | None = None
 89          self._recursing = False
 90          if {"w", "a", "x", "+"} & set(self._mode):
 91              self._writable = True
 92              self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key)
 93              self._partNo = 1
 94              self._parts: list[Mapping] = []
 95              # Below is a workaround for append mode. It basically must read
 96              # in everything that exists in the file so that it is in the
 97              # buffer to append to, and subsequently written back out with
 98              # any newly added data.
 99              if {"a", "+"} & set(self._mode):
100                  # Cheat a bit to get the existing data from the handle using
101                  # object interfaces, because we know this is safe.
102                  # Save the requested mode and readability.
103                  mode_save = self._mode
104                  read_save = self._readable
105                  # Update each of these internal variables to ensure the
106                  # handle is strictly readable.
107                  self._readable = True
108                  self._mode += "r"
109                  self._mode = self._mode.replace("+", "")
110                  # As mentioned, this reads the existing contents and writes
111                  # them into the internal buffer; no writes actually happen
112                  # until the handle is flushed.
113                  self.write(self.read())
114                  # Restore the requested states.
115                  self._mode = mode_save
116                  self._readable = read_save
117                  # Set the state of the stream if the specified mode is read
118                  # and write.
119                  if "+" in self._mode:
120                      self.seek(0)
121                      # If a file is w+ it is read-write, but should be
122                      # truncated for future writes.
123                      if "w" in self._mode:
124                          self.truncate()
125
126      def tell(self) -> int:
127          return self._position
128
129      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
130      def close(self) -> None:
131          if self.writable():
132              # Decide if this is a multipart upload.
133              if self._parts:
134                  # Indicate that the object is in closing status.
135                  self._closed = CloseStatus.CLOSING
136                  self.flush()
137                  with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)):
138                      self._client.complete_multipart_upload(
139                          Bucket=self._multiPartUpload["Bucket"],
140                          Key=self._multiPartUpload["Key"],
141                          UploadId=self._multiPartUpload["UploadId"],
142                          MultipartUpload={"Parts": self._parts},
143                      )
144              else:
145                  # Put the complete object at once.
146                  with time_this(self._log, msg="Write to %s", args=(self,)):
147                      self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue())
148          self._closed = CloseStatus.CLOSED
149
150      @property
151      def closed(self) -> bool:
152          return self._closed == CloseStatus.CLOSED
153
154      def fileno(self) -> int:
155          raise UnsupportedOperation("S3 object does not have a file number")
156
157      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
158      def flush(self) -> None:
159          # If the object is closed, not writable, or read-write, flush should
160          # be skipped. rw mode skips flush because the whole bytestream must
161          # be kept in the buffer for seeking reasons.
162          if self.closed or not self.writable() or "+" in self._mode:
163              return
164          # Disallow seeking to a position prior to the previous flush; this
165          # allows multipart uploads to upload content as the stream is
166          # written to.
167          s3_min_bits = 5 * 1024 * 1024  # S3 flush threshold is 5 MiB.
168          if (
169              (self.tell() - (self._last_flush_position or 0)) < s3_min_bits
170              and not self._closed == CloseStatus.CLOSING
171              and not self._warned
172          ):
173              amount = s3_min_bits / (1024 * 1024)
174              warnings.warn(f"S3 does not support flushing objects less than {amount} MiB, skipping")
175              self._warned = True
176              return
177          # Nothing to write; don't create an empty upload.
178          if self.tell() == 0:  # coverage: 178 ↛ 179, condition was never true
179              return
180          with time_this(
181              self._log,
182              msg="Upload multipart %d to %s",
183              args=(
184                  self._partNo,
185                  self,
186              ),
187          ):
188              response = self._client.upload_part(
189                  Body=self._buffer.getvalue(),
190                  Bucket=self._bucket,
191                  Key=self._key,
192                  UploadId=self._multiPartUpload["UploadId"],
193                  PartNumber=self._partNo,
194              )
195          self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]})
196          self._partNo += 1
197          self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0)
198          self._buffer = BytesIO()
199
200      @property
201      def isatty(self) -> bool:
202          return False
203
204      def readable(self) -> bool:
205          return self._readable
206
207      def readline(self, size: int = -1) -> bytes:
208          raise OSError("S3 does not support line-by-line reads")
209
210      def readlines(self, hint: int = -1) -> Iterable[bytes]:
211          self.seek(0)
212          return self.read().split(self._newline)
213
214      def seek(self, offset: int, whence: int = SEEK_SET) -> int:
215          if self.writable():
216              if self._last_flush_position is not None:
217                  if whence == SEEK_SET:  # coverage: 217 ↛ 221, condition was never false
218                      offset -= self._last_flush_position
219                      if offset < 0:
220                          raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
221                  if whence == SEEK_CUR:  # coverage: 221 ↛ 222, condition was never true
222                      if (self.tell() - self._last_flush_position) < 0:
223                          raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
224                  if whence == SEEK_END:  # coverage: 224 ↛ 225, condition was never true
225                      raise OSError("S3 ResourceHandle can not seek referencing the end of the resource")
226              self._buffer.seek(offset, whence)
227              self._position = self._buffer.tell()
228          else:
229              if whence == SEEK_SET:  # coverage: 229 ↛ 231, condition was never false
230                  self._position = offset
231              elif whence == SEEK_CUR:
232                  self._position += offset
233              elif whence == SEEK_END:
234                  offset = abs(offset)
235                  self._position -= offset
236          return self._position
237
238      def seekable(self) -> bool:
239          return True
240
241      def truncate(self, size: int | None = None) -> int:
242          if self.writable():  # coverage: 242 ↛ 246, condition was never false
243              self._buffer.truncate(size)
244              return self._position
245          else:
246              raise OSError("S3 ResourceHandle is not writable")
247
248      def writable(self) -> bool:
249          return self._writable
250
251      def writelines(self, lines: Iterable[bytes]) -> None:
252          if self.writable():
253              self._buffer.writelines(lines)
254              self._position = self._buffer.tell()
255          else:
256              raise OSError("S3 ResourceHandle is not writable")
257
258      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
259      def read(self, size: int = -1) -> bytes:
260          if not self.readable():  # coverage: 260 ↛ 261, condition was never true
261              raise OSError("S3 ResourceHandle is not readable")
262          # If the object is rw, then read from the internal io buffer.
263          if "+" in self._mode:
264              self._buffer.seek(self._position)
265              return self._buffer.read(size)
266          # Otherwise fetch the appropriate bytes from the remote resource.
267          if self._max_size is not None and self._position >= self._max_size:
268              return b""
269          if size > 0:
270              stop = f"{self._position + size - 1}"
271          else:
272              stop = ""
273          args = {"Range": f"bytes={self._position}-{stop}"}
274          try:
275              response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args)
276              contents = response["Body"].read()
277              response["Body"].close()
278              self._position += len(contents)
279              return contents
280          except ClientError as exc:
281              if exc.response["ResponseMetadata"]["HTTPStatusCode"] == 416:  # coverage: 281 ↛ 293, condition was never false
282                  if self._recursing:
283                      # This means the function has attempted to read the whole
284                      # byte range and failed again, meaning the previous byte
285                      # was the last byte.
286                      return b""
287                  self._recursing = True
288                  result = self.read()
289                  self._max_size = self._position
290                  self._recursing = False
291                  return result
292              else:
293                  raise
294
295      def write(self, b: bytes) -> int:
296          if self.writable():  # coverage: 296 ↛ 301, condition was never false
297              result = self._buffer.write(b)
298              self._position = self._buffer.tell()
299              return result
300          else:
301              raise OSError("S3 ResourceHandle is not writable")
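
Usage note (editorial): the listing shows that writes accumulate in an in-memory buffer and are uploaded either as a single put_object call on close() or as multipart parts once at least 5 MiB has been flushed, while reads are served by ranged GET requests. Below is a minimal sketch of driving the handle directly; the logger, the boto3 client configuration, and the bucket/key names are illustrative assumptions, and in normal use a handle would come from `S3ResourcePath` (as the class docstring notes) rather than being constructed by hand.

import io
import logging

import boto3

from lsst.resources._resourceHandles._s3ResourceHandle import S3ResourceHandle

log = logging.getLogger("example")  # assumption: any logger will do
client = boto3.client("s3")  # assumption: credentials/endpoint come from the environment

# Binary write: bytes accumulate in the internal buffer; close() issues a
# single put_object here because less than 5 MiB was written, so flush()
# would have been a no-op.
handle = S3ResourceHandle("wb", log, client, bucket="example-bucket", key="example/key.txt")
handle.write(b"hello world\n")
handle.close()

# Binary read: read() fetches the requested byte range from the remote object.
handle = S3ResourceHandle("rb", log, client, bucket="example-bucket", key="example/key.txt")
data = handle.read()
handle.close()

# Text mode, as suggested by the class docstring: wrap the binary handle in
# an io.TextIOWrapper (closing the wrapper also closes the handle).
handle = S3ResourceHandle("rb", log, client, bucket="example-bucket", key="example/key.txt")
with io.TextIOWrapper(handle, encoding="utf-8") as text_handle:
    print(text_handle.read())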