Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 82%

159 statements  

coverage.py v7.2.3, created at 2023-04-20 03:07 -0700

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("S3ResourceHandle",) 

15 

16import warnings 

17from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation 

18from logging import Logger 

19from typing import TYPE_CHECKING, Iterable, Mapping, Optional 

20 

21from botocore.exceptions import ClientError 

22from lsst.utils.timer import time_this 

23 

24from ..s3utils import all_retryable_errors, backoff, max_retry_time 

25from ._baseResourceHandle import BaseResourceHandle, CloseStatus 

26 

27if TYPE_CHECKING: 

28 import boto3 

29 

30 

31class S3ResourceHandle(BaseResourceHandle[bytes]): 

32 """S3 specialization of `BaseResourceHandle` 

33 

34 Parameters 

35 ---------- 

36 mode : `str` 

37 Handle modes as described in the Python `io` module.

38 log : `~logging.Logger` 

39 Logger to use when writing messages.

40 client : `boto3.client` 

41 An existing boto3 client that will be used for interacting with the 

42 remote S3 server.

43 bucket : `str` 

44 The name of the S3 bucket of this resource.

45 key : `str` 

46 The identifier of the resource within the specified bucket. 

47 newline : `bytes`

48 When doing multiline operations, break the stream on the given

49 character. Defaults to newline (``b"\n"``).

50 

51 Notes

52 -----

53 It is only possible to incrementally flush this object if each chunk

54 that is flushed is at least 5 MiB in size. The flush command is ignored

55 until the internal buffer reaches this size, or until close is called,

56 whichever comes first.

57 

58 Once an instance in write mode is flushed, it is not possible to seek back 

59 to a position in the byte stream before the flush was executed.

60 

61 When opening a resource in read/write mode (r+ or w+), no incremental

62 flushing is possible; all data is buffered until the resource is closed,

63 at which point the buffered data is written. Additionally, the entire

64 contents of the resource are loaded into memory upon opening.

65 

66 For documentation of the methods of this class, refer to the

67 corresponding methods in the `io` module.

68 

69 S3 handles only support operations in binary mode. To get other modes of 

70 reading and writing, wrap this handle inside an `io.TextIOWrapper` context 

71 manager. An example of this can be found in `S3ResourcePath`. 
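
Examples

--------

A minimal sketch of wrapping a handle for text-mode reads; the client,

logger, bucket, and key used here are hypothetical stand-ins:

>>> import io

>>> import logging

>>> import boto3

>>> client = boto3.client("s3")

>>> log = logging.getLogger("lsst.resources")

>>> handle = S3ResourceHandle("rb", log, client, "some-bucket", "some/key")

>>> with io.TextIOWrapper(handle, encoding="utf-8") as buffer:

...     content = buffer.read()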

72 """ 

73 

74 def __init__( 

75 self, mode: str, log: Logger, client: "boto3.client", bucket: str, key: str, newline: bytes = b"\n" 

76 ): 

77 super().__init__(mode, log, newline=newline) 

78 self._client = client 

79 self._bucket = bucket 

80 self._key = key 

81 self._buffer = BytesIO() 

82 self._position = 0 

83 self._writable = False 

84 self._last_flush_position: Optional[int] = None 

85 self._warned = False 

86 self._readable = bool({"r", "+"} & set(self._mode)) 

87 self._max_size: int | None = None 

88 self._recursing = False 

89 if {"w", "a", "x", "+"} & set(self._mode): 

90 self._writable = True 
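# Any write mode eagerly starts a server-side multipart upload here;

# parts are uploaded by flush() and the upload is finalized (or, for

# small payloads, replaced by a single put_object) in close().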

91 self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key) 

92 self._partNo = 1 

93 self._parts: list[Mapping] = [] 

94 # Below is a workaround for append mode. It must read in everything

95 # that already exists in the file so that it is in the buffer to

96 # append to, and can subsequently be written back out with any

97 # newly added data.

98 if {"a", "+"} & set(self._mode): 

99 # Cheat a bit to get the existing data from the handle using 

100 # object interfaces, because we know this is safe. 

101 # Save the requested mode and readability. 

102 mode_save = self._mode 

103 read_save = self._readable 

104 # Update each of these internal variables to ensure the handle 

105 # is strictly readable. 

106 self._readable = True 

107 self._mode += "r" 

108 self._mode = self._mode.replace("+", "") 

109 # As mentioned, this reads the existing contents and writes them

110 # out into the internal buffer; no writes actually happen until

111 # the handle is flushed.

112 self.write(self.read()) 

113 # Restore the requested states. 

114 self._mode = mode_save 

115 self._readable = read_save 

116 # Set the state of the stream if the specified mode is read 

117 # and write. 

118 if "+" in self._mode: 

119 self.seek(0) 

120 # If a file is w+ it is read/write, but should be truncated

121 # for future writes. 

122 if "w" in self._mode: 

123 self.truncate() 

124 

125 def tell(self) -> int: 

126 return self._position 

127 

128 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time) 

129 def close(self) -> None: 

130 if self.writable(): 

131 # decide if this is a multipart upload 

132 if self._parts: 

133 # indicate that the object is in closing status 

134 self._closed = CloseStatus.CLOSING 

135 self.flush() 

136 with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)): 

137 self._client.complete_multipart_upload( 

138 Bucket=self._multiPartUpload["Bucket"], 

139 Key=self._multiPartUpload["Key"], 

140 UploadId=self._multiPartUpload["UploadId"], 

141 MultipartUpload={"Parts": self._parts}, 

142 ) 

143 else: 

144 # Put the complete object at once 

145 with time_this(self._log, msg="Write to %s", args=(self,)): 

146 self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue()) 

147 self._closed = CloseStatus.CLOSED 

148 

149 @property 

150 def closed(self) -> bool: 

151 return self._closed == CloseStatus.CLOSED 

152 

153 def fileno(self) -> int: 

154 raise UnsupportedOperation("S3 object does not have a file number") 

155 

156 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time) 

157 def flush(self) -> None: 

158 # If the object is closed, not writable, or in rw mode, flush should

159 # be skipped. rw mode skips flush because the whole bytestream must be

160 # kept in the buffer for seeking reasons.

161 if self.closed or not self.writable() or "+" in self._mode: 

162 return 

163 # Disallow seeks to a position prior to the previous flush; this

164 # allows multipart uploads to upload content as the stream is

165 # written to.

166 s3_min_bytes = 5 * 1024 * 1024  # S3 requires parts of at least 5 MiB.

167 if (

168 (self.tell() - (self._last_flush_position or 0)) < s3_min_bytes

169 and self._closed != CloseStatus.CLOSING

170 and not self._warned

171 ):

172 amount = s3_min_bytes / (1024 * 1024)

173 warnings.warn(f"S3 does not support flushing objects less than {amount} MiB, skipping")

174 self._warned = True

175 return

176 # Nothing to write; don't create an empty upload.

177 if self.tell() == 0:  177 ↛ 178: line 177 didn't jump to line 178, because the condition on line 177 was never true

178 return 

179 with time_this( 

180 self._log, 

181 msg="Upload multipart %d to %s", 

182 args=( 

183 self._partNo, 

184 self, 

185 ), 

186 ): 

187 response = self._client.upload_part( 

188 Body=self._buffer.getvalue(), 

189 Bucket=self._bucket, 

190 Key=self._key, 

191 UploadId=self._multiPartUpload["UploadId"], 

192 PartNumber=self._partNo, 

193 ) 

194 self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]}) 

195 self._partNo += 1 
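# Track the absolute stream position of the data flushed so far, so

# that seek() can reject moves into already-uploaded bytes.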

196 self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0) 

197 self._buffer = BytesIO() 

198 

199 @property 

200 def isatty(self) -> bool: 

201 return False 

202 

203 def readable(self) -> bool: 

204 return self._readable 

205 

206 def readline(self, size: int = -1) -> bytes: 

207 raise OSError("S3 does not support line-by-line reads")

208 

209 def readlines(self, hint: int = -1) -> Iterable[bytes]: 
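# S3 offers no line-oriented reads; fetch the entire object and split

# it on the newline delimiter supplied at construction time.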

210 self.seek(0) 

211 return self.read().split(self._newline) 

212 

213 def seek(self, offset: int, whence: int = SEEK_SET) -> int: 

214 if self.writable(): 
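# Once data has been flushed, the internal buffer holds only the

# unflushed tail, so absolute offsets must be rebased against the

# last flushed position before being applied to the buffer.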

215 if self._last_flush_position is not None: 

216 if whence == SEEK_SET:  216 ↛ 220: line 216 didn't jump to line 220, because the condition on line 216 was never false

217 offset -= self._last_flush_position

218 if offset < 0:

219 raise OSError("S3 ResourceHandle cannot seek prior to already flushed positions")

220 if whence == SEEK_CUR:  220 ↛ 221: line 220 didn't jump to line 221, because the condition on line 220 was never true

221 if (self.tell() - self._last_flush_position) < 0:

222 raise OSError("S3 ResourceHandle cannot seek prior to already flushed positions")

223 if whence == SEEK_END:  223 ↛ 224: line 223 didn't jump to line 224, because the condition on line 223 was never true

224 raise OSError("S3 ResourceHandle cannot seek referencing the end of the resource")

225 self._buffer.seek(offset, whence) 

226 self._position = self._buffer.tell() 

227 else: 

228 if whence == SEEK_SET:  228 ↛ 230: line 228 didn't jump to line 230, because the condition on line 228 was never false

229 self._position = offset 

230 elif whence == SEEK_CUR: 

231 self._position += offset 

232 elif whence == SEEK_END: 

233 offset = abs(offset) 

234 self._position -= offset 

235 return self._position 

236 

237 def seekable(self) -> bool: 

238 return True 

239 

240 def truncate(self, size: Optional[int] = None) -> int: 

241 if self.writable():  241 ↛ 245: line 241 didn't jump to line 245, because the condition on line 241 was never false

242 self._buffer.truncate(size) 

243 return self._position 

244 else: 

245 raise OSError("S3 ResourceHandle is not writable") 

246 

247 def writable(self) -> bool: 

248 return self._writable 

249 

250 def writelines(self, lines: Iterable[bytes]) -> None: 

251 if self.writable(): 

252 self._buffer.writelines(lines) 

253 self._position = self._buffer.tell() 

254 else: 

255 raise OSError("S3 ResourceHandle is not writable") 

256 

257 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time) 

258 def read(self, size: int = -1) -> bytes: 

259 if not self.readable():  259 ↛ 260: line 259 didn't jump to line 260, because the condition on line 259 was never true

260 raise OSError("S3 ResourceHandle is not readable") 

261 # If the object is rw, then read from the internal io buffer 

262 if "+" in self._mode: 

263 self._buffer.seek(self._position) 

264 return self._buffer.read(size) 

265 # otherwise fetch the appropriate bytes from the remote resource 

266 if self._max_size is not None and self._position >= self._max_size: 

267 return b"" 
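# Build an HTTP Range header; the end byte is inclusive, and an empty

# end ("bytes=N-") requests everything through the end of the object.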

268 if size > 0: 

269 stop = f"{self._position + size - 1}" 

270 else: 

271 stop = "" 

272 args = {"Range": f"bytes={self._position}-{stop}"} 

273 try: 

274 response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args) 

275 contents = response["Body"].read() 

276 response["Body"].close() 

277 self._position += len(contents) 

278 return contents 

279 except ClientError as exc: 

280 if exc.response["ResponseMetadata"]["HTTPStatusCode"] == 416:  280 ↛ 292: line 280 didn't jump to line 292, because the condition on line 280 was never false

281 if self._recursing: 

282 # This means the function has attempted to read the whole 

283 # byte range and failed again, meaning the previous byte 

284 # was the last byte 

285 return b"" 
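# A 416 may simply mean the requested range starts at or beyond the

# end of the object. Retry once with an unbounded read to find the

# true end, cache it in _max_size, and return whatever remains.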

286 self._recursing = True 

287 result = self.read() 

288 self._max_size = self._position 

289 self._recursing = False 

290 return result 

291 else: 

292 raise 

293 

294 def write(self, b: bytes) -> int: 

295 if self.writable():  295 ↛ 300: line 295 didn't jump to line 300, because the condition on line 295 was never false

296 result = self._buffer.write(b) 

297 self._position = self._buffer.tell() 

298 return result 

299 else: 

300 raise OSError("S3 ResourceHandle is not writable")