Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 84%

158 statements · coverage.py v7.3.2, created at 2023-11-30 11:34 +0000

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourceHandle",)

import warnings
from collections.abc import Iterable, Mapping
from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation
from logging import Logger
from typing import TYPE_CHECKING

from botocore.exceptions import ClientError
from lsst.utils.introspection import find_outside_stacklevel
from lsst.utils.timer import time_this

from ..s3utils import all_retryable_errors, backoff, max_retry_time
from ._baseResourceHandle import BaseResourceHandle, CloseStatus

if TYPE_CHECKING:
    import boto3


class S3ResourceHandle(BaseResourceHandle[bytes]):
    """S3 specialization of `.BaseResourceHandle`.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the Python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    client : `boto3.client`
        An existing boto3 client that will be used for interacting with the
        remote S3 server.
    bucket : `str`
        The name of the S3 bucket of this resource.
    key : `str`
        The identifier of the resource within the specified bucket.
    newline : `bytes`
        When doing multiline operations, break the stream on the given
        character. Defaults to newline.

    Notes
    -----
    It is only possible to incrementally flush this object if each chunk that
    is flushed is at least 5 MiB in size. The flush command is ignored until
    the internal buffer reaches this size, or until close is called, whichever
    comes first.

    Once an instance in write mode is flushed, it is not possible to seek back
    to a position in the byte stream from before that flush.

    When a resource is opened in read/write mode ("r+" or "w+") no incremental
    flushing is possible; all data are buffered until the resource is closed,
    at which point the buffered data are written. Additionally, the entire
    contents of the resource are loaded into memory upon opening.

    For documentation on the methods of this class, refer to the
    corresponding methods in the `io` module.

    S3 handles only support operations in binary mode. To get other modes of
    reading and writing, wrap this handle inside an `io.TextIOWrapper` context
    manager. An example of this can be found in `S3ResourcePath`; a sketch is
    also given in the examples below.
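
    Examples
    --------
    A minimal usage sketch. The client, bucket name, and key below are
    illustrative placeholders, not values assumed by this class::

        import logging

        import boto3

        client = boto3.client("s3")
        log = logging.getLogger("example")

        handle = S3ResourceHandle("wb", log, client, "some-bucket", "some/key")
        handle.write(b"some bytes")
        # Flushes below the 5 MiB part size are ignored with a warning;
        # close() always writes out any remaining buffered data.
        handle.close()

    Text modes can be layered on top with `io.TextIOWrapper`::

        import io

        handle = S3ResourceHandle("rb", log, client, "some-bucket", "some/key")
        with io.TextIOWrapper(handle, encoding="utf-8") as text_handle:
            content = text_handle.read()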

    """

    def __init__(
        self, mode: str, log: Logger, client: boto3.client, bucket: str, key: str, newline: bytes = b"\n"
    ):
        super().__init__(mode, log, newline=newline)
        self._client = client
        self._bucket = bucket
        self._key = key
        self._buffer = BytesIO()
        self._position = 0
        self._writable = False
        self._last_flush_position: int | None = None
        self._warned = False
        self._readable = bool({"r", "+"} & set(self._mode))
        self._max_size: int | None = None
        self._recursing = False
        if {"w", "a", "x", "+"} & set(self._mode):
            self._writable = True
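            # A multipart upload is started eagerly; flush() uploads
            # finished parts, and close() either completes the multipart
            # upload or falls back to a single put_object when no part was
            # ever flushed.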
            self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key)
            self._partNo = 1
            self._parts: list[Mapping] = []
            # Below is a workaround for append mode. It must read in
            # everything that already exists in the resource so that it is in
            # the buffer to be appended to, and is subsequently written back
            # out along with any newly added data.
            if {"a", "+"} & set(self._mode):
                # Cheat a bit to get the existing data from the handle using
                # object interfaces, because we know this is safe.
                # Save the requested mode and readability.
                mode_save = self._mode
                read_save = self._readable
                # Update each of these internal variables to ensure the handle
                # is strictly readable.
                self._readable = True
                self._mode += "r"
                self._mode = self._mode.replace("+", "")
                # As mentioned, this reads the existing contents and writes
                # them out into the internal buffer; no writes actually happen
                # until the handle is flushed.
                self.write(self.read())
                # Restore the requested states.
                self._mode = mode_save
                self._readable = read_save
                # Set the state of the stream if the specified mode is read
                # and write.
                if "+" in self._mode:
                    self.seek(0)
                    # If a file is w+ it is read/write, but should be
                    # truncated for future writes.
                    if "w" in self._mode:
                        self.truncate()

    def tell(self) -> int:
        return self._position

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def close(self) -> None:
        if self.writable():
            # Decide if this is a multipart upload.
            if self._parts:
                # Indicate that the object is in closing status.
                self._closed = CloseStatus.CLOSING
                self.flush()
                with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)):
                    self._client.complete_multipart_upload(
                        Bucket=self._multiPartUpload["Bucket"],
                        Key=self._multiPartUpload["Key"],
                        UploadId=self._multiPartUpload["UploadId"],
                        MultipartUpload={"Parts": self._parts},
                    )
            else:
                # Put the complete object at once.
                with time_this(self._log, msg="Write to %s", args=(self,)):
                    self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue())
        self._closed = CloseStatus.CLOSED

    @property
    def closed(self) -> bool:
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        raise UnsupportedOperation("S3 object does not have a file number")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def flush(self) -> None:
        # If the object is closed, not writable, or read/write, flushing is
        # skipped. Read/write mode skips flushing because the whole byte
        # stream must be kept in the buffer for seeking reasons.
        if self.closed or not self.writable() or "+" in self._mode:
            return
        # Disallow writes to seek to a position prior to the previous flush;
        # this allows multipart uploads to upload content as the stream is
        # written to.
        s3_min_bytes = 5 * 1024 * 1024  # S3 multipart upload parts must be at least 5 MiB.
        if (
            (self.tell() - (self._last_flush_position or 0)) < s3_min_bytes
            and self._closed != CloseStatus.CLOSING
            and not self._warned
        ):
            amount = s3_min_bytes / (1024 * 1024)
            warnings.warn(
                f"S3 does not support flushing objects less than {amount} MiB, skipping",
                stacklevel=find_outside_stacklevel(
                    "lsst.resources",
                    "backoff",
                    "contextlib",
                    allow_modules={"lsst.resources.tests"},
                ),
            )
            self._warned = True
            return
        # Nothing to write; don't create an empty upload.
        if self.tell() == 0:
            return
        with time_this(
            self._log,
            msg="Upload multipart %d to %s",
            args=(
                self._partNo,
                self,
            ),
        ):
            response = self._client.upload_part(
                Body=self._buffer.getvalue(),
                Bucket=self._bucket,
                Key=self._key,
                UploadId=self._multiPartUpload["UploadId"],
                PartNumber=self._partNo,
            )
        self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]})
        self._partNo += 1
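        # Record how much of the stream has already been uploaded (seeks
        # before this point are disallowed) and start a fresh buffer for the
        # next part.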
        self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0)
        self._buffer = BytesIO()

    @property
    def isatty(self) -> bool:
        return False

    def readable(self) -> bool:
        return self._readable

    def readline(self, size: int = -1) -> bytes:
        raise OSError("S3 does not support line-by-line reads")

    def readlines(self, hint: int = -1) -> Iterable[bytes]:
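        # S3 has no native line-oriented reads, so read the entire object
        # and split it on the configured newline character.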
        self.seek(0)
        return self.read().split(self._newline)

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
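        # In write mode, offsets are interpreted against the full stream but
        # applied to the in-memory buffer, which only holds bytes written
        # since the last flush; seeking before the last flushed position is
        # therefore an error.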
        if self.writable():
            if self._last_flush_position is not None:
                if whence == SEEK_SET:
                    offset -= self._last_flush_position
                    if offset < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_CUR and (self.tell() - self._last_flush_position) < 0:
                    raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_END:
                    raise OSError("S3 ResourceHandle can not seek referencing the end of the resource")
            self._buffer.seek(offset, whence)
            self._position = self._buffer.tell()
        else:
            if whence == SEEK_SET:
                self._position = offset
            elif whence == SEEK_CUR:
                self._position += offset
            elif whence == SEEK_END:
                offset = abs(offset)
                self._position -= offset
        return self._position

    def seekable(self) -> bool:
        return True

    def truncate(self, size: int | None = None) -> int:
        if self.writable():
            self._buffer.truncate(size)
            return self._position
        else:
            raise OSError("S3 ResourceHandle is not writable")

    def writable(self) -> bool:
        return self._writable

    def writelines(self, lines: Iterable[bytes]) -> None:
        if self.writable():
            self._buffer.writelines(lines)
            self._position = self._buffer.tell()
        else:
            raise OSError("S3 ResourceHandle is not writable")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        if not self.readable():
            raise OSError("S3 ResourceHandle is not readable")
        # If the object is read/write, read from the internal io buffer.
        if "+" in self._mode:
            self._buffer.seek(self._position)
            return self._buffer.read(size)
        # Otherwise fetch the appropriate bytes from the remote resource.
        if self._max_size is not None and self._position >= self._max_size:
            return b""
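        # Build an HTTP Range header; a non-positive size leaves the end of
        # the range open ("bytes=N-"), requesting everything from the current
        # position to the end of the object.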
        stop = f"{self._position + size - 1}" if size > 0 else ""
        args = {"Range": f"bytes={self._position}-{stop}"}
        try:
            response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args)
            contents = response["Body"].read()
            response["Body"].close()
            self._position += len(contents)
            return contents
        except ClientError as exc:
            if exc.response["ResponseMetadata"]["HTTPStatusCode"] == 416:
                if self._recursing:
                    # This means the function has attempted to read the whole
                    # byte range and failed again, meaning the previous byte
                    # was the last byte.
                    return b""
                self._recursing = True
                result = self.read()
                self._max_size = self._position
                self._recursing = False
                return result
            else:
                raise

    def write(self, b: bytes) -> int:
        if self.writable():
            result = self._buffer.write(b)
            self._position = self._buffer.tell()
            return result
        else:
            raise OSError("S3 ResourceHandle is not writable")