Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 81%

145 statements  

coverage.py v6.5.0, created at 2023-02-01 02:01 -0800

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourceHandle",)

import warnings
from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation
from logging import Logger
from typing import TYPE_CHECKING, Iterable, Mapping, Optional

from lsst.utils.timer import time_this

from ..s3utils import all_retryable_errors, backoff, max_retry_time
from ._baseResourceHandle import BaseResourceHandle, CloseStatus

if TYPE_CHECKING:
    import boto3


class S3ResourceHandle(BaseResourceHandle[bytes]):
    """S3 specialization of `BaseResourceHandle`.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    client : `boto3.client`
        An existing boto3 client that will be used for interacting with the
        remote s3 server.
    bucket : `str`
        The name of the s3 bucket of this resource.
    key : `str`
        The identifier of the resource within the specified bucket.
    newline : `bytes`
        When doing multiline operations, break the stream on the given
        character. Defaults to newline.

    Notes
    -----
    It is only possible to incrementally flush this object if each chunk that
    is flushed is at least 5 MiB in size. The flush command is ignored until
    the internal buffer reaches this size, or until close is called, whichever
    comes first.

    Once an instance in write mode is flushed, it is not possible to seek back
    to a position in the byte stream before the flush was executed.

    When a resource is opened in read/write mode ("r+" or "w+"), no
    incremental flushing is possible; all data is buffered until the resource
    is closed, at which point the buffered data is written. Additionally, the
    entire contents of the resource are loaded into memory upon opening.

    Documentation for the methods of this class can be found in the
    corresponding methods of the `io` module.

    S3 handles only support operations in binary mode. To get other modes of
    reading and writing, wrap this handle inside an `io.TextIOWrapper` context
    manager. An example of this can be found in `S3ResourcePath`.
    """

    def __init__(
        self, mode: str, log: Logger, client: "boto3.client", bucket: str, key: str, newline: bytes = b"\n"
    ):
        super().__init__(mode, log, newline=newline)
        self._client = client
        self._bucket = bucket
        self._key = key
        self._buffer = BytesIO()
        self._position = 0
        self._writable = False
        self._last_flush_position: Optional[int] = None
        self._warned = False
        self._readable = bool({"r", "+"} & set(self._mode))
        if {"w", "a", "x", "+"} & set(self._mode):
            self._writable = True
            self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key)
            self._partNo = 1
            self._parts: list[Mapping] = []
            # Below is a workaround for append mode. It basically must read
            # in everything that exists in the file so that it is in the
            # buffer to append to, and is subsequently written back out
            # appropriately with any newly added data.
            if {"a", "+"} & set(self._mode):
                # Cheat a bit to get the existing data from the handle using
                # object interfaces, because we know this is safe.
                # Save the requested mode and readability.
                mode_save = self._mode
                read_save = self._readable
                # Update each of these internal variables to ensure the
                # handle is strictly readable.
                self._readable = True
                self._mode += "r"
                self._mode = self._mode.replace("+", "")
                # As mentioned, this reads the existing contents and writes
                # them out into the internal buffer; no writes actually
                # happen until the handle is flushed.
                self.write(self.read())
                # Restore the requested states.
                self._mode = mode_save
                self._readable = read_save
                # Set the state of the stream if the specified mode is read
                # and write.
                if "+" in self._mode:
                    self.seek(0)
                    # If a file is opened "w+" it is read/write, but should
                    # be truncated for future writes.
                    if "w" in self._mode:
                        self.truncate()
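
    # Append-mode behavior sketch (illustrative only; the logger, client,
    # bucket, and key names are hypothetical): an "ab" handle first reads
    # the existing object into the buffer, so subsequent writes extend it
    # and the combined bytes are uploaded on close:
    #
    #     handle = S3ResourceHandle("ab", log, client, "some-bucket", "x.log")
    #     handle.write(b"new line\n")  # lands after the existing contents
    #     handle.close()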

    def tell(self) -> int:
        return self._position

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def close(self) -> None:
        if self.writable():
            # decide if this is a multipart upload
            if self._parts:
                # indicate that the object is in closing status
                self._closed = CloseStatus.CLOSING
                self.flush()
                with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)):
                    self._client.complete_multipart_upload(
                        Bucket=self._multiPartUpload["Bucket"],
                        Key=self._multiPartUpload["Key"],
                        UploadId=self._multiPartUpload["UploadId"],
                        MultipartUpload={"Parts": self._parts},
                    )
            else:
                # Put the complete object at once
                with time_this(self._log, msg="Write to %s", args=(self,)):
                    self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue())
        self._closed = CloseStatus.CLOSED

    @property
    def closed(self) -> bool:
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        raise UnsupportedOperation("S3 object does not have a file number")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def flush(self) -> None:
        # If the object is closed, not writable, or open in read/write mode,
        # flush should be skipped. Read/write mode skips flush because the
        # whole byte stream must be kept in the buffer for seeking reasons.
        if self.closed or not self.writable() or "+" in self._mode:
            return
        # Disallow writes that seek to a position prior to the previous
        # flush; this allows multipart uploads to upload content as the
        # stream is written to.
        s3_min_part_size = 5 * 1024 * 1024  # The S3 minimum part size is 5 MiB.
        if (
            self.tell() - (self._last_flush_position or 0)
        ) < s3_min_part_size and not self._closed == CloseStatus.CLOSING:
            # Skip the flush, warning only on the first occurrence.
            if not self._warned:
                amount = s3_min_part_size / (1024 * 1024)
                warnings.warn(f"S3 does not support flushing objects less than {amount} MiB, skipping")
                self._warned = True
            return
        # Nothing to write; don't create an empty upload.
        if self.tell() == 0:
            return
        with time_this(
            self._log,
            msg="Upload multipart %d to %s",
            args=(
                self._partNo,
                self,
            ),
        ):
            response = self._client.upload_part(
                Body=self._buffer.getvalue(),
                Bucket=self._bucket,
                Key=self._key,
                UploadId=self._multiPartUpload["UploadId"],
                PartNumber=self._partNo,
            )
        self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]})
        self._partNo += 1
        self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0)
        self._buffer = BytesIO()
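
    # Worked illustration of the threshold in flush() above (hypothetical
    # sizes): after writing 3 MiB, flush() warns and keeps buffering; after
    # 3 MiB more the buffer holds 6 MiB >= 5 MiB, so flush() uploads it as
    # one part and resets the buffer.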

    @property
    def isatty(self) -> bool:
        return False

    def readable(self) -> bool:
        return self._readable

    def readline(self, size: int = -1) -> bytes:
        raise OSError("S3 does not support line-by-line reads")

    def readlines(self, hint: int = -1) -> Iterable[bytes]:
        self.seek(0)
        return self.read().split(self._newline)

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        if self.writable():
            if self._last_flush_position is not None:
                if whence == SEEK_SET:
                    offset -= self._last_flush_position
                    if offset < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_CUR:
                    if (self.tell() - self._last_flush_position) < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_END:
                    raise OSError("S3 ResourceHandle can not seek referencing the end of the resource")
            self._buffer.seek(offset, whence)
            self._position = self._buffer.tell()
        else:
            if whence == SEEK_SET:
                self._position = offset
            elif whence == SEEK_CUR:
                self._position += offset
            elif whence == SEEK_END:
                offset = abs(offset)
                self._position -= offset
        return self._position
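
    # Illustrative note on the flush bookkeeping in seek() above
    # (hypothetical numbers): once a 6 MiB part has been flushed,
    # self._last_flush_position is 6291456 and the fresh buffer holds only
    # post-flush bytes, so seek(0, SEEK_SET) maps to buffer offset -6291456
    # and raises OSError.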

    def seekable(self) -> bool:
        return True

    def truncate(self, size: Optional[int] = None) -> int:
        if self.writable():
            self._buffer.truncate(size)
            return self._position
        else:
            raise OSError("S3 ResourceHandle is not writable")

    def writable(self) -> bool:
        return self._writable

    def writelines(self, lines: Iterable[bytes]) -> None:
        if self.writable():
            self._buffer.writelines(lines)
            self._position = self._buffer.tell()
        else:
            raise OSError("S3 ResourceHandle is not writable")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        if not self.readable():
            raise OSError("S3 ResourceHandle is not readable")
        # If the object is read/write, read from the internal io buffer.
        if "+" in self._mode:
            self._buffer.seek(self._position)
            return self._buffer.read(size)
        # Otherwise fetch the appropriate bytes from the remote resource.
        # The HTTP Range header is inclusive at both ends, e.g. reading 5
        # bytes from position 10 requests "bytes=10-14"; an open-ended range
        # such as "bytes=10-" reads to the end of the object.
        if size > 0:
            stop = f"{self._position + size - 1}"
        else:
            stop = ""
        args = {"Range": f"bytes={self._position}-{stop}"}
        response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args)
        contents = response["Body"].read()
        response["Body"].close()
        # Advance the stream position by the number of bytes actually read.
        self._position += len(contents)
        return contents

    def write(self, b: bytes) -> int:
        if self.writable():
            result = self._buffer.write(b)
            self._position = self._buffer.tell()
            return result
        else:
            raise OSError("S3 ResourceHandle is not writable")
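
# A minimal write-mode sketch tying the methods together (illustrative only;
# the logger, client, bucket, key, and chunk source are hypothetical). Parts
# of at least 5 MiB are uploaded incrementally by flush(); close() completes
# the multipart upload, or issues a single put_object for small payloads:
#
#     handle = S3ResourceHandle("wb", log, client, "some-bucket", "big.bin")
#     for chunk in chunks:  # e.g. 8 MiB bytes objects
#         handle.write(chunk)
#         handle.flush()  # uploads a part once >= 5 MiB is buffered
#     handle.close()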