Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 82%
159 statements
coverage.py v6.5.0, created at 2023-04-12 02:04 -0700
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("S3ResourceHandle",)
16import warnings
17from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation
18from logging import Logger
19from typing import TYPE_CHECKING, Iterable, Mapping, Optional
21from botocore.exceptions import ClientError
22from lsst.utils.timer import time_this
24from ..s3utils import all_retryable_errors, backoff, max_retry_time
25from ._baseResourceHandle import BaseResourceHandle, CloseStatus
27if TYPE_CHECKING:
28 import boto3
31class S3ResourceHandle(BaseResourceHandle[bytes]):
32 """S3 specialization of `BaseResourceHandle`
34 Parameters
35 ----------
36 mode : `str`
37 Handle modes as described in the python `io` module.
38 log : `~logging.Logger`
39 Logger to use when writing messages.
40 client : `boto3.client`
41 An existing boto3 client that will be used for interacting with the
42 remote s3 server.
43 bucket : `str`
44 The name of the s3 bucket of this resource.
45 key : `str`
46 The identifier of the resource within the specified bucket.
47 newline : `bytes`
48 When doing multiline operations, break the stream on the given character.
49 Defaults to newline.
51 Note
52 ----
53 It is only possible to incrementally flush this object if each chunk that
54 is flushed is above 5 MiB in size. The flush command is ignored until the
55 internal buffer reaches this size, or until close is called, whichever
56 comes first.
58 Once an instance in write mode is flushed, it is not possible to seek back
59 to a position in the byte stream before the flush is executed.
61 When opening a resource in read/write mode (r+ or w+), no incremental
62 flushing is possible; all data is buffered until the resource is closed,
63 at which point the buffered data is written. Additionally, the entire
64 contents of the resource are loaded into memory upon opening.
66 Documentation for the methods of this class should refer to the
67 corresponding methods in the `io` module.
69 S3 handles only support operations in binary mode. To get other modes of
70 reading and writing, wrap this handle inside an `io.TextIOWrapper` context
71 manager. An example of this can be found in `S3ResourcePath`.
72 """
74 def __init__(
75 self, mode: str, log: Logger, client: "boto3.client", bucket: str, key: str, newline: bytes = b"\n"
76 ):
77 super().__init__(mode, log, newline=newline)
78 self._client = client
79 self._bucket = bucket
80 self._key = key
81 self._buffer = BytesIO()
82 self._position = 0
83 self._writable = False
84 self._last_flush_position: Optional[int] = None
85 self._warned = False
86 self._readable = bool({"r", "+"} & set(self._mode))
87 self._max_size: int | None = None
88 self._recursing = False
89 if {"w", "a", "x", "+"} & set(self._mode):
90 self._writable = True
91 self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key)
92 self._partNo = 1
93 self._parts: list[Mapping] = []
94 # Below is a workaround for append mode. It basically must read in
95 # everything that exists in the file so that it is in the buffer to
96 # append to and is subsequently written back out, together with
97 # any newly added data.
98 if {"a", "+"} & set(self._mode):
99 # Cheat a bit to get the existing data from the handle using
100 # object interfaces, because we know this is safe.
101 # Save the requested mode and readability.
102 mode_save = self._mode
103 read_save = self._readable
104 # Update each of these internal variables to ensure the handle
105 # is strictly readable.
106 self._readable = True
107 self._mode += "r"
108 self._mode = self._mode.replace("+", "")
109 # As mentioned, this reads the existing contents and writes them
110 # out into the internal buffer; no remote writes actually happen
111 # until the handle is flushed.
112 self.write(self.read())
113 # Restore the requested states.
114 self._mode = mode_save
115 self._readable = read_save
116 # Set the state of the stream if the specified mode is read
117 # and write.
118 if "+" in self._mode:
119 self.seek(0)
120 # If a file is w+ it is read write, but should be truncated
121 # for future writes.
122 if "w" in self._mode:
123 self.truncate()
125 def tell(self) -> int:
126 return self._position
128 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
129 def close(self) -> None:
130 if self.writable():
131 # decide if this is a multipart upload
132 if self._parts:
133 # indicate that the object is in closing status
134 self._closed = CloseStatus.CLOSING
135 self.flush()
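# Each entry in self._parts has the form {"PartNumber": n, "ETag": "..."},
# as recorded by flush() after every successful upload_part call.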
136 with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)):
137 self._client.complete_multipart_upload(
138 Bucket=self._multiPartUpload["Bucket"],
139 Key=self._multiPartUpload["Key"],
140 UploadId=self._multiPartUpload["UploadId"],
141 MultipartUpload={"Parts": self._parts},
142 )
143 else:
144 # Put the complete object at once
145 with time_this(self._log, msg="Write to %s", args=(self,)):
146 self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue())
147 self._closed = CloseStatus.CLOSED
149 @property
150 def closed(self) -> bool:
151 return self._closed == CloseStatus.CLOSED
153 def fileno(self) -> int:
154 raise UnsupportedOperation("S3 object does not have a file number")
156 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
157 def flush(self) -> None:
158 # If the object is closed, not writable, or opened read/write, flush
159 # should be skipped. rw mode skips flush because the whole bytestream
160 # must be kept in the buffer for seeking reasons.
161 if self.closed or not self.writable() or "+" in self._mode:
162 return
163 # Disallow seeking to a position prior to the previous flush; this
164 # allows multipart uploads to upload content as the stream is
165 # written to.
166 s3_min_bits = 5 * 1024 * 1024  # S3 multipart flush threshold is 5 MiB.
167 if (
168 (self.tell() - (self._last_flush_position or 0)) < s3_min_bits
169 and not self._closed == CloseStatus.CLOSING
170 and not self._warned
171 ):
172 amount = s3_min_bits / (1024 * 1024)
173 warnings.warn(f"S3 does not support flushing objects less than {amount} MiB, skipping")
174 self._warned = True
175 return
176 # nothing to write, don't create an empty upload
177 if self.tell() == 0: 177 ↛ 178line 177 didn't jump to line 178, because the condition on line 177 was never true
178 return
179 with time_this(
180 self._log,
181 msg="Upload multipart %d to %s",
182 args=(
183 self._partNo,
184 self,
185 ),
186 ):
187 response = self._client.upload_part(
188 Body=self._buffer.getvalue(),
189 Bucket=self._bucket,
190 Key=self._key,
191 UploadId=self._multiPartUpload["UploadId"],
192 PartNumber=self._partNo,
193 )
194 self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]})
195 self._partNo += 1
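# _last_flush_position records how many bytes of the overall stream have
# been flushed so far; seek() measures write-mode offsets against it.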
196 self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0)
197 self._buffer = BytesIO()
199 @property
200 def isatty(self) -> bool:
201 return False
203 def readable(self) -> bool:
204 return self._readable
206 def readline(self, size: int = -1) -> bytes:
207 raise OSError("S3 does not support line-by-line reads")
209 def readlines(self, hint: int = -1) -> Iterable[bytes]:
210 self.seek(0)
211 return self.read().split(self._newline)
213 def seek(self, offset: int, whence: int = SEEK_SET) -> int:
214 if self.writable():
215 if self._last_flush_position is not None:
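# Offsets refer to the full byte stream, so convert them to a position
# within the current (post-flush) buffer. For example, with 5 MiB already
# flushed, seek(5 * 1024 * 1024 + 10) maps to position 10 in the buffer.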
216 if whence == SEEK_SET:  [216 ↛ 220: condition on line 216 was never false]
217 offset -= self._last_flush_position
218 if offset < 0:
219 raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
220 if whence == SEEK_CUR:  [220 ↛ 221: condition on line 220 was never true]
221 if (self.tell() - self._last_flush_position) < 0:
222 raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
223 if whence == SEEK_END:  [223 ↛ 224: condition on line 223 was never true]
224 raise OSError("S3 ResourceHandle can not seek referencing the end of the resource")
225 self._buffer.seek(offset, whence)
226 self._position = self._buffer.tell()
227 else:
228 if whence == SEEK_SET:  [228 ↛ 230: condition on line 228 was never false]
229 self._position = offset
230 elif whence == SEEK_CUR:
231 self._position += offset
232 elif whence == SEEK_END:
233 offset = abs(offset)
234 self._position -= offset
235 return self._position
237 def seekable(self) -> bool:
238 return True
240 def truncate(self, size: Optional[int] = None) -> int:
241 if self.writable():  [241 ↛ 245: condition on line 241 was never false]
242 self._buffer.truncate(size)
243 return self._position
244 else:
245 raise OSError("S3 ResourceHandle is not writable")
247 def writable(self) -> bool:
248 return self._writable
250 def writelines(self, lines: Iterable[bytes]) -> None:
251 if self.writable():
252 self._buffer.writelines(lines)
253 self._position = self._buffer.tell()
254 else:
255 raise OSError("S3 ResourceHandle is not writable")
257 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
258 def read(self, size: int = -1) -> bytes:
259 if not self.readable():  [259 ↛ 260: condition on line 259 was never true]
260 raise OSError("S3 ResourceHandle is not readable")
261 # If the object is rw, then read from the internal io buffer
262 if "+" in self._mode:
263 self._buffer.seek(self._position)
264 return self._buffer.read(size)
265 # otherwise fetch the appropriate bytes from the remote resource
266 if self._max_size is not None and self._position >= self._max_size:
267 return b""
268 if size > 0:
269 stop = f"{self._position + size - 1}"
270 else:
271 stop = ""
272 args = {"Range": f"bytes={self._position}-{stop}"}
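# For example, position=100 and size=10 produce Range: "bytes=100-109";
# size <= 0 produces Range: "bytes=100-", i.e. read to the end of the object.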
273 try:
274 response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args)
275 contents = response["Body"].read()
276 response["Body"].close()
277 self._position += len(contents)
278 return contents
279 except ClientError as exc:
280 if exc.response["ResponseMetadata"]["HTTPStatusCode"] == 416:  [280 ↛ 292: condition on line 280 was never false]
281 if self._recursing:
282 # This means the function has attempted to read the whole
283 # byte range and failed again, meaning the previous byte
284 # was the last byte
285 return b""
286 self._recursing = True
287 result = self.read()
288 self._max_size = self._position
289 self._recursing = False
290 return result
291 else:
292 raise
294 def write(self, b: bytes) -> int:
295 if self.writable():  [295 ↛ 300: condition on line 295 was never false]
296 result = self._buffer.write(b)
297 self._position = self._buffer.tell()
298 return result
299 else:
300 raise OSError("S3 ResourceHandle is not writable")