Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 84%
158 statements
coverage.py v7.3.2, created at 2023-11-30 11:34 +0000
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.
from __future__ import annotations

__all__ = ("S3ResourceHandle",)

import warnings
from collections.abc import Iterable, Mapping
from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation
from logging import Logger
from typing import TYPE_CHECKING

from botocore.exceptions import ClientError
from lsst.utils.introspection import find_outside_stacklevel
from lsst.utils.timer import time_this

from ..s3utils import all_retryable_errors, backoff, max_retry_time
from ._baseResourceHandle import BaseResourceHandle, CloseStatus

if TYPE_CHECKING:
    import boto3


class S3ResourceHandle(BaseResourceHandle[bytes]):
    """S3 specialization of `.BaseResourceHandle`.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the Python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    client : `boto3.client`
        An existing boto3 client that will be used for interacting with the
        remote S3 server.
    bucket : `str`
        The name of the S3 bucket of this resource.
    key : `str`
        The identifier of the resource within the specified bucket.
    newline : `bytes`
        When doing multiline operations, break the stream on the given
        character. Defaults to newline.

    Notes
    -----
    It is only possible to incrementally flush this object if each chunk that
    is flushed is above 5 MiB in size. The flush command is ignored until the
    internal buffer reaches this size, or until close is called, whichever
    comes first.

    Once an instance in write mode is flushed, it is not possible to seek back
    to a position in the byte stream before the flush is executed.

    When opening a resource in read/write mode (r+ or w+) no flushing is
    possible, and all data will be buffered until the resource is closed, at
    which point the buffered data will be written. Additionally, the entire
    contents of the resource will be loaded into memory upon opening.

    Documentation on the methods of this class should refer to the
    corresponding methods in the `io` module.

    S3 handles only support operations in binary mode. To get other modes of
    reading and writing, wrap this handle inside an `io.TextIOWrapper` context
    manager. An example of this can be found in `S3ResourcePath`.
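
    Examples
    --------
    A minimal sketch of reading text through a handle, assuming an existing
    ``boto3`` client and logger; the bucket and key names here are
    illustrative::

        import io

        handle = S3ResourceHandle("rb", log, client, "some-bucket", "some/key")
        with io.TextIOWrapper(handle, encoding="utf-8") as stream:
            text = stream.read()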
74 """

    def __init__(
        self, mode: str, log: Logger, client: boto3.client, bucket: str, key: str, newline: bytes = b"\n"
    ):
        super().__init__(mode, log, newline=newline)
        self._client = client
        self._bucket = bucket
        self._key = key
        self._buffer = BytesIO()
        self._position = 0
        self._writable = False
        self._last_flush_position: int | None = None
        self._warned = False
        self._readable = bool({"r", "+"} & set(self._mode))
        self._max_size: int | None = None
        self._recursing = False
        if {"w", "a", "x", "+"} & set(self._mode):
            self._writable = True
            self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key)
            self._partNo = 1
            self._parts: list[Mapping] = []
            # Below is a workaround for append mode. It must read in
            # everything that already exists in the file so that it is in the
            # buffer to append to, to be subsequently written back out with
            # any newly added data.
            if {"a", "+"} & set(self._mode):
                # Cheat a bit to get the existing data from the handle using
                # object interfaces, because we know this is safe.
                # Save the requested mode and readability.
                mode_save = self._mode
                read_save = self._readable
                # Update each of these internal variables to ensure the handle
                # is strictly readable.
                self._readable = True
                self._mode += "r"
                self._mode = self._mode.replace("+", "")
                # As mentioned, this reads the existing contents and writes
                # them out into the internal buffer; no writes actually happen
                # until the handle is flushed.
                self.write(self.read())
                # Restore the requested states.
                self._mode = mode_save
                self._readable = read_save
                # Set the state of the stream if the specified mode is read
                # and write.
                if "+" in self._mode:
                    self.seek(0)
                    # If a file is w+ it is read/write, but should be
                    # truncated for future writes.
                    if "w" in self._mode:
                        self.truncate()

    def tell(self) -> int:
        return self._position

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def close(self) -> None:
        if self.writable():
            # decide if this is a multipart upload
            if self._parts:
                # indicate that the object is in closing status
                self._closed = CloseStatus.CLOSING
                self.flush()
                with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)):
                    self._client.complete_multipart_upload(
                        Bucket=self._multiPartUpload["Bucket"],
                        Key=self._multiPartUpload["Key"],
                        UploadId=self._multiPartUpload["UploadId"],
                        MultipartUpload={"Parts": self._parts},
                    )
            else:
                # Put the complete object at once
                with time_this(self._log, msg="Write to %s", args=(self,)):
                    self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue())
        self._closed = CloseStatus.CLOSED

    @property
    def closed(self) -> bool:
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        raise UnsupportedOperation("S3 object does not have a file number")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def flush(self) -> None:
        # If the object is closed, not writable, or open read/write, flush
        # should be skipped. rw mode skips flush because the whole byte
        # stream must be kept in the buffer for seeking reasons.
        if self.closed or not self.writable() or "+" in self._mode:
            return
        # Disallow seeks to a position prior to the previous flush; this
        # allows multipart uploads to upload content as the stream is
        # written to.
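        # Note: the S3 multipart upload API requires every part except the
        # last to be at least 5 MiB, so small flushes must be deferred until
        # enough data has accumulated or the handle is closing.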
        s3_min_bytes = 5 * 1024 * 1024  # S3 flush threshold is 5 MiB.
        if (
            (self.tell() - (self._last_flush_position or 0)) < s3_min_bytes
            and self._closed != CloseStatus.CLOSING
            and not self._warned
        ):
            amount = s3_min_bytes / (1024 * 1024)
            warnings.warn(
                f"S3 does not support flushing objects less than {amount} MiB, skipping",
                stacklevel=find_outside_stacklevel(
                    "lsst.resources",
                    "backoff",
                    "contextlib",
                    allow_modules={"lsst.resources.tests"},
                ),
            )
            self._warned = True
            return
        # Nothing to write; don't create an empty upload.
        if self.tell() == 0:
            return
        with time_this(
            self._log,
            msg="Upload multipart %d to %s",
            args=(
                self._partNo,
                self,
            ),
        ):
            response = self._client.upload_part(
                Body=self._buffer.getvalue(),
                Bucket=self._bucket,
                Key=self._key,
                UploadId=self._multiPartUpload["UploadId"],
                PartNumber=self._partNo,
            )
        self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]})
        self._partNo += 1
        self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0)
        self._buffer = BytesIO()

    @property
    def isatty(self) -> bool:
        return False

    def readable(self) -> bool:
        return self._readable

    def readline(self, size: int = -1) -> bytes:
        raise OSError("S3 does not support line by line reads")

    def readlines(self, hint: int = -1) -> Iterable[bytes]:
        self.seek(0)
        return self.read().split(self._newline)

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        if self.writable():
            if self._last_flush_position is not None:
                if whence == SEEK_SET:
                    offset -= self._last_flush_position
                    if offset < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_CUR and (self.tell() - self._last_flush_position) < 0:
                    raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_END:
                    raise OSError("S3 ResourceHandle can not seek referencing the end of the resource")
            self._buffer.seek(offset, whence)
            self._position = self._buffer.tell()
        else:
            if whence == SEEK_SET:
                self._position = offset
            elif whence == SEEK_CUR:
                self._position += offset
            elif whence == SEEK_END:
                offset = abs(offset)
                self._position -= offset
        return self._position

    def seekable(self) -> bool:
        return True

    def truncate(self, size: int | None = None) -> int:
        if self.writable():
            self._buffer.truncate(size)
            return self._position
        else:
            raise OSError("S3 ResourceHandle is not writable")

    def writable(self) -> bool:
        return self._writable

    def writelines(self, lines: Iterable[bytes]) -> None:
        if self.writable():
            self._buffer.writelines(lines)
            self._position = self._buffer.tell()
        else:
            raise OSError("S3 ResourceHandle is not writable")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        if not self.readable():
            raise OSError("S3 ResourceHandle is not readable")
        # If the object is rw, then read from the internal io buffer.
        if "+" in self._mode:
            self._buffer.seek(self._position)
            return self._buffer.read(size)
        # Otherwise fetch the appropriate bytes from the remote resource.
        if self._max_size is not None and self._position >= self._max_size:
            return b""
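        # HTTP Range headers use inclusive start and end byte offsets; an
        # empty end offset requests everything through the end of the object.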
        stop = f"{self._position + size - 1}" if size > 0 else ""
        args = {"Range": f"bytes={self._position}-{stop}"}
        try:
            response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args)
            contents = response["Body"].read()
            response["Body"].close()
            self._position += len(contents)
            return contents
        except ClientError as exc:
            if exc.response["ResponseMetadata"]["HTTPStatusCode"] == 416:
                if self._recursing:
                    # This means the function has attempted to read the whole
                    # byte range and failed again, meaning the previous byte
                    # was the last byte.
                    return b""
                self._recursing = True
                result = self.read()
                self._max_size = self._position
                self._recursing = False
                return result
            else:
                raise

    def write(self, b: bytes) -> int:
        if self.writable():
            result = self._buffer.write(b)
            self._position = self._buffer.tell()
            return result
        else:
            raise OSError("S3 ResourceHandle is not writable")