Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 82% (160 statements)

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourceHandle",)

import warnings
from collections.abc import Iterable, Mapping
from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation
from logging import Logger
from typing import TYPE_CHECKING

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ..s3utils import all_retryable_errors, backoff, max_retry_time
from ._baseResourceHandle import BaseResourceHandle, CloseStatus

if TYPE_CHECKING:
    import boto3


class S3ResourceHandle(BaseResourceHandle[bytes]):
    """S3 specialization of `.BaseResourceHandle`.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    client : `boto3.client`
        An existing boto3 client that will be used for interacting with the
        remote s3 server.
    bucket : `str`
        The name of the s3 bucket of this resource.
    key : `str`
        The identifier of the resource within the specified bucket.
    newline : `bytes`
        When doing multiline operations, break the stream on the given
        character. Defaults to newline.

    Notes
    -----
    It is only possible to incrementally flush this object if each chunk
    that is flushed is at least 5 MiB in size. The flush command is ignored
    until the internal buffer reaches this size, or until close is called,
    whichever comes first.

    Once an instance in write mode is flushed, it is not possible to seek
    back to a position in the byte stream before the flush was executed.

    When opening a resource in read-write mode (r+ or w+), no flushing is
    possible, and all data will be buffered until the resource is closed,
    at which point the buffered data will be written. Additionally, the
    entire contents of the resource will be loaded into memory upon opening.

    Documentation on the methods of this class should refer to the
    corresponding methods in the `io` module.

    S3 handles only support operations in binary mode. To get other modes of
    reading and writing, wrap this handle inside an `io.TextIOWrapper`
    context manager. An example of this can be found in `S3ResourcePath`.
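
    Examples
    --------
    A minimal sketch of wrapping a handle for text-mode writes, assuming
    ``handle`` is an instance of this class opened in binary write mode
    (the variable name is illustrative, not part of the API)::

        import io

        with io.TextIOWrapper(handle, encoding="utf-8") as stream:
            stream.write("some text")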
73 """
    def __init__(
        self, mode: str, log: Logger, client: boto3.client, bucket: str, key: str, newline: bytes = b"\n"
    ):
        super().__init__(mode, log, newline=newline)
        self._client = client
        self._bucket = bucket
        self._key = key
        self._buffer = BytesIO()
        self._position = 0
        self._writable = False
        self._last_flush_position: int | None = None
        self._warned = False
        self._readable = bool({"r", "+"} & set(self._mode))
        self._max_size: int | None = None
        self._recursing = False
        if {"w", "a", "x", "+"} & set(self._mode):
            self._writable = True
            self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key)
            self._partNo = 1
            self._parts: list[Mapping] = []
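            # create_multipart_upload only registers the upload and returns
            # an UploadId; no data is transferred until flush() uploads a
            # numbered part, and close() completes the upload from the
            # accumulated part ETags.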
            # Below is a workaround for append mode. It basically must read
            # in everything that exists in the file so that it is in the
            # buffer to append to, and is subsequently written back out
            # appropriately with any newly added data.
            if {"a", "+"} & set(self._mode):
                # Cheat a bit to get the existing data from the handle using
                # object interfaces, because we know this is safe.
                # Save the requested mode and readability.
                mode_save = self._mode
                read_save = self._readable
                # Update each of these internal variables to ensure the
                # handle is strictly readable.
                self._readable = True
                self._mode += "r"
                self._mode = self._mode.replace("+", "")
                # As mentioned, this reads the existing contents and writes
                # them out into the internal buffer; no writes actually
                # happen until the handle is flushed.
                self.write(self.read())
                # Restore the requested states.
                self._mode = mode_save
                self._readable = read_save
                # Set the state of the stream if the specified mode is read
                # and write.
                if "+" in self._mode:
                    self.seek(0)
                    # If a file is w+ it is read-write, but should be
                    # truncated for future writes.
                    if "w" in self._mode:
                        self.truncate()

    def tell(self) -> int:
        return self._position

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def close(self) -> None:
        if self.writable():
            # Decide if this is a multipart upload.
            if self._parts:
                # Indicate that the object is in closing status.
                self._closed = CloseStatus.CLOSING
                self.flush()
                with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)):
                    self._client.complete_multipart_upload(
                        Bucket=self._multiPartUpload["Bucket"],
                        Key=self._multiPartUpload["Key"],
                        UploadId=self._multiPartUpload["UploadId"],
                        MultipartUpload={"Parts": self._parts},
                    )
            else:
                # Put the complete object at once.
                with time_this(self._log, msg="Write to %s", args=(self,)):
                    self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue())
        self._closed = CloseStatus.CLOSED

    @property
    def closed(self) -> bool:
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        raise UnsupportedOperation("S3 object does not have a file number")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def flush(self) -> None:
        # If the object is closed, not writable, or read-write, flush should
        # be skipped. The rw mode skips flush because the whole bytestream
        # must be kept in the buffer for seeking reasons.
        if self.closed or not self.writable() or "+" in self._mode:
            return
        # Disallow writes to seek to a position prior to the previous flush;
        # this allows multipart uploads to upload content as the stream is
        # written to.
        s3_min_bytes = 5 * 1024 * 1024  # S3 requires multipart parts of at least 5 MiB.
        if (
            (self.tell() - (self._last_flush_position or 0)) < s3_min_bytes
            and not self._closed == CloseStatus.CLOSING
            and not self._warned
        ):
            amount = s3_min_bytes / (1024 * 1024)
            warnings.warn(f"S3 does not support flushing objects less than {amount} MiB, skipping")
            self._warned = True
            return
        # Nothing to write; don't create an empty upload.
        if self.tell() == 0:
            return
        with time_this(
            self._log,
            msg="Upload multipart %d to %s",
            args=(
                self._partNo,
                self,
            ),
        ):
            response = self._client.upload_part(
                Body=self._buffer.getvalue(),
                Bucket=self._bucket,
                Key=self._key,
                UploadId=self._multiPartUpload["UploadId"],
                PartNumber=self._partNo,
            )
        self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]})
        self._partNo += 1
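        # Track the absolute position of everything flushed so far; the
        # buffer is then reset, so subsequent buffer offsets are relative
        # to this flushed prefix of the stream.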
        self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0)
        self._buffer = BytesIO()

    @property
    def isatty(self) -> bool:
        return False

    def readable(self) -> bool:
        return self._readable

    def readline(self, size: int = -1) -> bytes:
        raise OSError("S3 does not support line-by-line reads")

    def readlines(self, hint: int = -1) -> Iterable[bytes]:
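        # Line-by-line streaming is not supported (see readline), so read
        # the entire object and split it on the configured newline.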
        self.seek(0)
        return self.read().split(self._newline)

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        if self.writable():
            if self._last_flush_position is not None:
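                # The in-memory buffer only holds bytes written since the
                # last flush, so absolute offsets must be rebased against
                # the already-flushed prefix of the stream.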
                if whence == SEEK_SET:
                    offset -= self._last_flush_position
                    if offset < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_CUR:
                    if (self.tell() - self._last_flush_position) < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_END:
                    raise OSError("S3 ResourceHandle can not seek referencing the end of the resource")
            self._buffer.seek(offset, whence)
            self._position = self._buffer.tell()
        else:
            if whence == SEEK_SET:
                self._position = offset
            elif whence == SEEK_CUR:
                self._position += offset
            elif whence == SEEK_END:
                offset = abs(offset)
                self._position -= offset
        return self._position

    def seekable(self) -> bool:
        return True

    def truncate(self, size: int | None = None) -> int:
        if self.writable():
            self._buffer.truncate(size)
            return self._position
        else:
            raise OSError("S3 ResourceHandle is not writable")

    def writable(self) -> bool:
        return self._writable

    def writelines(self, lines: Iterable[bytes]) -> None:
        if self.writable():
            self._buffer.writelines(lines)
            self._position = self._buffer.tell()
        else:
            raise OSError("S3 ResourceHandle is not writable")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        if not self.readable():
            raise OSError("S3 ResourceHandle is not readable")
        # If the object is rw, then read from the internal io buffer.
        if "+" in self._mode:
            self._buffer.seek(self._position)
            return self._buffer.read(size)
        # Otherwise fetch the appropriate bytes from the remote resource.
        if self._max_size is not None and self._position >= self._max_size:
            return b""
        if size > 0:
            stop = f"{self._position + size - 1}"
        else:
            stop = ""
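        # The HTTP Range header is inclusive on both ends: position 0 with
        # size 10 requests "bytes=0-9"; an empty stop ("bytes=0-") reads to
        # the end of the object.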
        args = {"Range": f"bytes={self._position}-{stop}"}
        try:
            response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args)
            contents = response["Body"].read()
            response["Body"].close()
            self._position += len(contents)
            return contents
        except ClientError as exc:
            if exc.response["ResponseMetadata"]["HTTPStatusCode"] == 416:
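                # 416 is "Range Not Satisfiable": the requested start byte
                # is at or beyond the end of the object.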
                if self._recursing:
                    # This means the function has attempted to read the
                    # whole byte range and failed again, meaning the
                    # previous byte was the last byte.
                    return b""
                self._recursing = True
                result = self.read()
                self._max_size = self._position
                self._recursing = False
                return result
            else:
                raise

    def write(self, b: bytes) -> int:
        if self.writable():
            result = self._buffer.write(b)
            self._position = self._buffer.tell()
            return result
        else:
            raise OSError("S3 ResourceHandle is not writable")