Coverage for python/lsst/resources/_resourceHandles/_s3ResourceHandle.py: 81% (145 statements)

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourceHandle",)

import warnings
from io import SEEK_CUR, SEEK_END, SEEK_SET, BytesIO, UnsupportedOperation
from logging import Logger
from typing import TYPE_CHECKING, Iterable, Mapping, Optional

from lsst.utils.timer import time_this

from ..s3utils import all_retryable_errors, backoff, max_retry_time
from ._baseResourceHandle import BaseResourceHandle, CloseStatus

if TYPE_CHECKING:
    import boto3


class S3ResourceHandle(BaseResourceHandle[bytes]):
    """S3 specialization of `BaseResourceHandle`.

    Parameters
    ----------
    mode : `str`
        Handle modes as described in the python `io` module.
    log : `~logging.Logger`
        Logger to use when writing messages.
    client : `boto3.client`
        An existing boto3 client that will be used for interacting with the
        remote s3 server.
    bucket : `str`
        The name of the s3 bucket of this resource.
    key : `str`
        The identifier of the resource within the specified bucket.
    newline : `bytes`
        When doing multiline operations, break the stream on the given
        character. Defaults to newline.

    Notes
    -----
    It is only possible to incrementally flush this object if each chunk that
    is flushed is at least 5 MiB in size, the minimum part size of an S3
    multipart upload. The flush command is ignored until the internal buffer
    reaches this size, or until close is called, whichever comes first. For
    example, writing 3 MiB and flushing is a no-op; after a further 3 MiB is
    written, the next flush uploads a single 6 MiB part.

    Once an instance in write mode is flushed, it is not possible to seek back
    to a position in the byte stream before the flush was executed.

    When opening a resource in read/write mode (``r+`` or ``w+``) no flushing
    is possible, and all data will be buffered until the resource is closed,
    at which point the buffered data will be written. Additionally, the
    entire contents of the resource will be loaded into memory upon opening.

    Documentation on the methods of this class should refer to the
    corresponding methods in the `io` module.

    S3 handles only support operations in binary mode. To get other modes of
    reading and writing, wrap this handle inside an `io.TextIOWrapper` context
    manager. An example of this can be found in `S3ResourcePath`.
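
    Examples
    --------
    A minimal sketch of reading text through a handle; the client, bucket,
    key, and logger below are illustrative placeholders rather than real
    resources:

    >>> import io
    >>> import logging
    >>> import boto3  # doctest: +SKIP
    >>> client = boto3.client("s3")  # doctest: +SKIP
    >>> handle = S3ResourceHandle(  # doctest: +SKIP
    ...     "rb", logging.getLogger("example"), client, "some-bucket", "some/key"
    ... )
    >>> wrapper = io.TextIOWrapper(handle, encoding="utf-8")  # doctest: +SKIP
    >>> print(wrapper.read())  # doctest: +SKIP

    A write-mode sketch illustrating the deferred flush described above,
    with the same placeholder names:

    >>> handle = S3ResourceHandle(  # doctest: +SKIP
    ...     "wb", logging.getLogger("example"), client, "some-bucket", "some/key"
    ... )
    >>> _ = handle.write(b"data")  # doctest: +SKIP
    >>> handle.flush()  # Under 5 MiB buffered: warns and skips.  # doctest: +SKIP
    >>> handle.close()  # Buffered bytes are uploaded on close.  # doctest: +SKIP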
71 """

    def __init__(
        self, mode: str, log: Logger, client: "boto3.client", bucket: str, key: str, newline: bytes = b"\n"
    ):
        super().__init__(mode, log, newline=newline)
        self._client = client
        self._bucket = bucket
        self._key = key
        self._buffer = BytesIO()
        self._position = 0
        self._writable = False
        self._last_flush_position: Optional[int] = None
        self._warned = False
        self._readable = bool({"r", "+"} & set(self._mode))
        if {"w", "a", "x", "+"} & set(self._mode):
            self._writable = True
            self._multiPartUpload = client.create_multipart_upload(Bucket=bucket, Key=key)
            self._partNo = 1
            self._parts: list[Mapping] = []
            # Below is a workaround for append mode. It basically must read
            # in everything that exists in the file so that it is in the
            # buffer to append to, and is subsequently written back out
            # appropriately with any newly added data.
            if {"a", "+"} & set(self._mode):
                # Cheat a bit to get the existing data from the handle using
                # object interfaces, because we know this is safe.
                # Save the requested mode and readability.
                mode_save = self._mode
                read_save = self._readable
                # Update each of these internal variables to ensure the
                # handle is strictly readable.
                self._readable = True
                self._mode += "r"
                self._mode = self._mode.replace("+", "")
                # As mentioned, this reads the existing contents and writes
                # them out into the internal buffer; no writes actually
                # happen until the handle is flushed.
                self.write(self.read())
                # Restore the requested states.
                self._mode = mode_save
                self._readable = read_save
                # Set the state of the stream if the specified mode is read
                # and write.
                if "+" in self._mode:
                    self.seek(0)
                    # If a file is w+ it is read/write, but should be
                    # truncated for future writes.
                    if "w" in self._mode:
                        self.truncate()

    def tell(self) -> int:
        return self._position

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def close(self) -> None:
        if self.writable():
            # Decide if this is a multipart upload.
            if self._parts:
                # Indicate that the object is in closing status.
                self._closed = CloseStatus.CLOSING
                self.flush()
                with time_this(self._log, msg="Finalize multipart upload to %s", args=(self,)):
                    self._client.complete_multipart_upload(
                        Bucket=self._multiPartUpload["Bucket"],
                        Key=self._multiPartUpload["Key"],
                        UploadId=self._multiPartUpload["UploadId"],
                        MultipartUpload={"Parts": self._parts},
                    )
            else:
                # Put the complete object at once.
                with time_this(self._log, msg="Write to %s", args=(self,)):
                    self._client.put_object(Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue())
        self._closed = CloseStatus.CLOSED

    @property
    def closed(self) -> bool:
        return self._closed == CloseStatus.CLOSED

    def fileno(self) -> int:
        raise UnsupportedOperation("S3 object does not have a file number")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def flush(self) -> None:
        # If the object is closed, not writable, or opened read/write, flush
        # should be skipped. Read/write mode skips flush because the whole
        # byte stream must be kept in the buffer for seeking reasons.
        if self.closed or not self.writable() or "+" in self._mode:
            return
        # Disallow writes that seek to a position prior to the previous
        # flush; this allows multipart uploads to upload content as the
        # stream is written to.
        s3_min_part_size = 5 * 1024 * 1024  # S3 multipart minimum part size is 5 MiB.
        if (
            self.tell() - (self._last_flush_position or 0)
        ) < s3_min_part_size and self._closed != CloseStatus.CLOSING:
            # Skip the flush until the minimum part size has accumulated in
            # the buffer, warning only the first time a flush is skipped.
            if not self._warned:
                amount = s3_min_part_size // (1024 * 1024)
                warnings.warn(f"S3 does not support flushing objects less than {amount} MiB, skipping")
                self._warned = True
            return
        # Nothing to write; don't create an empty upload.
        if self.tell() == 0:
            return
        with time_this(
            self._log,
            msg="Upload multipart %d to %s",
            args=(
                self._partNo,
                self,
            ),
        ):
            response = self._client.upload_part(
                Body=self._buffer.getvalue(),
                Bucket=self._bucket,
                Key=self._key,
                UploadId=self._multiPartUpload["UploadId"],
                PartNumber=self._partNo,
            )
        self._parts.append({"PartNumber": self._partNo, "ETag": response["ETag"]})
        self._partNo += 1
        self._last_flush_position = self._buffer.tell() + (self._last_flush_position or 0)
        self._buffer = BytesIO()

    @property
    def isatty(self) -> bool:
        return False

    def readable(self) -> bool:
        return self._readable

    def readline(self, size: int = -1) -> bytes:
        raise OSError("S3 does not support line-by-line reads")

    def readlines(self, hint: int = -1) -> Iterable[bytes]:
        self.seek(0)
        return self.read().split(self._newline)

    def seek(self, offset: int, whence: int = SEEK_SET) -> int:
        if self.writable():
            if self._last_flush_position is not None:
                if whence == SEEK_SET:
                    offset -= self._last_flush_position
                    if offset < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_CUR:
                    if (self.tell() - self._last_flush_position) < 0:
                        raise OSError("S3 ResourceHandle can not seek prior to already flushed positions")
                if whence == SEEK_END:
                    raise OSError("S3 ResourceHandle can not seek referencing the end of the resource")
            self._buffer.seek(offset, whence)
            self._position = self._buffer.tell()
        else:
            if whence == SEEK_SET:
                self._position = offset
            elif whence == SEEK_CUR:
                self._position += offset
            elif whence == SEEK_END:
                # The total size of the resource is not tracked here, so the
                # offset is applied backwards from the current position.
                offset = abs(offset)
                self._position -= offset
        return self._position

    def seekable(self) -> bool:
        return True

    def truncate(self, size: Optional[int] = None) -> int:
        if self.writable():
            self._buffer.truncate(size)
            return self._position
        else:
            raise OSError("S3 ResourceHandle is not writable")

    def writable(self) -> bool:
        return self._writable

    def writelines(self, lines: Iterable[bytes]) -> None:
        if self.writable():
            self._buffer.writelines(lines)
            self._position = self._buffer.tell()
        else:
            raise OSError("S3 ResourceHandle is not writable")

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        if not self.readable():
            raise OSError("S3 ResourceHandle is not readable")
        # If the object is read/write, read from the internal io buffer.
        if "+" in self._mode:
            self._buffer.seek(self._position)
            return self._buffer.read(size)
        # Otherwise fetch the appropriate bytes from the remote resource.
        if size > 0:
            stop = f"{self._position + size - 1}"
        else:
            stop = ""
        args = {"Range": f"bytes={self._position}-{stop}"}
        response = self._client.get_object(Bucket=self._bucket, Key=self._key, **args)
        contents = response["Body"].read()
        response["Body"].close()
        # Advance by the number of bytes actually read rather than resetting
        # the position, so that successive reads continue where the last one
        # ended.
        self._position += len(contents)
        return contents

    def write(self, b: bytes) -> int:
        if self.writable():
            result = self._buffer.write(b)
            self._position = self._buffer.tell()
            return result
        else:
            raise OSError("S3 ResourceHandle is not writable")