Coverage for python/lsst/resources/s3.py: 89%
194 statements

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol

log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads.

    Parameters
    ----------
    file : `ResourcePath`
        Resource that is relevant to the progress percentage. The size of this
        resource will be used to determine progress. The name will be used
        in the log messages unless overridden by ``file_for_msg``.
    file_for_msg : `ResourcePath` or `None`, optional
        Resource name to include in log messages in preference to ``file``.
    msg : `str`, optional
        Message text to be included in every progress log message.
    """

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
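

# A minimal usage sketch (the bucket name and local path are hypothetical).
# boto3 transfer calls accept this object via their ``Callback`` argument and
# invoke it with the byte count of each chunk transferred:
#
#     src = ResourcePath("/tmp/catalog.fits")
#     progress = ProgressPercentage(src, msg="Uploading:")
#     getS3Client().upload_file(
#         src.ospath, "demo-bucket", "data/catalog.fits", Callback=progress
#     )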


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message
        # but there is no direct way to access it without looking
        # inside the response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
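

# A minimal sketch of how the translation behaves (the error below is
# constructed by hand; botocore renders the error code in parentheses in the
# message, which is what the string test above keys on):
#
#     from botocore.exceptions import ClientError
#
#     err = ClientError(
#         {"Error": {"Code": "429", "Message": "Too Many Requests"}}, "GetObject"
#     )
#     str(err)  # "An error occurred (429) when calling the GetObject ..."
#     _translate_client_error(err)  # raises _TooManyRequestsError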


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()

    @classmethod
    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
        # Force client to be created before creating threads.
        getS3Client()

        return super()._mexists(uris)
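
    # A usage sketch via the public ``mexists`` front end, which is assumed
    # here to fan out to this helper (the URIs are hypothetical). Creating
    # the client up front stops the worker threads from racing to construct
    # it:
    #
    #     uris = [
    #         ResourcePath("s3://demo-bucket/data/a.txt"),
    #         ResourcePath("s3://demo-bucket/data/b.txt"),
    #     ]
    #     existence = ResourcePath.mexists(uris)  # {ResourcePath: bool}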

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is always HTTP 204 (No Content).
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
        return body
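
    # A ranged-read sketch (hypothetical URI). ``size`` maps directly onto an
    # HTTP Range header, so only the requested prefix of the object travels
    # over the wire:
    #
    #     uri = ResourcePath("s3://demo-bucket/data/catalog.txt")
    #     header_bytes = uri.read(size=64)  # sends "Range: bytes=0-63"
    #     all_bytes = uri.read()            # no Range header; whole object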

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
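
    # A usage sketch (hypothetical URI). The existence check and the put are
    # two separate requests, so ``overwrite=False`` is advisory rather than
    # atomic under concurrent writers:
    #
    #     uri = ResourcePath("s3://demo-bucket/data/new.txt")
    #     uri.write(b"hello")                   # create or replace
    #     uri.write(b"hello", overwrite=False)  # raises FileExistsError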

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True
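
    # A sketch of the public ``as_local`` front end that drives this helper
    # (the URI and the ``process`` callable are hypothetical). The context
    # manager yields a temporary local copy and removes it on exit:
    #
    #     uri = ResourcePath("s3://demo-bucket/data/image.fits")
    #     with uri.as_local() as local:
    #         process(local.ospath)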

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not make them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource,
        # so try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
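
    # A usage sketch (hypothetical URIs). An S3-to-S3 transfer turns into a
    # server-side copy_object call; any other source is materialized as a
    # local file and uploaded:
    #
    #     dest = ResourcePath("s3://demo-bucket/archive/catalog.fits")
    #     dest.transfer_from(ResourcePath("/tmp/catalog.fits"), transfer="copy")
    #     dest.transfer_from(
    #         ResourcePath("s3://demo-bucket/staging/catalog.fits"),
    #         transfer="move",  # server-side copy, then source removal
    #         overwrite=True,
    #     )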

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this URI effectively does not
        # exist and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
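
    # A usage sketch (hypothetical URI; ``forceDirectory`` is how the
    # ResourcePath constructor marks a dir-like path). The generator mirrors
    # os.walk but yields a ResourcePath for the directory component:
    #
    #     root = ResourcePath("s3://demo-bucket/data/", forceDirectory=True)
    #     for dirpath, dirnames, filenames in root.walk(file_filter=r"\.fits$"):
    #         for name in filenames:
    #             print(dirpath.join(name))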

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # Cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub
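
    # A sketch of the public ``open`` front end that drives this method
    # (hypothetical URI). Binary modes yield the raw S3 handle; text modes
    # wrap it in a TextIOWrapper:
    #
    #     uri = ResourcePath("s3://demo-bucket/data/notes.txt")
    #     with uri.open("r", encoding="utf-8") as fh:
    #         first_line = fh.readline()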

    def generate_presigned_get_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("get_object", expiration_time_seconds)

    def generate_presigned_put_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("put_object", expiration_time_seconds)

    def _generate_presigned_url(self, method: str, expiration_time_seconds: int) -> str:
        return self.client.generate_presigned_url(
            method,
            Params={"Bucket": self.netloc, "Key": self.relativeToPathRoot},
            ExpiresIn=expiration_time_seconds,
        )
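
    # A usage sketch (hypothetical URI). The returned URL embeds the request
    # signature, so any plain HTTP client can use it until the expiry lapses:
    #
    #     import urllib.request
    #
    #     uri = ResourcePath("s3://demo-bucket/data/catalog.txt")
    #     url = uri.generate_presigned_get_url(expiration_time_seconds=3600)
    #     payload = urllib.request.urlopen(url).read()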