Coverage for python/lsst/resources/s3.py: 28%
228 statements
coverage.py v7.4.1, created at 2024-02-01 11:14 +0000
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import os
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from functools import cache, cached_property
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

try:
    from boto3.s3.transfer import TransferConfig  # type: ignore
except ImportError:
    TransferConfig = None

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file transfers.

    Parameters
    ----------
    file : `ResourcePath`
        Resource that is relevant to the progress percentage. The size of
        this resource will be used to determine progress. The name will be
        used in the log messages unless overridden by ``file_for_msg``.
    file_for_msg : `ResourcePath` or `None`, optional
        Resource name to include in log messages in preference to ``file``.
    msg : `str`, optional
        Message text to be included in every progress log message.
    """

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
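

# Illustrative sketch only (not executed): an instance of ProgressPercentage
# is intended to be passed as the ``Callback`` argument of boto3 transfer
# calls, which invoke it with the number of bytes moved in each chunk. The
# URI below is hypothetical and assumes valid S3 credentials.
#
#     source = ResourcePath("s3://some-bucket/some/key.dat")
#     progress = ProgressPercentage(source, msg="Downloading:")
#     progress(1024)  # logs: "Downloading: s3://... 1024 / <size> (N%)"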


def _translate_client_error(err: ClientError) -> None:
    """Translate a `ClientError` into a more specific error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` reports a 404 status.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message
        # but no direct way to access it without looking inside the
        # response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err


@cache
def _parse_string_to_maybe_bool(maybe_bool_str: str) -> bool | None:
    """Map a string to either a boolean value or `None`.

    Parameters
    ----------
    maybe_bool_str : `str`
        The value to parse.

    Returns
    -------
    maybe_bool : `bool` or `None`
        The parsed value.

    Raises
    ------
    ValueError
        Raised if the string does not correspond to a boolean or `None`.
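
    Examples
    --------
    The mapping for a few representative inputs (case-insensitive):

    >>> _parse_string_to_maybe_bool("yes")
    True
    >>> _parse_string_to_maybe_bool("F")
    False
    >>> _parse_string_to_maybe_bool("") is None
    True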
138 """
139 if maybe_bool_str.lower() in ["t", "true", "yes", "y", "1"]:
140 maybe_bool = True
141 elif maybe_bool_str.lower() in ["f", "false", "no", "n", "0"]:
142 maybe_bool = False
143 elif maybe_bool_str.lower() in ["none", ""]:
144 maybe_bool = None
145 else:
146 raise ValueError(f'Value of "{maybe_bool_str}" is not True, False, or None.')
148 return maybe_bool


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class.

    Notes
    -----
    To configure the behavior of instances of this class, the following
    environment variable is inspected:

    - LSST_S3_USE_THREADS: May be True, False, or None. Sets whether
      threading is used for downloads; a value of None defaults to boto's
      default behavior. Users may wish to set it to False when downloads
      will be started from threads other than Python's main thread.
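
    For example, setting ``LSST_S3_USE_THREADS=false`` in the environment
    disables boto's internal thread pool for downloads (illustrative; any
    of the spellings accepted by `_parse_string_to_maybe_bool` may be used).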
163 """
165 use_threads: bool | None = None
166 """Explicitly turn on or off threading in use of boto's download_fileobj.
167 Setting this to None results in boto's default behavior."""

    @cached_property
    def _environ_use_threads(self) -> bool | None:
        try:
            use_threads_str = os.environ["LSST_S3_USE_THREADS"]
        except KeyError:
            use_threads_str = "None"

        use_threads = _parse_string_to_maybe_bool(use_threads_str)

        return use_threads

    @property
    def _transfer_config(self) -> TransferConfig:
        if self.use_threads is None:
            self.use_threads = self._environ_use_threads

        if self.use_threads is None:
            transfer_config = TransferConfig()
        else:
            transfer_config = TransferConfig(use_threads=self.use_threads)

        return transfer_config

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()

    @classmethod
    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
        # Force client to be created before creating threads.
        getS3Client()

        return super()._mexists(uris)

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is HTTP 204 every time.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
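
    # Illustrative sketch only (not executed); the bucket and key are
    # hypothetical and valid S3 credentials are assumed. ``read(size=2)``
    # issues a ranged GET for the first two bytes.
    #
    #     uri = ResourcePath("s3://some-bucket/some/key.txt")
    #     uri.write(b"hello")
    #     assert uri.read() == b"hello"
    #     assert uri.read(size=2) == b"he"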

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for `_as_local` to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(
                self.netloc,
                self.relativeToPathRoot,
                local_file,
                Callback=progress,
                Config=self._transfer_config,
            )
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for `transfer_from`.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {src}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the source resource to this S3 resource.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: copy, move, auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
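
    # Illustrative sketch only (not executed); the names are hypothetical
    # and valid S3 credentials are assumed. A local file can be uploaded
    # with:
    #
    #     dest = ResourcePath("s3://some-bucket/data/file.dat")
    #     src = ResourcePath("file:///tmp/file.dat")
    #     dest.transfer_from(src, transfer="copy", overwrite=True)
    #
    # When ``src`` is itself an S3 URI, the server-side copy_object path
    # above is taken instead of a download/upload round trip.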

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
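
        Examples
        --------
        Illustrative sketch only; the bucket and layout are hypothetical::

            root = ResourcePath("s3://some-bucket/data/")
            for dirpath, dirnames, filenames in root.walk("fits$"):
                print(dirpath, filenames)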
447 """
448 # We pretend that S3 uses directories and files and not simply keys
449 if not (self.isdir() or self.is_root):
450 raise ValueError(f"Can not walk a non-directory URI: {self}")
452 if isinstance(file_filter, str):
453 file_filter = re.compile(file_filter)
455 s3_paginator = self.client.get_paginator("list_objects_v2")
457 # Limit each query to a single "directory" to match os.walk
458 # We could download all keys at once with no delimiter and work
459 # it out locally but this could potentially lead to large memory
460 # usage for millions of keys. It will also make the initial call
461 # to this method potentially very slow. If making this method look
462 # like os.walk was not required, we could query all keys with
463 # pagination and return them in groups of 1000, but that would
464 # be a different interface since we can't guarantee we would get
465 # them all grouped properly across the 1000 limit boundary.
466 prefix = self.relativeToPathRoot if not self.is_root else ""
467 prefix_len = len(prefix)
468 dirnames = []
469 filenames = []
470 files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub
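
    # Illustrative sketch only (not executed): the public ``open`` method
    # provided by the base class is expected to route to _openImpl, so text
    # and binary handles can be obtained as below (hypothetical bucket/key,
    # credentials assumed):
    #
    #     uri = ResourcePath("s3://some-bucket/notes.txt")
    #     with uri.open("r", encoding="utf-8") as fh:
    #         text = fh.read()
    #     with uri.open("rb") as fh:
    #         raw = fh.read()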

    def generate_presigned_get_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("get_object", expiration_time_seconds)

    def generate_presigned_put_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("put_object", expiration_time_seconds)

    def _generate_presigned_url(self, method: str, expiration_time_seconds: int) -> str:
        return self.client.generate_presigned_url(
            method,
            Params={"Bucket": self.netloc, "Key": self.relativeToPathRoot},
            ExpiresIn=expiration_time_seconds,
        )
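
    # Illustrative sketch only (not executed): presigned URLs let clients
    # without credentials read or write the object for a limited time. The
    # bucket and key below are hypothetical:
    #
    #     uri = ResourcePath("s3://some-bucket/data/file.dat")
    #     url = uri.generate_presigned_get_url(expiration_time_seconds=3600)
    #     # ``url`` can then be fetched with a plain HTTP GET for up to
    #     # an hour.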