Coverage for python/lsst/resources/s3.py: 28% (249 statements)

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import os
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from functools import cache, cached_property
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

try:
    from boto3.s3.transfer import TransferConfig  # type: ignore
except ImportError:
    TransferConfig = None

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads.

    Parameters
    ----------
    file : `ResourcePath`
        Resource that is relevant to the progress percentage. The size of this
        resource will be used to determine progress. The name will be used
        in the log messages unless overridden by ``file_for_msg``.
    file_for_msg : `ResourcePath` or `None`, optional
        Resource name to include in log messages in preference to ``file``.
    msg : `str`, optional
        Message text to be included in every progress log message.
    """

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
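

# Instances are intended to be passed as the ``Callback`` argument of boto3
# transfer calls, as done in ``_download_file`` and ``_upload_file`` below.
# A minimal sketch (hypothetical bucket and key):
#
#     src = ResourcePath("s3://bucket/data.fits")
#     progress = ProgressPercentage(src, msg="Downloading:")
#     with open("data.fits", "wb") as fh:
#         src.client.download_fileobj("bucket", "data.fits", fh, Callback=progress)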


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message
        # but no direct way to access it without looking inside the
        # response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
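
# Callers below follow the pattern
#
#     except ClientError as err:
#         _translate_client_error(err)
#         raise
#
# so that any ClientError not translated here propagates unchanged.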


@cache
def _parse_string_to_maybe_bool(maybe_bool_str: str) -> bool | None:
    """Map a string to either a boolean value or None.

    Parameters
    ----------
    maybe_bool_str : `str`
        The value to parse.

    Returns
    -------
    maybe_bool : `bool` or `None`
        The parsed value.
    """
    if maybe_bool_str.lower() in ["t", "true", "yes", "y", "1"]:
        maybe_bool = True
    elif maybe_bool_str.lower() in ["f", "false", "no", "n", "0"]:
        maybe_bool = False
    elif maybe_bool_str.lower() in ["none", ""]:
        maybe_bool = None
    else:
        raise ValueError(f'Value of "{maybe_bool_str}" is not True, False, or None.')

    return maybe_bool
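
# Illustrative mapping (values taken from the branches above):
#
#     _parse_string_to_maybe_bool("Y")     -> True
#     _parse_string_to_maybe_bool("false") -> False
#     _parse_string_to_maybe_bool("")      -> None
#     _parse_string_to_maybe_bool("maybe") -> ValueError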


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class.

    Notes
    -----
    The behavior of instances of this class can be configured through the
    following environment variable:

    - LSST_S3_USE_THREADS: May be True, False, or None. Sets whether threading
      is used for downloads, with a value of None falling back to boto's
      default. Users may wish to set it to False when downloads will be
      started from threads other than Python's main thread.
    """
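
    # For example (an assumption about shell usage, not enforced here):
    # ``export LSST_S3_USE_THREADS=false`` disables threaded downloads,
    # while leaving the variable unset keeps boto's default behavior.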

    use_threads: bool | None = None
    """Explicitly turn on or off threading in use of boto's download_fileobj.
    Setting this to None results in boto's default behavior."""

    @cached_property
    def _environ_use_threads(self) -> bool | None:
        try:
            use_threads_str = os.environ["LSST_S3_USE_THREADS"]
        except KeyError:
            use_threads_str = "None"

        use_threads = _parse_string_to_maybe_bool(use_threads_str)

        return use_threads

    @property
    def _transfer_config(self) -> TransferConfig:
        if self.use_threads is None:
            self.use_threads = self._environ_use_threads

        if self.use_threads is None:
            transfer_config = TransferConfig()
        else:
            transfer_config = TransferConfig(use_threads=self.use_threads)

        return transfer_config

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        return getS3Client(self._profile)

    @property
    def _profile(self) -> str | None:
        """Profile name to use for looking up S3 credentials and endpoint."""
        return self._uri.username

    @property
    def _bucket(self) -> str:
        """S3 bucket where the files are stored."""
        # Notionally the bucket is stored in the 'hostname' part of the URI.
        # However, Ceph S3 uses a "multi-tenant" syntax for bucket names in
        # the form 'tenant:bucket'. The part after the colon is parsed as the
        # port portion of the URI, and urllib throws an exception if you try
        # to read a non-integer port value. So manually split off this
        # portion of the URI.
        split = self._uri.netloc.split("@")
        num_components = len(split)
        if num_components == 2:
            # There is a profile@ portion of the URL, so take the second half.
            bucket = split[1]
        elif num_components == 1:
            # There is no profile@, so take the whole netloc.
            bucket = split[0]
        else:
            raise ValueError(f"Unexpected extra '@' in S3 URI: '{str(self)}'")

        if not bucket:
            raise ValueError(f"S3 URI does not include bucket name: '{str(self)}'")

        return bucket
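
    # Illustrative netloc parsing (hypothetical URIs):
    #
    #     s3://mybucket/key          -> bucket "mybucket"
    #     s3://profile@mybucket/key  -> bucket "mybucket", profile "profile"
    #     s3://tenant:bucket/key     -> bucket "tenant:bucket" (Ceph
    #                                   multi-tenant form)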

    @classmethod
    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
        # Force client to be created for each profile before creating threads.
        profiles = set[str | None]()
        for path in uris:
            if path.scheme == "s3":
                path = cast(S3ResourcePath, path)
                profiles.add(path._profile)
        for profile in profiles:
            getS3Client(profile)

        return super()._mexists(uris)

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self._bucket, self.client)
        exists, _ = s3CheckFileExists(self, bucket=self._bucket, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, bucket=self._bucket, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is always HTTP 204.
        try:
            self.client.delete_object(Bucket=self._bucket, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size - 1}"
        try:
            response = self.client.get_object(Bucket=self._bucket, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body
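
    # A positive ``size`` maps to an HTTP range request; for example
    # (hypothetical key) ``ResourcePath("s3://bucket/file").read(size=100)``
    # sends ``Range: bytes=0-99`` and returns only the first 100 bytes.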

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self._bucket, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self._bucket, self.client):
            raise ValueError(f"Bucket {self._bucket} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Cannot create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self._bucket, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(
                self._bucket,
                self.relativeToPathRoot,
                local_file,
                Callback=progress,
                Config=self._transfer_config,
            )
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self._bucket, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: S3ResourcePath) -> None:
        copy_source = {
            "Bucket": src._bucket,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self._bucket, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, move, and auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI, so we can use direct copy. Note
            # that boto3.resource.meta.copy is cleverer than the low-level
            # copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource, so
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
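
    # A minimal usage sketch (hypothetical URIs):
    #
    #     dest = ResourcePath("s3://bucket/data.fits")
    #     dest.transfer_from(ResourcePath("file:///tmp/data.fits"), transfer="copy")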

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter the returned files; only file names matching
            this pattern are included.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Cannot walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk. We could
        # download all keys at once with no delimiter and work it out locally,
        # but this could potentially lead to large memory usage for millions
        # of keys. It will also make the initial call to this method
        # potentially very slow. If making this method look like os.walk was
        # not required, we could query all keys with pagination and return
        # them in groups of 1000, but that would be a different interface
        # since we can't guarantee we would get them all grouped properly
        # across the 1000-limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self._bucket, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
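
    # Usage mirrors os.walk (hypothetical bucket layout):
    #
    #     root = ResourcePath("s3://bucket/data/")
    #     for dirpath, dirnames, filenames in root.walk(r"\.fits$"):
    #         ...  # filenames only includes names matching the regex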

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self._bucket, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub

    def generate_presigned_get_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited
        return self._generate_presigned_url("get_object", expiration_time_seconds)

    def generate_presigned_put_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited
        return self._generate_presigned_url("put_object", expiration_time_seconds)

    def _generate_presigned_url(self, method: str, expiration_time_seconds: int) -> str:
        return self.client.generate_presigned_url(
            method,
            Params={"Bucket": self._bucket, "Key": self.relativeToPathRoot},
            ExpiresIn=expiration_time_seconds,
        )
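
    # Example (hypothetical key): a one-hour download link.
    #
    #     url = ResourcePath("s3://bucket/file").generate_presigned_get_url(
    #         expiration_time_seconds=3600
    #     )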