Coverage for python / lsst / resources / s3utils.py: 25%
147 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-17 08:44 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-17 08:44 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = (
15 "_TooManyRequestsError",
16 "all_retryable_errors",
17 "backoff",
18 "bucketExists",
19 "clean_test_environment_for_s3",
20 "getS3Client",
21 "max_retry_time",
22 "retryable_client_errors",
23 "retryable_io_errors",
24 "s3CheckFileExists",
25)
27import functools
28import os
29import re
30import urllib.parse
31from collections.abc import Callable, Iterator
32from contextlib import contextmanager
33from http.client import HTTPException, ImproperConnectionState
34from types import ModuleType
35from typing import Any, NamedTuple, cast
36from unittest.mock import patch
38from botocore.exceptions import ClientError
39from botocore.handlers import validate_bucket_name
40from urllib3.exceptions import HTTPError, RequestError
41from urllib3.util import Url, parse_url
43try:
44 import boto3
45except ImportError:
46 boto3 = None
48try:
49 import botocore
50except ImportError:
51 botocore = None
54from ._resourcePath import ResourcePath
55from .location import Location
56from .utils import _get_num_workers
58# https://pypi.org/project/backoff/
59try:
60 import backoff
61except ImportError:
    class Backoff:
        """No-op stand-in for the optional ``backoff`` package.

        Only the two entry points this module uses are mocked; both
        return the given callable unchanged, so when ``backoff`` is not
        installed these decorator-level retries are silently disabled.
        """

        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            # No backoff available: hand the callable back undecorated.
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            # No backoff available: hand the callable back undecorated.
            return func

    # Present the mock under the module-typed name callers expect.
    backoff = cast(ModuleType, Backoff)
class _TooManyRequestsError(Exception):
    """Internal exception used to drive retries on HTTP 429 responses.

    botocore refuses to handle 429 itself and surfaces it as a generic
    ``ClientError``, so this dedicated type exists for the retry
    (backoff) machinery to catch.
    """
# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders along with the retries built into boto3, to
# account for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
    # private exception raised by this module for HTTP 429 responses
    _TooManyRequestsError,
)

# ClientError can indicate NoSuchKey, so a blanket retry may not be the
# right thing. This set may require more consideration if it is to be
# used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into an easy package. For now, client errors are
# deliberately excluded from the combined set.
all_retryable_errors = retryable_io_errors

# Time budget for the backoff retry decorators, in seconds — presumably
# passed as backoff's max_time; confirm at the decorated call sites.
max_retry_time = 60
@contextmanager
def clean_test_environment_for_s3() -> Iterator[None]:
    """Temporarily scrub S3 configuration from the process environment
    so that unit tests using a mock S3 cannot accidentally reference
    real infrastructure.
    """
    fake_credentials = {
        "AWS_ACCESS_KEY_ID": "test-access-key",
        "AWS_SECRET_ACCESS_KEY": "test-secret-access-key",
        "AWS_DEFAULT_REGION": "us-east-1",
    }
    unwanted_variables = (
        "S3_ENDPOINT_URL",
        "AWS_SECURITY_TOKEN",
        "AWS_SESSION_TOKEN",
        "AWS_PROFILE",
        "AWS_SHARED_CREDENTIALS_FILE",
        "AWS_CONFIG_FILE",
    )
    with patch.dict(os.environ, fake_credentials) as patched_environ:
        for name in unwanted_variables:
            patched_environ.pop(name, None)
        # Drop any cached boto3 S3 client instances. A client created
        # before the moto mocks were installed would bypass the mocks
        # entirely, so force re-creation inside the patched environment.
        _get_s3_client.cache_clear()
        yield
def getS3Client(profile: str | None = None) -> boto3.client:
    """Create an S3 client for AWS (the default) or a configured endpoint.

    Parameters
    ----------
    profile : `str`, optional
        Name of an S3 profile selecting which S3 service to use.

    Returns
    -------
    s3client : `botocore.client.S3`
        A client of the S3 service.

    Notes
    -----
    When an explicit profile name is given, its configuration is read
    from an environment variable named
    ``LSST_RESOURCES_S3_PROFILE_<profile>`` if that variable exists (the
    profile part of the name is case sensitive). The value has the form
    ``https://<access key ID>:<secret key>@<s3 endpoint hostname>``; any
    slashes inside the access key ID or secret key must be URI-encoded
    (replace "/" with "%2F").

    When ``profile`` is `None` or its environment variable is unset, the
    configuration comes from the ``S3_ENDPOINT_URL`` environment
    variable, falling back to the default AWS endpoint when that too is
    unset.

    The access key ID and secret key are optional; when absent they are
    resolved through the `AWS credentials file
    <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html>`_.

    If the environment variable ``LSST_DISABLE_BUCKET_VALIDATION`` exists
    and has a value that is not empty, "0", "f", "n", or "false"
    (case-insensitive), bucket name validation is disabled. Disabling it
    allows Ceph multi-tenancy colon separators to appear in bucket names.
    """
    # Both optional modules must be importable before we can proceed.
    for module, module_name in ((boto3, "boto3"), (botocore, "botocore")):
        if module is None:
            raise ModuleNotFoundError(f"Could not find {module_name}. Are you sure it is installed?")

    endpoint_config = _get_s3_connection_parameters(profile)
    return _get_s3_client(endpoint_config, not _s3_should_validate_bucket())
def _s3_should_validate_bucket() -> bool:
    """Report whether bucket name validation should remain enabled.

    Returns
    -------
    validate : `bool`
        `True` if bucket names should be validated, i.e. when
        ``LSST_DISABLE_BUCKET_VALIDATION`` is unset, empty, or one of
        "0", "f", "n", "false" (case-insensitive).
    """
    flag = os.environ.get("LSST_DISABLE_BUCKET_VALIDATION", "0")
    # An empty string or an explicit "falsy" word keeps validation on.
    return re.search(r"^(0|f|n|false)?$", flag, re.I) is not None
def _get_s3_connection_parameters(profile: str | None = None) -> _EndpointConfig:
    """Resolve the connection details for an S3 endpoint.

    Parameters
    ----------
    profile : `str`, optional
        Name of an S3 profile describing which S3 service to use.

    Returns
    -------
    config : `_EndpointConfig`
        All the information necessary to connect to the bucket.
    """
    endpoint: str | None = None
    if profile is not None:
        # A per-profile override takes precedence; the profile part of
        # the variable name is case sensitive.
        endpoint = os.environ.get(f"LSST_RESOURCES_S3_PROFILE_{profile}")
    if not endpoint:
        # Fall back to the generic endpoint variable, treating an empty
        # string the same as unset.
        endpoint = os.environ.get("S3_ENDPOINT_URL") or None
    return _parse_endpoint_config(endpoint, profile)
def _s3_disable_bucket_validation(client: boto3.client) -> None:
    """Disable the bucket name validation in the client.

    This removes the ``validate_bucket_name`` handler from the handlers
    registered for this client, so requests may use bucket names that
    fail AWS validation (e.g. Ceph multi-tenancy names with a colon).

    Parameters
    ----------
    client : `boto3.client`
        The client to modify.
    """
    client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)
@functools.lru_cache
def _get_s3_client(endpoint_config: _EndpointConfig, skip_validation: bool) -> boto3.client:
    """Construct (and memoize) a boto3 S3 client for the given endpoint.

    Parameters
    ----------
    endpoint_config : `_EndpointConfig`
        Connection details for the target S3 service.
    skip_validation : `bool`
        If `True`, remove boto3's bucket-name validation handler from
        the new client.

    Returns
    -------
    client : `boto3.client`
        The configured S3 client.
    """
    # boto seems to assume it will always have at least 10 pooled
    # connections available, so never go below that.
    pool_size = max(_get_num_workers(), 10)
    client_config = botocore.config.Config(
        read_timeout=180,
        max_pool_connections=pool_size,
        retries={"mode": "adaptive", "max_attempts": 10},
    )

    session = boto3.Session(profile_name=endpoint_config.profile)
    client = session.client(
        "s3",
        endpoint_url=endpoint_config.endpoint_url,
        aws_access_key_id=endpoint_config.access_key_id,
        aws_secret_access_key=endpoint_config.secret_access_key,
        config=client_config,
    )
    if skip_validation:
        _s3_disable_bucket_validation(client)
    return client
class _EndpointConfig(NamedTuple):
    """Connection details for an S3 endpoint, produced by
    `_parse_endpoint_config`.
    """

    # Endpoint URL with any user:password auth portion stripped;
    # `None` means use the default AWS endpoint.
    endpoint_url: str | None = None
    # Explicit credentials; `None` defers to boto3's normal lookup.
    access_key_id: str | None = None
    secret_access_key: str | None = None
    # boto3 profile name; cleared when explicit credentials are present.
    profile: str | None = None
def _parse_endpoint_config(endpoint: str | None, profile: str | None = None) -> _EndpointConfig:
    """Split an endpoint specification into its component parts.

    Parameters
    ----------
    endpoint : `str` or `None`
        Endpoint string, optionally of the form
        ``scheme://<key>:<secret>@<host>``. `None` or empty selects the
        default configuration.
    profile : `str`, optional
        Profile name to associate with the configuration.

    Returns
    -------
    config : `_EndpointConfig`
        Parsed connection details.

    Raises
    ------
    ValueError
        Raised if the auth portion is not a ``key:secret`` pair.
    """
    if not endpoint:
        return _EndpointConfig(profile=profile)

    parsed = parse_url(endpoint)

    # Rebuild the URL without the username/password portion.
    endpoint_url = Url(host=parsed.host, path=parsed.path, port=parsed.port, scheme=parsed.scheme).url

    access_key_id = secret_access_key = None
    if parsed.auth:
        pieces = parsed.auth.split(":")
        if len(pieces) != 2:
            raise ValueError("S3 access key and secret not in expected format.")
        # Slashes inside the credentials must arrive URI-encoded; undo that.
        access_key_id, secret_access_key = (urllib.parse.unquote(piece) for piece in pieces)

    if access_key_id is not None and secret_access_key is not None:
        # We already have the necessary configuration for the profile, so
        # do not pass the profile to boto3: boto3 raises if the profile
        # is not defined in its configuration file, whether or not it
        # actually needs to read the configuration from it.
        profile = None

    return _EndpointConfig(
        endpoint_url=endpoint_url,
        access_key_id=access_key_id,
        secret_access_key=secret_access_key,
        profile=profile,
    )
def s3CheckFileExists(
    path: Location | ResourcePath | str,
    bucket: str | None = None,
    client: boto3.client | None = None,
) -> tuple[bool, int]:
    """Report whether the file exists in the bucket, and its size.

    Parameters
    ----------
    path : `Location`, `ResourcePath` or `str`
        Location or ResourcePath containing the bucket name and filepath.
    bucket : `str`, optional
        Name of the bucket in which to look. If provided, ``path`` is
        assumed to be relative to the given bucket.
    client : `boto3.client`, optional
        S3 Client object to query; if not supplied boto3 will try to
        resolve the credentials in the order described in its manual_.

    Returns
    -------
    exists : `bool`
        `True` if the key exists, `False` otherwise.
    size : `int`
        Size of the key in bytes if it exists, otherwise -1.

    Raises
    ------
    PermissionError
        Raised when the HEAD request is forbidden (HTTP 403).
    TypeError
        Raised for an unsupported ``path`` type.

    Notes
    -----
    S3 paths are sensitive to leading and trailing path separators.

    .. _manual: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/\
configuration.html#configuring-credentials
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    if client is None:
        client = getS3Client()

    if isinstance(path, ResourcePath | Location):
        if bucket is None:
            bucket = path.netloc
        filepath = path.relativeToPathRoot
    elif isinstance(path, str):
        if bucket is None:
            uri = ResourcePath(path)
            bucket = uri.netloc
            filepath = uri.relativeToPathRoot
        else:
            filepath = path
    else:
        raise TypeError(f"Unsupported path type: {path!r}.")

    try:
        response = client.head_object(Bucket=bucket, Key=filepath)
    except client.exceptions.ClientError as err:
        status = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # A resource-unreachable (404) error means the key does not exist.
        if status == 404:
            return (False, -1)
        # head_object only returns 404 for a missing object when the user
        # has s3:ListBucket permission; without list permission a 403 is
        # returned instead. In practice that usually means the file does
        # not exist, but it could also mean the user lacks s3:GetObject:
        # https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
        # There is no certain way to discern which case applies.
        if status == 403:
            raise PermissionError(
                "Forbidden HEAD operation error occurred. "
                "Verify s3:ListBucket and s3:GetObject "
                "permissions are granted for your IAM user. "
            ) from err
        if status == 429:
            # boto3, incorrectly, does not automatically retry with 429,
            # so raise an explicit retry exception for backoff instead.
            raise _TooManyRequestsError(str(err)) from err
        raise
    else:
        return (True, response["ContentLength"])
def bucketExists(bucketName: str, client: boto3.client | None = None) -> bool:
    """Check whether the S3 bucket with the given name actually exists.

    Parameters
    ----------
    bucketName : `str`
        Name of the S3 Bucket.
    client : `boto3.client`, optional
        S3 Client object to query; if not supplied boto3 will try to
        resolve the credentials by calling `getS3Client`.

    Returns
    -------
    exists : `bool`
        `True` if it exists, `False` if no Bucket with the specified
        parameters is found.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    if client is None:
        client = getS3Client()
    try:
        client.get_bucket_location(Bucket=bucketName)
    except client.exceptions.NoSuchBucket:
        # The service explicitly reported the bucket as absent.
        return False
    return True
def translate_client_error(err: ClientError, uri: ResourcePath) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.
    uri : `ResourcePath`
        The URI of the resource that is resulting in the error.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404 not-found response.
    """
    # ClientError embeds the HTTP status code in its message but offers
    # no direct accessor short of digging into the response payload.
    message = str(err)
    if "(429)" in message:
        raise _TooManyRequestsError(f"{err} when accessing {uri}") from err
    if "(404)" in message:
        # Some systems generate a generic 404 rather than NoSuchKey.
        # Bug fix: the previous message mislabelled this as "permission
        # denied"; a 404 means the resource was not found. Also chain
        # the original error, matching the 429 branch above.
        raise FileNotFoundError(f"Resource not found: {uri}") from err