Coverage for python/lsst/resources/s3utils.py: 26%
158 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-16 02:51 -0700
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-16 02:51 -0700
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = (
15 "clean_test_environment",
16 "getS3Client",
17 "s3CheckFileExists",
18 "bucketExists",
19 "setAwsEnvCredentials",
20 "unsetAwsEnvCredentials",
21 "backoff",
22 "all_retryable_errors",
23 "max_retry_time",
24 "retryable_io_errors",
25 "retryable_client_errors",
26 "_TooManyRequestsError",
27 "clean_test_environment_for_s3",
28)
30import functools
31import os
32import re
33import urllib.parse
34from collections.abc import Callable, Iterator
35from contextlib import contextmanager
36from http.client import HTTPException, ImproperConnectionState
37from types import ModuleType
38from typing import TYPE_CHECKING, Any, NamedTuple, cast
39from unittest.mock import patch
41from botocore.exceptions import ClientError
42from botocore.handlers import validate_bucket_name
43from deprecated.sphinx import deprecated
44from urllib3.exceptions import HTTPError, RequestError
45from urllib3.util import Url, parse_url
47if TYPE_CHECKING:
48 from unittest import TestCase
51try:
52 import boto3
53except ImportError:
54 boto3 = None
56try:
57 import botocore
58except ImportError:
59 botocore = None
62from ._resourcePath import ResourcePath
63from .location import Location
# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        """No-op stand-in used when the optional ``backoff`` package is absent.

        Only the attributes needed to write
        ``@backoff.on_exception(backoff.expo, ...)`` are provided, and no
        retrying is performed: decorated functions are returned unchanged.
        """

        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            """Return the first argument unchanged.

            This only exists so that ``backoff.expo`` can be named as the
            wait generator argument of `on_exception`.
            """
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            """Return a decorator that leaves the decorated function as is.

            Parameters
            ----------
            func : `~collections.abc.Callable`
                Wait generator, i.e. the first positional argument of a
                ``backoff.on_exception(...)`` call. Accepted and ignored.
            *args : `~typing.Any`
                Ignored.
            **kwargs : `~typing.Any`
                Ignored.

            Returns
            -------
            decorator : `~collections.abc.Callable`
                Function that returns whatever callable it is given.
            """

            # Do not hand back ``func`` itself: that only works as a decorator
            # when ``func`` happens to be an identity function such as `expo`.
            def passthrough(target: Callable) -> Callable:
                return target

            return passthrough

    backoff = cast(ModuleType, Backoff)
84class _TooManyRequestsError(Exception):
85 """Private exception that can be used for 429 retry.
87 botocore refuses to deal with 429 error itself so issues a generic
88 ClientError.
89 """
91 pass
# Configuration for the "backoff" retry decorators. These retries are a
# belt-and-suspenders layer on top of the retries built into Boto3, covering
# differences in error semantics between S3-like providers.
retryable_io_errors = (
    # From http.client.
    ImproperConnectionState,
    HTTPException,
    # From urllib3.exceptions.
    RequestError,
    HTTPError,
    # Python built-ins.
    TimeoutError,
    ConnectionError,
    # Defined in this module.
    _TooManyRequestsError,
)

# A ClientError can also stand for NoSuchKey, in which case a retry may not
# be the right response. Think this through before using this tuple.
retryable_client_errors = (
    # From botocore.exceptions.
    ClientError,
    # Python built-ins.
    PermissionError,
)


# Single bundle of everything considered retryable. Client errors are
# left out for now.
all_retryable_errors = retryable_io_errors
# Time budget handed to the retry decorators.
# NOTE(review): presumably seconds -- confirm at the call sites.
max_retry_time = 60
@deprecated(
    reason="This has been replaced by a new function, clean_test_environment_for_s3()."
    " Will be removed after v26.2023.5000",
    version="26.2023.5000",
    category=FutureWarning,
)
def clean_test_environment(testcase: TestCase) -> None:
    """Blank out ``S3_ENDPOINT_URL`` until the given test has finished.

    Parameters
    ----------
    testcase : `unittest.TestCase`
        Test on which the callback restoring the variable is registered.

    Notes
    -----
    Nothing is changed, and no cleanup is registered, when the variable is
    unset or already empty.
    """
    saved = os.environ.get("S3_ENDPOINT_URL")
    if saved:
        os.environ["S3_ENDPOINT_URL"] = ""

        def restore_endpoint() -> None:
            # Put the value seen at entry back into the environment.
            os.environ["S3_ENDPOINT_URL"] = saved

        testcase.addCleanup(restore_endpoint)
@contextmanager
def clean_test_environment_for_s3() -> Iterator[None]:
    """Provide a sanitized environment for unit tests that use a mock S3.

    While the context is active ``os.environ`` holds dummy AWS credentials
    and the variables that could point at real infrastructure are removed,
    so that a test cannot reach it by accident. The cached S3 clients are
    discarded before control is handed to the caller.
    """
    fake_credentials = {
        "AWS_ACCESS_KEY_ID": "test-access-key",
        "AWS_SECRET_ACCESS_KEY": "test-secret-access-key",
    }
    unwanted = (
        "S3_ENDPOINT_URL",
        "AWS_SECURITY_TOKEN",
        "AWS_SESSION_TOKEN",
        "AWS_PROFILE",
        "AWS_SHARED_CREDENTIALS_FILE",
        "AWS_CONFIG_FILE",
    )
    with patch.dict(os.environ, fake_credentials) as environ:
        for name in unwanted:
            environ.pop(name, None)
        # Drop any cached boto3 S3 clients. A client created before the moto
        # mocks were installed would otherwise be reused, and the mocks
        # would not take effect for it.
        _get_s3_client.cache_clear()
        yield
def getS3Client(profile: str | None = None) -> boto3.client:
    """Return an S3 client for AWS (the default) or a configured endpoint.

    Parameters
    ----------
    profile : `str`, optional
        Name of the S3 profile that selects the S3 service to talk to.

    Returns
    -------
    s3client : `botocore.client.S3`
        Client for the selected S3 service.

    Raises
    ------
    ModuleNotFoundError
        Raised if boto3 or botocore could not be imported.

    Notes
    -----
    When a profile name is given, its configuration is taken from the
    environment variable ``LSST_RESOURCES_S3_PROFILE_<profile>`` if that is
    set. The profile name is case sensitive. The expected format of the value
    is ``https://<access key ID>:<secret key>@<s3 endpoint hostname>``.
    Slashes inside the access key ID or the secret key have to be URI-encoded
    (write "%2F" instead of "/").

    Without a profile, or when the profile variable is not set, the
    configuration comes from ``S3_ENDPOINT_URL``. If that is not set either,
    the default AWS endpoint is used.

    The access key ID and secret key may be left out, in which case they are
    looked up via the `AWS credentials file
    <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html>`_.

    Bucket name validation is switched off when the environment variable
    LSST_DISABLE_BUCKET_VALIDATION is set to anything other than the empty
    string, "0", "f", "n", or "false" (compared case-insensitively). This
    permits the colon separators of Ceph multi-tenancy in bucket names.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")
    if botocore is None:
        raise ModuleNotFoundError("Could not find botocore. Are you sure it is installed?")

    # Profile-specific variable first, then the generic one; an empty string
    # counts as "not set" at every step.
    endpoint = os.environ.get(f"LSST_RESOURCES_S3_PROFILE_{profile}") if profile is not None else None
    endpoint = endpoint or os.environ.get("S3_ENDPOINT_URL") or None

    flag = os.environ.get("LSST_DISABLE_BUCKET_VALIDATION", "0")
    skip_validation = re.search(r"^(0|f|n|false)?$", flag, re.I) is None

    return _get_s3_client(endpoint, profile, skip_validation)
@functools.lru_cache
def _get_s3_client(endpoint: str | None, profile: str | None, skip_validation: bool) -> boto3.client:
    """Build the S3 client for an endpoint, memoized per argument set.

    Parameters
    ----------
    endpoint : `str` or `None`
        Endpoint specification, optionally with embedded credentials.
    profile : `str` or `None`
        Profile name given to the boto3 session when the endpoint carries no
        complete credentials.
    skip_validation : `bool`
        If `True`, the bucket name validation handler is unregistered from
        the new client.

    Returns
    -------
    client : `boto3.client`
        The S3 client.
    """
    config = botocore.config.Config(read_timeout=180, retries={"mode": "adaptive", "max_attempts": 10})
    parsed = _parse_endpoint_config(endpoint)

    # With both credentials in hand the profile is not needed, and it must
    # not be passed on: boto3 raises for a profile missing from its
    # configuration file even if nothing would be read from it.
    has_credentials = parsed.access_key_id is not None and parsed.secret_access_key is not None
    session = boto3.Session(profile_name=None if has_credentials else profile)

    client = session.client(
        "s3",
        endpoint_url=parsed.endpoint_url,
        aws_access_key_id=parsed.access_key_id,
        aws_secret_access_key=parsed.secret_access_key,
        config=config,
    )
    if skip_validation:
        client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)
    return client
266class _EndpointConfig(NamedTuple):
267 endpoint_url: str | None = None
268 access_key_id: str | None = None
269 secret_access_key: str | None = None
def _parse_endpoint_config(endpoint: str | None) -> _EndpointConfig:
    """Split an endpoint specification into URL and embedded credentials.

    Parameters
    ----------
    endpoint : `str` or `None`
        Endpoint URL, optionally of the form
        ``scheme://<access key ID>:<secret key>@host``.

    Returns
    -------
    config : `_EndpointConfig`
        The URL with the credential part removed, plus the URI-decoded
        credentials. All fields are `None` for an empty or missing endpoint.

    Raises
    ------
    ValueError
        Raised if the credential part does not consist of exactly two
        colon-separated items.
    """
    if not endpoint:
        return _EndpointConfig()

    url = parse_url(endpoint)

    # Rebuild the URL from its parts, leaving out the user/password section.
    bare_url = Url(scheme=url.scheme, host=url.host, port=url.port, path=url.path).url

    key_id = None
    secret = None
    if url.auth:
        parts = url.auth.split(":")
        if len(parts) != 2:
            raise ValueError("S3 access key and secret not in expected format.")
        key_id, secret = (urllib.parse.unquote(part) for part in parts)

    return _EndpointConfig(endpoint_url=bare_url, access_key_id=key_id, secret_access_key=secret)
def s3CheckFileExists(
    path: Location | ResourcePath | str,
    bucket: str | None = None,
    client: boto3.client | None = None,
) -> tuple[bool, int]:
    """Report whether a key exists in a bucket, and how large it is.

    Parameters
    ----------
    path : `Location`, `ResourcePath` or `str`
        Location or ResourcePath holding the bucket name and file path.
    bucket : `str`, optional
        Name of the bucket to look in. If given, ``path`` is taken to be
        relative to this bucket.
    client : `boto3.client`, optional
        S3 client to query. If not given, one is obtained from
        `getS3Client`, with credentials resolved as described in the `boto3
        credentials guide
        <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html>`_.

    Returns
    -------
    exists : `bool`
        `True` if the key exists, `False` otherwise.
    size : `int`
        Size of the key in bytes if it exists, otherwise -1.

    Raises
    ------
    ModuleNotFoundError
        Raised if boto3 could not be imported.
    TypeError
        Raised if ``path`` is of an unsupported type.
    PermissionError
        Raised if the HEAD request is answered with status 403.
    _TooManyRequestsError
        Raised if the HEAD request is answered with status 429.

    Notes
    -----
    S3 Paths are sensitive to leading and trailing path separators.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    if client is None:
        client = getS3Client()

    if isinstance(path, str):
        if bucket is None:
            uri = ResourcePath(path)
            bucket = uri.netloc
            filepath = uri.relativeToPathRoot
        else:
            # The caller named the bucket, so the string is already the key.
            filepath = path
    elif isinstance(path, ResourcePath | Location):
        bucket = path.netloc if bucket is None else bucket
        filepath = path.relativeToPathRoot
    else:
        raise TypeError(f"Unsupported path type: {path!r}.")

    try:
        head = client.head_object(Bucket=bucket, Key=filepath)
    except client.exceptions.ClientError as err:
        status = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # A "resource unreachable" answer means that the key does not exist.
        if status == 404:
            return (False, -1)
        if status == 429:
            # boto3, incorrectly, does not retry on 429 by itself, so raise
            # an explicit retry exception for backoff to act on.
            raise _TooManyRequestsError(str(err)) from err
        if status != 403:
            raise
        # head_object answers 404 for a missing object only if the user has
        # the s3:ListBucket permission; without it the answer is 403. In
        # practice that usually means the file does not exist, but it can
        # also mean that s3:GetObject is missing:
        # https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
        # The two cases probably cannot be told apart with certainty.
        raise PermissionError(
            "Forbidden HEAD operation error occurred. "
            "Verify s3:ListBucket and s3:GetObject "
            "permissions are granted for your IAM user. "
        ) from err
    return (True, head["ContentLength"])
def bucketExists(bucketName: str, client: boto3.client | None = None) -> bool:
    """Tell whether an S3 bucket of the given name exists.

    Parameters
    ----------
    bucketName : `str`
        Name of the S3 bucket.
    client : `boto3.client`, optional
        S3 client to query. If not given, one is obtained by calling
        `getS3Client`.

    Returns
    -------
    exists : `bool`
        `True` if the bucket exists, `False` if the service reports that
        there is no such bucket.

    Raises
    ------
    ModuleNotFoundError
        Raised if boto3 could not be imported.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    s3 = getS3Client() if client is None else client
    try:
        s3.get_bucket_location(Bucket=bucketName)
    except s3.exceptions.NoSuchBucket:
        return False
    return True
@deprecated(
    reason="This function could accidentally leave real credentials in the environment during testing."
    " A new function, clean_test_environment_for_s3(), can be used to set up mock credentials."
    " Will be removed after v26.2023.5000",
    version="26.2023.5000",
    category=FutureWarning,
)
def setAwsEnvCredentials(
    accessKeyId: str = "dummyAccessKeyId", secretAccessKey: str = "dummySecretAccessKey"
) -> bool:
    """Fill in the AWS credential environment variables if any is missing.

    Parameters
    ----------
    accessKeyId : `str`
        Value to store in the AWS_ACCESS_KEY_ID environment variable.
        Defaults to `dummyAccessKeyId`.
    secretAccessKey : `str`
        Value to store in the AWS_SECRET_ACCESS_KEY environment variable.
        Defaults to `dummySecretAccessKey`.

    Returns
    -------
    setEnvCredentials : `bool`
        `True` if the environment variables were set, `False` otherwise.

    Notes
    -----
    As soon as one of AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY is not
    set, both are overwritten so that the pair stays consistent.
    """
    if all(name in os.environ for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")):
        # Both are already defined; leave them alone.
        return False
    os.environ["AWS_ACCESS_KEY_ID"] = accessKeyId
    os.environ["AWS_SECRET_ACCESS_KEY"] = secretAccessKey
    return True
@deprecated(
    reason="This has been replaced by a new function, clean_test_environment_for_s3()."
    " Will be removed after v26.2023.5000",
    version="26.2023.5000",
    category=FutureWarning,
)
def unsetAwsEnvCredentials() -> None:
    """Remove the AWS credential variables from the environment.

    AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are deleted if present; a
    variable that is not set is silently skipped.
    """
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
        os.environ.pop(name, None)