Coverage for python / lsst / resources / s3utils.py: 25%

147 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-17 08:44 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ( 

15 "_TooManyRequestsError", 

16 "all_retryable_errors", 

17 "backoff", 

18 "bucketExists", 

19 "clean_test_environment_for_s3", 

20 "getS3Client", 

21 "max_retry_time", 

22 "retryable_client_errors", 

23 "retryable_io_errors", 

24 "s3CheckFileExists", 

25) 

26 

27import functools 

28import os 

29import re 

30import urllib.parse 

31from collections.abc import Callable, Iterator 

32from contextlib import contextmanager 

33from http.client import HTTPException, ImproperConnectionState 

34from types import ModuleType 

35from typing import Any, NamedTuple, cast 

36from unittest.mock import patch 

37 

38from botocore.exceptions import ClientError 

39from botocore.handlers import validate_bucket_name 

40from urllib3.exceptions import HTTPError, RequestError 

41from urllib3.util import Url, parse_url 

42 

43try: 

44 import boto3 

45except ImportError: 

46 boto3 = None 

47 

48try: 

49 import botocore 

50except ImportError: 

51 botocore = None 

52 

53 

54from ._resourcePath import ResourcePath 

55from .location import Location 

56from .utils import _get_num_workers 

57 

# Optional dependency: https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        """Stand-in used when the ``backoff`` package cannot be imported.

        Both methods hand back their first argument untouched, so a
        decorator written as ``@backoff.on_exception(backoff.expo, ...)``
        leaves the decorated function unchanged and no retries happen.
        """

        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            """Return ``func`` unchanged; all other arguments are ignored."""
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            """Return ``func`` unchanged; all other arguments are ignored."""
            return func

    # Expose the stand-in under the module name so callers need not care
    # whether the real package is installed.
    backoff = cast(ModuleType, Backoff)

75 

76 

class _TooManyRequestsError(Exception):
    """Private exception signalling an HTTP 429 that should be retried.

    botocore does not treat a 429 response specially and reports it as a
    generic ``ClientError``; this dedicated type lets the retry logic
    single the condition out.
    """

85 

86 

# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders on top of the retries built into Boto3, to account
# for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
    # private
    _TooManyRequestsError,
)

# A client error can also be a NoSuchKey, in which case retrying may not
# be the right thing to do. This needs more consideration before use.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)


# Single name for everything that should be retried. For now the client
# errors above are deliberately not part of it.
all_retryable_errors = retryable_io_errors
max_retry_time = 60

118 

119 

@contextmanager
def clean_test_environment_for_s3() -> Iterator[None]:
    """Provide a sanitized S3 environment for the duration of the context.

    Fixed dummy credentials and a default region are placed in
    ``os.environ`` and the variables that could point at real
    infrastructure are removed, so that unit tests running against a mock
    S3 cannot accidentally talk to a real service. The environment is
    restored when the context exits.
    """
    dummy_settings = {
        "AWS_ACCESS_KEY_ID": "test-access-key",
        "AWS_SECRET_ACCESS_KEY": "test-secret-access-key",
        "AWS_DEFAULT_REGION": "us-east-1",
    }
    variables_to_drop = (
        "S3_ENDPOINT_URL",
        "AWS_SECURITY_TOKEN",
        "AWS_SESSION_TOKEN",
        "AWS_PROFILE",
        "AWS_SHARED_CREDENTIALS_FILE",
        "AWS_CONFIG_FILE",
    )
    with patch.dict(os.environ, dummy_settings) as environment:
        for name in variables_to_drop:
            environment.pop(name, None)
        # Forget any cached boto3 S3 clients. A client instantiated before
        # the moto mocks were installed would otherwise be reused and the
        # mocks would not take effect.
        _get_s3_client.cache_clear()
        yield

148 

149 

def getS3Client(profile: str | None = None) -> boto3.client:
    """Return an S3 client for AWS (the default) or a configured endpoint.

    Parameters
    ----------
    profile : `str`, optional
        The name of an S3 profile describing which S3 service to use.

    Returns
    -------
    s3client : `botocore.client.S3`
        A client of the S3 service.

    Raises
    ------
    ModuleNotFoundError
        Raised if boto3 or botocore could not be imported.

    Notes
    -----
    When a profile name is given, its configuration is taken from the
    environment variable ``LSST_RESOURCES_S3_PROFILE_<profile>`` if that
    variable exists. The profile name is case sensitive. The expected
    format of the value is ``https://<access key ID>:<secret
    key>@<s3 endpoint hostname>``. Slashes inside the access key ID or the
    secret key must be URI-encoded (replace "/" with "%2F").

    When profile is `None`, or the profile environment variable is not
    set, the configuration comes from the environment variable
    ``S3_ENDPOINT_URL``. Without that variable the default AWS endpoint is
    used.

    The access key ID and secret key are optional -- if they are left out
    they are looked up via the `AWS credentials file
    <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html>`_.

    Bucket name validation is disabled if the environment variable
    LSST_DISABLE_BUCKET_VALIDATION exists and has a value that is not
    empty, "0", "f", "n", or "false" (case-insensitive). Disabling the
    validation allows Ceph multi-tenancy colon separators to appear in
    bucket names.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")
    if botocore is None:
        raise ModuleNotFoundError("Could not find botocore. Are you sure it is installed?")

    connection = _get_s3_connection_parameters(profile)
    skip_validation = not _s3_should_validate_bucket()
    return _get_s3_client(connection, skip_validation)

194 

195 

def _s3_should_validate_bucket() -> bool:
    """Report whether bucket names should be validated.

    The decision is driven by the ``LSST_DISABLE_BUCKET_VALIDATION``
    environment variable. Validation stays enabled when the variable is
    unset, empty, or one of "0", "f", "n", "false" (case-insensitive).

    Returns
    -------
    validate : `bool`
        If `True` bucket names should be validated.
    """
    setting = os.environ.get("LSST_DISABLE_BUCKET_VALIDATION", "0")
    keeps_validation = re.compile(r"^(0|f|n|false)?$", re.IGNORECASE)
    return keeps_validation.search(setting) is not None

206 

207 

def _get_s3_connection_parameters(profile: str | None = None) -> _EndpointConfig:
    """Work out the connection details from the environment.

    The profile-specific variable ``LSST_RESOURCES_S3_PROFILE_<profile>``
    takes precedence when a profile is given and the variable is
    non-empty; otherwise ``S3_ENDPOINT_URL`` is consulted.

    Parameters
    ----------
    profile : `str`, optional
        The name of an S3 profile describing which S3 service to use.

    Returns
    -------
    config : `_EndpointConfig`
        All the information necessary to connect to the bucket.
    """
    from_profile = None
    if profile is not None:
        from_profile = os.environ.get(f"LSST_RESOURCES_S3_PROFILE_{profile}", None)

    # An unset or empty value falls through to the next source; the final
    # ``or None`` turns an empty S3_ENDPOINT_URL into `None`.
    endpoint = from_profile or os.environ.get("S3_ENDPOINT_URL", None) or None

    return _parse_endpoint_config(endpoint, profile)

231 

232 

def _s3_disable_bucket_validation(client: boto3.client) -> None:
    """Switch off bucket name validation for the given client.

    The ``validate_bucket_name`` handler is unregistered from the
    ``before-parameter-build.s3`` event of this client.

    Parameters
    ----------
    client : `boto3.client`
        The client to modify.
    """
    event_system = client.meta.events
    event_system.unregister("before-parameter-build.s3", validate_bucket_name)

245 

246 

@functools.lru_cache
def _get_s3_client(endpoint_config: _EndpointConfig, skip_validation: bool) -> boto3.client:
    """Create the S3 client for an endpoint, cached per argument pair.

    Parameters
    ----------
    endpoint_config : `_EndpointConfig`
        Endpoint URL, credentials and profile to connect with.
    skip_validation : `bool`
        If `True` the bucket name validation handler is removed from the
        new client.

    Returns
    -------
    client : `boto3.client`
        The S3 client.
    """
    # boto seems to assume it will always have at least 10 available.
    pool_size = max(_get_num_workers(), 10)
    client_options = botocore.config.Config(
        read_timeout=180,
        max_pool_connections=pool_size,
        retries={"mode": "adaptive", "max_attempts": 10},
    )

    s3_client = boto3.Session(profile_name=endpoint_config.profile).client(
        "s3",
        endpoint_url=endpoint_config.endpoint_url,
        aws_access_key_id=endpoint_config.access_key_id,
        aws_secret_access_key=endpoint_config.secret_access_key,
        config=client_options,
    )
    if skip_validation:
        _s3_disable_bucket_validation(s3_client)
    return s3_client

270 

271 

class _EndpointConfig(NamedTuple):
    """Connection details for an S3 endpoint.

    Every field defaults to `None`.
    """

    # URL of the S3 service, without any embedded credentials.
    endpoint_url: str | None = None
    # Credentials taken from the endpoint specification, if it had any.
    access_key_id: str | None = None
    secret_access_key: str | None = None
    # Profile name to hand to boto3.
    profile: str | None = None

277 

278 

def _parse_endpoint_config(endpoint: str | None, profile: str | None = None) -> _EndpointConfig:
    """Split an endpoint specification into URL and credentials.

    Parameters
    ----------
    endpoint : `str` or `None`
        Endpoint URL, optionally carrying ``<access key ID>:<secret key>@``
        in front of the host name. `None` or an empty string means that no
        endpoint was configured.
    profile : `str`, optional
        Profile name to record in the result.

    Returns
    -------
    config : `_EndpointConfig`
        The parsed configuration.

    Raises
    ------
    ValueError
        Raised if the credentials part of the URL does not consist of
        exactly two ":"-separated items.
    """
    if not endpoint:
        return _EndpointConfig(profile=profile)

    pieces = parse_url(endpoint)

    # Rebuild the URL without its username/password portion.
    bare_url = Url(scheme=pieces.scheme, host=pieces.host, port=pieces.port, path=pieces.path).url

    key_id = None
    secret = None
    if pieces.auth:
        credentials = pieces.auth.split(":")
        if len(credentials) != 2:
            raise ValueError("S3 access key and secret not in expected format.")
        key_id, secret = (urllib.parse.unquote(item) for item in credentials)

    # With both credentials in hand the configuration for the profile is
    # complete, so the profile is not passed on to boto3. boto3 raises an
    # exception for a profile that is missing from its configuration file,
    # whether or not it needs to read anything from it.
    have_credentials = key_id is not None and secret is not None

    return _EndpointConfig(
        endpoint_url=bare_url,
        access_key_id=key_id,
        secret_access_key=secret,
        profile=None if have_credentials else profile,
    )

311 

312 

def s3CheckFileExists(
    path: Location | ResourcePath | str,
    bucket: str | None = None,
    client: boto3.client | None = None,
) -> tuple[bool, int]:
    """Check whether a key exists in a bucket and report its size.

    Parameters
    ----------
    path : `Location`, `ResourcePath` or `str`
        Location or ResourcePath containing the bucket name and filepath.
    bucket : `str`, optional
        Name of the bucket in which to look. If provided, path will be
        assumed to be relative to the given bucket.
    client : `boto3.client`, optional
        S3 Client object to query, if not supplied boto3 will try to resolve
        the credentials in the order described in its manual_.

    Returns
    -------
    exists : `bool`
        True if key exists, False otherwise.
    size : `int`
        Size of the key, if key exists, in bytes, otherwise -1.

    Raises
    ------
    ModuleNotFoundError
        Raised if boto3 could not be imported.
    TypeError
        Raised if ``path`` is not one of the supported types.
    PermissionError
        Raised if the HEAD request is answered with a 403.
    _TooManyRequestsError
        Raised if the HEAD request is answered with a 429.

    Notes
    -----
    S3 Paths are sensitive to leading and trailing path separators.

    .. _manual: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/\
    configuration.html#configuring-credentials
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    if client is None:
        client = getS3Client()

    # Work out the bucket and the key to query.
    if isinstance(path, str):
        if bucket is None:
            parsed = ResourcePath(path)
            bucket = parsed.netloc
            key = parsed.relativeToPathRoot
        else:
            key = path
    elif isinstance(path, ResourcePath | Location):
        bucket = path.netloc if bucket is None else bucket
        key = path.relativeToPathRoot
    else:
        raise TypeError(f"Unsupported path type: {path!r}.")

    try:
        header = client.head_object(Bucket=bucket, Key=key)
    except client.exceptions.ClientError as err:
        status = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # A resource unreachable error means the key does not exist.
        if status == 404:
            return (False, -1)
        # head_object only answers 404 for a missing object when the user
        # has s3:ListBucket permission. Without list permission a 403 is
        # returned. In practical terms this generally means that the file
        # does not exist, but it could also mean the user lacks
        # s3:GetObject permission:
        # https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
        # The two cases do not seem to be distinguishable with certainty.
        if status == 403:
            raise PermissionError(
                "Forbidden HEAD operation error occurred. "
                "Verify s3:ListBucket and s3:GetObject "
                "permissions are granted for your IAM user. "
            ) from err
        if status == 429:
            # boto3, incorrectly, does not automatically retry with 429
            # so instead an explicit retry exception is raised for backoff.
            raise _TooManyRequestsError(str(err)) from err
        raise
    else:
        return (True, header["ContentLength"])

390 

391 

def bucketExists(bucketName: str, client: boto3.client | None = None) -> bool:
    """Report whether an S3 bucket of the given name exists.

    Parameters
    ----------
    bucketName : `str`
        Name of the S3 Bucket.
    client : `boto3.client`, optional
        S3 Client object to query, if not supplied one is obtained by
        calling `getS3Client`.

    Returns
    -------
    exists : `bool`
        True if it exists, False if no Bucket with specified parameters is
        found.

    Raises
    ------
    ModuleNotFoundError
        Raised if boto3 could not be imported.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    s3 = getS3Client() if client is None else client
    try:
        s3.get_bucket_location(Bucket=bucketName)
    except s3.exceptions.NoSuchBucket:
        return False
    else:
        return True

419 

420 

def translate_client_error(err: ClientError, uri: ResourcePath) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Nothing is raised, and `None` is returned, when the error is neither a
    429 nor a 404.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.
    uri : `ResourcePath`
        The URI of the resource that is resulting in the error.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404.
    """
    # ClientError includes the error code in the message but there is no
    # direct way to access it without looking inside the response.
    message = str(err)
    if "(429)" in message:
        raise _TooManyRequestsError(f"{err} when accessing {uri}") from err
    if "(404)" in message:
        # Some systems can generate this rather than NoSuchKey. A 404 means
        # the resource is missing, so the message must not suggest a
        # permission problem.
        raise FileNotFoundError(f"Resource not found: {uri}") from err