Coverage for python/lsst/resources/s3utils.py: 26%

158 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-03-13 09:59 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ( 

15 "clean_test_environment", 

16 "getS3Client", 

17 "s3CheckFileExists", 

18 "bucketExists", 

19 "setAwsEnvCredentials", 

20 "unsetAwsEnvCredentials", 

21 "backoff", 

22 "all_retryable_errors", 

23 "max_retry_time", 

24 "retryable_io_errors", 

25 "retryable_client_errors", 

26 "_TooManyRequestsError", 

27 "clean_test_environment_for_s3", 

28) 

29 

30import functools 

31import os 

32import re 

33import urllib.parse 

34from collections.abc import Callable, Iterator 

35from contextlib import contextmanager 

36from http.client import HTTPException, ImproperConnectionState 

37from types import ModuleType 

38from typing import TYPE_CHECKING, Any, NamedTuple, cast 

39from unittest.mock import patch 

40 

41from botocore.exceptions import ClientError 

42from botocore.handlers import validate_bucket_name 

43from deprecated.sphinx import deprecated 

44from urllib3.exceptions import HTTPError, RequestError 

45from urllib3.util import Url, parse_url 

46 

47if TYPE_CHECKING: 

48 from unittest import TestCase 

49 

50 

51try: 

52 import boto3 

53except ImportError: 

54 boto3 = None 

55 

56try: 

57 import botocore 

58except ImportError: 

59 botocore = None 

60 

61 

62from ._resourcePath import ResourcePath 

63from .location import Location 

64 

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        """Do-nothing stand-in used when the ``backoff`` package is missing.

        Both methods hand back their first positional argument untouched and
        ignore everything else.  As a consequence an expression of the form
        ``on_exception(expo, ...)(function)`` evaluates to ``function``
        itself: ``on_exception`` returns ``expo``, and ``expo`` in turn
        returns the function it is called with.  No retrying takes place.
        """

        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            """Return ``func`` unchanged; all other arguments are ignored."""
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            """Return ``func`` unchanged; all other arguments are ignored."""
            return func

    # Publish the stand-in under the name of the real module so the rest of
    # the code can refer to ``backoff`` without checking for availability.
    backoff = cast(ModuleType, Backoff)

82 

83 

class _TooManyRequestsError(Exception):
    """Private marker exception for an HTTP 429 response that may be retried.

    botocore does not handle a 429 error itself and instead reports it as a
    generic ``ClientError``; raising this dedicated type gives such a
    response an exception class of its own.
    """

92 

93 

# Settings for the "backoff" retry decorators.  These retries are a
# belt-and-suspenders measure on top of the retries built into Boto3, to
# account for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
    # private
    _TooManyRequestsError,
)

# A client error can be something like NoSuchKey, in which case retrying may
# not be the right thing to do.  This needs more consideration before these
# are used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)


# Single name bundling every error considered worth retrying.  For now the
# client errors are deliberately not included.
all_retryable_errors = retryable_io_errors
# Upper limit for retrying.
# NOTE(review): presumably expressed in seconds -- confirm against callers.
max_retry_time = 60

125 

126 

@deprecated(
    reason="This has been replaced by a new function, clean_test_environment_for_s3()."
    " Will be removed after v26.2023.5000",
    version="26.2023.5000",
    category=FutureWarning,
)
def clean_test_environment(testcase: TestCase) -> None:
    """Blank out ``S3_ENDPOINT_URL`` and put it back when the test finishes.

    Parameters
    ----------
    testcase : `unittest.TestCase`
        The test being run; the restoring step is registered on it as a
        cleanup function.

    Notes
    -----
    If ``S3_ENDPOINT_URL`` is unset or empty the environment is left alone
    and no cleanup function is registered.
    """
    saved_endpoint = os.environ.get("S3_ENDPOINT_URL")

    if saved_endpoint:
        # The variable is set to the empty string rather than removed.
        os.environ["S3_ENDPOINT_URL"] = ""

        def restore_endpoint() -> None:
            os.environ["S3_ENDPOINT_URL"] = saved_endpoint

        testcase.addCleanup(restore_endpoint)

152 

153 

@contextmanager
def clean_test_environment_for_s3() -> Iterator[None]:
    """Reset the S3 environment so that unit tests using a mock S3 cannot
    accidentally reference real infrastructure.

    While the context is active ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` hold dummy values and the variables that could
    select a real endpoint, token, profile or configuration file are removed
    from `os.environ`.  The previous environment is restored on exit.

    Yields
    ------
    `None`
        Nothing is yielded; the context is entered purely for its effect on
        the environment and on the cached S3 clients.

    Notes
    -----
    The cache of S3 clients is emptied both on entry and on exit.  Clearing
    on entry avoids reusing a client that was instantiated before the moto
    mocks were installed, which would prevent the mocks from taking effect.
    Clearing on exit prevents a client that was created inside the context
    from being handed out afterwards to code running with the restored
    environment.
    """
    mock_credentials = {
        "AWS_ACCESS_KEY_ID": "test-access-key",
        "AWS_SECRET_ACCESS_KEY": "test-secret-access-key",
    }
    variables_to_remove = (
        "S3_ENDPOINT_URL",
        "AWS_SECURITY_TOKEN",
        "AWS_SESSION_TOKEN",
        "AWS_PROFILE",
        "AWS_SHARED_CREDENTIALS_FILE",
        "AWS_CONFIG_FILE",
    )
    with patch.dict(os.environ, mock_credentials) as patched_environ:
        for var in variables_to_remove:
            patched_environ.pop(var, None)
        # Clear the cached boto3 S3 client instances.
        # This helps us avoid a potential situation where the client could be
        # instantiated before moto mocks are installed, which would prevent
        # the mocks from taking effect.
        _get_s3_client.cache_clear()
        try:
            yield
        finally:
            # Clients are cached by (endpoint, profile, skip_validation), so a
            # client built under the mock environment would otherwise be
            # returned again for the same key once the real environment is
            # back.  Such a client presumably still carries the dummy
            # credentials it was created with, so drop it here.
            _get_s3_client.cache_clear()

181 

182 

def getS3Client(profile: str | None = None) -> boto3.client:
    """Return an S3 client for AWS (the default) or for a configured endpoint.

    Parameters
    ----------
    profile : `str`, optional
        The name of an S3 profile describing which S3 service to use.

    Returns
    -------
    s3client : `botocore.client.S3`
        A client of the S3 service.

    Raises
    ------
    ModuleNotFoundError
        Raised if ``boto3`` or ``botocore`` could not be imported.

    Notes
    -----
    When an explicit profile name is given, its configuration is read from
    the environment variable ``LSST_RESOURCES_S3_PROFILE_<profile>`` if that
    variable exists. The profile name is case sensitive. The configuration
    has the form ``https://<access key ID>:<secret key>@<s3 endpoint
    hostname>``. Slashes inside the access key ID or the secret key must be
    URI-encoded (replace "/" with "%2F").

    When profile is `None`, or the profile variable is unset or empty, the
    configuration is taken from the environment variable
    ``S3_ENDPOINT_URL``. If that is not specified either, the default AWS
    endpoint is used.

    The access key ID and secret key are optional -- if not specified, they
    will be looked up via the `AWS credentials file
    <https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html>`_.

    If the environment variable LSST_DISABLE_BUCKET_VALIDATION exists
    and has a value that is not empty, "0", "f", "n", or "false"
    (case-insensitive), then bucket name validation is disabled. Disabling
    it allows Ceph multi-tenancy colon separators to appear in bucket names.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")
    if botocore is None:
        raise ModuleNotFoundError("Could not find botocore. Are you sure it is installed?")

    profile_endpoint = None
    if profile is not None:
        profile_endpoint = os.environ.get(f"LSST_RESOURCES_S3_PROFILE_{profile}")
    # Prefer the profile-specific setting, then the global one; an empty
    # string from either source is treated the same as "not set".
    endpoint = profile_endpoint or os.environ.get("S3_ENDPOINT_URL") or None

    validation_flag = os.environ.get("LSST_DISABLE_BUCKET_VALIDATION", "0")
    # The pattern matches exactly the values that leave validation enabled.
    keep_validation = re.search(r"^(0|f|n|false)?$", validation_flag, re.I) is not None

    return _get_s3_client(endpoint, profile, not keep_validation)

237 

238 

@functools.lru_cache
def _get_s3_client(endpoint: str | None, profile: str | None, skip_validation: bool) -> boto3.client:
    """Create the S3 client for an endpoint, caching it per argument set.

    Parameters
    ----------
    endpoint : `str` or `None`
        Endpoint configuration string, handed to `_parse_endpoint_config`.
    profile : `str` or `None`
        Profile name for the boto3 session. It is not passed on when the
        endpoint string itself carries both an access key ID and a secret.
    skip_validation : `bool`
        If `True`, the bucket name validation handler is unregistered from
        the new client.

    Returns
    -------
    client : `boto3.client`
        The S3 client.
    """
    config = botocore.config.Config(read_timeout=180, retries={"mode": "adaptive", "max_attempts": 10})

    parsed = _parse_endpoint_config(endpoint)
    key_id = parsed.access_key_id
    secret = parsed.secret_access_key

    # When the credentials came with the endpoint there is nothing left to
    # read from the profile, so do not pass the profile to boto3. boto3 will
    # raise an exception if the profile is not defined in its configuration
    # file, whether or not it needs to read the configuration from it.
    session_profile = None if (key_id is not None and secret is not None) else profile
    session = boto3.Session(profile_name=session_profile)

    s3 = session.client(
        "s3",
        endpoint_url=parsed.endpoint_url,
        aws_access_key_id=key_id,
        aws_secret_access_key=secret,
        config=config,
    )
    if skip_validation:
        s3.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)
    return s3

264 

265 

class _EndpointConfig(NamedTuple):
    """Pieces extracted from an S3 endpoint configuration string.

    Every field defaults to `None`.
    """

    # Endpoint URL with the username/password portion removed.
    endpoint_url: str | None = None
    # Access key ID taken from the URL, if one was given.
    access_key_id: str | None = None
    # Secret access key taken from the URL, if one was given.
    secret_access_key: str | None = None

270 

271 

def _parse_endpoint_config(endpoint: str | None) -> _EndpointConfig:
    """Split an endpoint configuration string into URL and credentials.

    Parameters
    ----------
    endpoint : `str` or `None`
        Endpoint URL, optionally carrying ``<access key ID>:<secret key>@``
        in front of the host name.

    Returns
    -------
    config : `_EndpointConfig`
        The URL without its username/password portion, plus the URI-decoded
        access key ID and secret key (both `None` when the URL has no such
        portion). An all-`None` result is returned for a missing or empty
        ``endpoint``.

    Raises
    ------
    ValueError
        Raised if the username/password portion does not consist of exactly
        two colon-separated parts.
    """
    if not endpoint:
        return _EndpointConfig()

    url_parts = parse_url(endpoint)

    # Strip the username/password portion of the URL from the result.
    bare_url = Url(
        scheme=url_parts.scheme, host=url_parts.host, port=url_parts.port, path=url_parts.path
    ).url

    if not url_parts.auth:
        return _EndpointConfig(endpoint_url=bare_url, access_key_id=None, secret_access_key=None)

    credentials = url_parts.auth.split(":")
    if len(credentials) != 2:
        raise ValueError("S3 access key and secret not in expected format.")
    # Undo the URI-encoding of both parts only after they have been split.
    key_id, secret = (urllib.parse.unquote(part) for part in credentials)

    return _EndpointConfig(endpoint_url=bare_url, access_key_id=key_id, secret_access_key=secret)

294 

295 

def s3CheckFileExists(
    path: Location | ResourcePath | str,
    bucket: str | None = None,
    client: boto3.client | None = None,
) -> tuple[bool, int]:
    """Report whether a file exists in a bucket, together with its size.

    Parameters
    ----------
    path : `Location`, `ResourcePath` or `str`
        Location or ResourcePath containing the bucket name and filepath.
    bucket : `str`, optional
        Name of the bucket in which to look. If provided, a `str` path is
        used as the key as given, and for the other path types this bucket
        is used in place of the one contained in the path.
    client : `boto3.client`, optional
        S3 Client object to query. If not supplied, one is obtained from
        `getS3Client` and boto3 will try to resolve the credentials in the
        order described in its manual_.

    Returns
    -------
    exists : `bool`
        True if key exists, False otherwise.
    size : `int`
        Size of the key, if key exists, in bytes, otherwise -1.

    Raises
    ------
    ModuleNotFoundError
        Raised if ``boto3`` could not be imported.
    TypeError
        Raised if ``path`` is of an unsupported type.
    PermissionError
        Raised if the HEAD request is answered with HTTP status 403.
    _TooManyRequestsError
        Raised if the HEAD request is answered with HTTP status 429.

    Notes
    -----
    S3 Paths are sensitive to leading and trailing path separators.

    Client errors with any other HTTP status are re-raised unchanged.

    .. _manual: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/\
    configuration.html#configuring-credentials
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    if client is None:
        client = getS3Client()

    # Work out which bucket and key to ask about.
    if isinstance(path, str):
        if bucket is None:
            parsed = ResourcePath(path)
            bucket = parsed.netloc
            key = parsed.relativeToPathRoot
        else:
            key = path
    elif isinstance(path, ResourcePath | Location):
        if bucket is None:
            bucket = path.netloc
        key = path.relativeToPathRoot
    else:
        raise TypeError(f"Unsupported path type: {path!r}.")

    try:
        head = client.head_object(Bucket=bucket, Key=key)
    except client.exceptions.ClientError as err:
        status = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # A "not found" response means the key does not exist.
        if status == 404:
            return (False, -1)
        # head_object returns 404 for a missing object only when the user has
        # the s3:ListBucket permission. Without list permission a 403 is
        # returned instead. In practical terms this generally means that the
        # file does not exist, but it could also mean the user lacks the
        # s3:GetObject permission:
        # https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
        # It does not seem possible to tell the two cases apart with
        # certainty.
        if status == 403:
            raise PermissionError(
                "Forbidden HEAD operation error occurred. "
                "Verify s3:ListBucket and s3:GetObject "
                "permissions are granted for your IAM user. "
            ) from err
        if status == 429:
            # boto3, incorrectly, does not automatically retry with 429
            # so instead we raise an explicit retry exception for backoff.
            raise _TooManyRequestsError(str(err)) from err
        raise
    else:
        return (True, head["ContentLength"])

373 

374 

def bucketExists(bucketName: str, client: boto3.client | None = None) -> bool:
    """Check whether the S3 bucket with the given name actually exists.

    Parameters
    ----------
    bucketName : `str`
        Name of the S3 Bucket.
    client : `boto3.client`, optional
        S3 Client object to query. If not supplied, a client is obtained by
        calling `getS3Client`.

    Returns
    -------
    exists : `bool`
        True if it exists, False if no Bucket with specified parameters is
        found.

    Raises
    ------
    ModuleNotFoundError
        Raised if ``boto3`` could not be imported.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    s3 = getS3Client() if client is None else client
    try:
        # Only the success or failure of the call matters, not its result.
        s3.get_bucket_location(Bucket=bucketName)
    except s3.exceptions.NoSuchBucket:
        return False
    return True

402 

403 

@deprecated(
    reason="This function could accidentally leave real credentials in the environment during testing."
    " A new function, clean_test_environment_for_s3(), can be used to set up mock credentials."
    " Will be removed after v26.2023.5000",
    version="26.2023.5000",
    category=FutureWarning,
)
def setAwsEnvCredentials(
    accessKeyId: str = "dummyAccessKeyId", secretAccessKey: str = "dummySecretAccessKey"
) -> bool:
    """Set the AWS credential environment variables if they are incomplete.

    Parameters
    ----------
    accessKeyId : `str`
        Value given to AWS_ACCESS_KEY_ID environmental variable. Defaults to
        `dummyAccessKeyId`.
    secretAccessKey : `str`
        Value given to AWS_SECRET_ACCESS_KEY environmental variable. Defaults
        to `dummySecretAccessKey`.

    Returns
    -------
    setEnvCredentials : `bool`
        True when environmental variables were set, False otherwise.

    Notes
    -----
    If either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY is not set, both
    values are overwritten to ensure that the values are consistent. If both
    are already present, the environment is left untouched.
    """
    if "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ:
        return False

    os.environ["AWS_ACCESS_KEY_ID"] = accessKeyId
    os.environ["AWS_SECRET_ACCESS_KEY"] = secretAccessKey
    return True

440 

441 

@deprecated(
    reason="This has been replaced by a new function, clean_test_environment_for_s3()."
    " Will be removed after v26.2023.5000",
    version="26.2023.5000",
    category=FutureWarning,
)
def unsetAwsEnvCredentials() -> None:
    """Remove the AWS credential environment variables.

    Deletes AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY from the
    environment. A variable that is not set is silently skipped.
    """
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
        os.environ.pop(name, None)