Coverage for python/lsst/resources/s3utils.py: 24%

94 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-02-04 02:38 -0800

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

# Public names exported by this module.
__all__ = (
    # S3 client construction and existence checks.
    "getS3Client",
    "s3CheckFileExists",
    "bucketExists",
    # Helpers that manage AWS credential environment variables.
    "setAwsEnvCredentials",
    "unsetAwsEnvCredentials",
    # Retry support: the backoff module (or its stand-in) and its settings.
    "backoff",
    "all_retryable_errors",
    "max_retry_time",
    "retryable_io_errors",
    "retryable_client_errors",
    "_TooManyRequestsException",
)

27 

28import functools 

29import os 

30from http.client import HTTPException, ImproperConnectionState 

31from types import ModuleType 

32from typing import Any, Callable, Optional, Tuple, Union, cast 

33 

34from botocore.exceptions import ClientError 

35from urllib3.exceptions import HTTPError, RequestError 

36 

37try: 

38 import boto3 

39except ImportError: 

40 boto3 = None 

41 

42try: 

43 import botocore 

44except ImportError: 

45 botocore = None 

46 

47 

48from ._resourcePath import ResourcePath 

49from .location import Location 

50 

51# https://pypi.org/project/backoff/ 

try:
    import backoff
except ImportError:

    class Backoff:
        """No-op stand-in used when the ``backoff`` package is not installed.

        Both methods return their first argument unchanged. As a result
        ``backoff.on_exception(backoff.expo, ...)`` evaluates to ``expo``,
        and applying that as a decorator returns the decorated function
        untouched, so no retries happen.
        """

        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            # Any additional arguments are accepted and ignored.
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            # Any additional arguments are accepted and ignored.
            return func

    # The cast only informs type checkers; at runtime this is the class.
    backoff = cast(ModuleType, Backoff)

66 

67 

class _TooManyRequestsException(Exception):
    """Private exception that marks an HTTP 429 response as retryable.

    botocore does not handle a 429 error itself and instead issues a
    generic ClientError, so this dedicated type can be raised in its
    place and listed among the retryable errors.
    """

76 

77 

# Settings for the "backoff" retry decorators. These retries are a
# belt-and-suspenders layer on top of the retries built into Boto3, and
# account for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # From http.client.
    ImproperConnectionState,
    HTTPException,
    # From urllib3.exceptions.
    RequestError,
    HTTPError,
    # Python built-ins.
    TimeoutError,
    ConnectionError,
    # Private to this module.
    _TooManyRequestsException,
)

# A client error can also be NoSuchKey, in which case a retry may not be
# the right thing to do. This needs more consideration before it is used.
retryable_client_errors = (
    # From botocore.exceptions.
    ClientError,
    # Python built-ins.
    PermissionError,
)


# One convenient name for everything that should be retried. For now the
# client errors are not part of it.
all_retryable_errors = retryable_io_errors
max_retry_time = 60

109 

110 

def getS3Client() -> boto3.client:
    """Build an S3 client for AWS (default) or for a configured endpoint.

    Returns
    -------
    s3client : `botocore.client.S3`
        A client of the S3 service.

    Raises
    ------
    ModuleNotFoundError
        Raised if ``boto3`` or ``botocore`` could not be imported.

    Notes
    -----
    The endpoint URL is taken from the environment variable
    S3_ENDPOINT_URL. If it is unset or empty, the default AWS one is used.
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")
    if botocore is None:
        raise ModuleNotFoundError("Could not find botocore. Are you sure it is installed?")

    # An empty string is folded into None so it is treated like an unset
    # variable.
    return _get_s3_client(os.environ.get("S3_ENDPOINT_URL") or None)

134 

135 

@functools.lru_cache()
def _get_s3_client(endpoint: Optional[str]) -> boto3.client:
    """Create the S3 client for an endpoint, caching it for reuse.

    Parameters
    ----------
    endpoint : `str` or `None`
        Endpoint URL handed to ``boto3.client`` as ``endpoint_url``.
        `getS3Client` passes `None` when S3_ENDPOINT_URL is unset or
        empty, so `None` is an expected value here.

    Returns
    -------
    s3client : `botocore.client.S3`
        A client of the S3 service.
    """
    # The lru_cache decorator keys on ``endpoint``, so repeated calls with
    # the same value reuse the client instead of building a new one.
    config = botocore.config.Config(read_timeout=180, retries={"mode": "adaptive", "max_attempts": 10})

    return boto3.client("s3", endpoint_url=endpoint, config=config)

142 

143 

def s3CheckFileExists(
    path: Union[Location, ResourcePath, str],
    bucket: Optional[str] = None,
    client: Optional[boto3.client] = None,
) -> Tuple[bool, int]:
    """Report whether a key exists in a bucket, and how large it is.

    Parameters
    ----------
    path : `Location`, `ResourcePath` or `str`
        Location or ResourcePath containing the bucket name and filepath.
    bucket : `str`, optional
        Name of the bucket in which to look. If provided, a `str` path is
        taken to be relative to the given bucket. For `Location` and
        `ResourcePath` inputs the bucket always comes from the path itself.
    client : `boto3.client`, optional
        S3 Client object to query, if not supplied boto3 will try to resolve
        the credentials as in order described in its manual_.

    Returns
    -------
    exists : `bool`
        True if key exists, False otherwise.
    size : `int`
        Size of the key, if key exists, in bytes, otherwise -1.

    Raises
    ------
    ModuleNotFoundError
        Raised if ``boto3`` could not be imported.
    TypeError
        Raised if ``path`` is not one of the supported types.
    PermissionError
        Raised if the HEAD request is answered with HTTP status 403.

    Notes
    -----
    S3 Paths are sensitive to leading and trailing path separators.

    .. _manual: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/\
    configuration.html#configuring-credentials
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    s3 = getS3Client() if client is None else client

    # Work out which bucket and key to query from the supplied path.
    if isinstance(path, str):
        if bucket is None:
            parsed = ResourcePath(path)
            bucket, key = parsed.netloc, parsed.relativeToPathRoot
        else:
            key = path
    elif isinstance(path, (ResourcePath, Location)):
        bucket, key = path.netloc, path.relativeToPathRoot
    else:
        raise TypeError(f"Unsupported path type: {path!r}.")

    try:
        head = s3.head_object(Bucket=bucket, Key=key)
        return (True, head["ContentLength"])
    except s3.exceptions.ClientError as err:
        status = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # A 404 means the key does not exist.
        if status == 404:
            return (False, -1)
        # head_object only answers 404 for a missing object when the user
        # has s3:ListBucket permission; without it a 403 comes back. In
        # practice that usually means the file does not exist, but it can
        # also mean the user lacks s3:GetObject permission, and the two
        # cases cannot be told apart with certainty:
        # https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
        if status == 403:
            raise PermissionError(
                "Forbidden HEAD operation error occured. "
                "Verify s3:ListBucket and s3:GetObject "
                "permissions are granted for your IAM user. "
            ) from err
        raise

215 

216 

def bucketExists(bucketName: str, client: Optional[boto3.client] = None) -> bool:
    """Tell whether an S3 bucket with the given name exists.

    Parameters
    ----------
    bucketName : `str`
        Name of the S3 Bucket
    client : `boto3.client`, optional
        S3 Client object to query, if not supplied boto3 will try to resolve
        the credentials as in order described in its manual_.

    Returns
    -------
    exists : `bool`
        True if it exists, False if no Bucket with specified parameters is
        found.

    Raises
    ------
    ModuleNotFoundError
        Raised if ``boto3`` could not be imported.

    .. _manual: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/\
    configuration.html#configuring-credentials
    """
    if boto3 is None:
        raise ModuleNotFoundError("Could not find boto3. Are you sure it is installed?")

    s3 = client if client is not None else getS3Client()
    try:
        # Only the success or failure of the call matters; the returned
        # location is discarded.
        s3.get_bucket_location(Bucket=bucketName)
    except s3.exceptions.NoSuchBucket:
        return False
    return True

247 

248 

def setAwsEnvCredentials(
    accessKeyId: str = "dummyAccessKeyId", secretAccessKey: str = "dummySecretAccessKey"
) -> bool:
    """Populate the AWS credential environment variables if needed.

    Parameters
    ----------
    accessKeyId : `str`
        Value given to AWS_ACCESS_KEY_ID environmental variable. Defaults to
        `dummyAccessKeyId`.
    secretAccessKey : `str`
        Value given to AWS_SECRET_ACCESS_KEY environmental variable. Defaults
        to `dummySecretAccessKey`.

    Returns
    -------
    setEnvCredentials : `bool`
        True when environmental variables were set, False otherwise.

    Notes
    -----
    If either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY is missing, both
    are overwritten so that the pair stays consistent. If both are already
    present nothing is changed.
    """
    credential_vars = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if all(name in os.environ for name in credential_vars):
        # Both are already defined; leave the environment alone.
        return False

    os.environ["AWS_ACCESS_KEY_ID"] = accessKeyId
    os.environ["AWS_SECRET_ACCESS_KEY"] = secretAccessKey
    return True

278 

279 

def unsetAwsEnvCredentials() -> None:
    """Remove the AWS credential environment variables.

    Deletes AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY from the
    environment. A variable that is not set is silently skipped.
    """
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
        # The default of None makes a missing variable a no-op.
        os.environ.pop(name, None)