Coverage for python/lsst/resources/s3.py: 86%

197 statements  

coverage.py v6.5.0, created at 2023-01-05 10:31 +0000

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import logging
import re
import tempfile
import threading

__all__ = ("S3ResourcePath",)

from http.client import HTTPException, ImproperConnectionState
from types import ModuleType
from typing import IO, TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this
from urllib3.exceptions import HTTPError, RequestError

from ._resourcePath import ResourcePath
from .s3utils import bucketExists, getS3Client, s3CheckFileExists

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = cast(ModuleType, Backoff)
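
# The fallback above lets this module import cleanly when the optional
# ``backoff`` package is unavailable: ``Backoff.on_exception(Backoff.expo,
# ...)`` returns ``Backoff.expo``, and applying that as a decorator hands
# back the wrapped method unchanged, so every ``@backoff.on_exception(...)``
# used below degrades to a no-op (no retries) instead of an ImportError.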



class _TooManyRequestsException(Exception):
    """Private exception that can be used for 429 retry.

    botocore refuses to deal with the 429 error itself, so it issues a
    generic ClientError.
    """

    pass


# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders on top of the retries built into Boto3, to account
# for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
    # private
    _TooManyRequestsException,
)

# ClientError can include NoSuchKey, so retrying may not be the right
# thing. This may require more consideration if it is to be used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into one convenient package. For now, client errors
# are not included.
all_retryable_errors = retryable_io_errors
max_retry_time = 60
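
# The tuples above feed the ``@backoff.on_exception(backoff.expo, ...)``
# decorators used throughout this module: a matching exception triggers a
# retry with exponential backoff until ``max_retry_time`` seconds have
# elapsed (backoff's ``max_time`` argument is measured in seconds).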



log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
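
# ``ProgressPercentage`` instances match the callable protocol boto3 expects
# for its transfer ``Callback`` argument: boto3 invokes the callback with the
# number of bytes moved since the previous call. A minimal sketch (``client``
# being a boto3 S3 client such as getS3Client() returns; the file path and
# bucket name are hypothetical):
#
#     local = ResourcePath("file:///tmp/data.bin")
#     progress = ProgressPercentage(local, msg="Uploading:")
#     client.upload_file(local.ospath, "some-bucket", "some/key", Callback=progress)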


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsException
        Raised if the `ClientError` looks like a 429 retry request.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message
        # but no direct way to access it without looking inside the
        # response.
        raise _TooManyRequestsException(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
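
# Callers invoke _translate_client_error() from inside an ``except
# ClientError`` handler and then re-raise: a 429 therefore surfaces as
# _TooManyRequestsException (which the backoff decorators below treat as
# retryable), a 404 becomes FileNotFoundError, and any other ClientError
# propagates unchanged.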



class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again; the response is always
        # HTTP 204 (No Content).
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body
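
    # Note on the ranged read above: HTTP byte ranges are inclusive at both
    # ends, so for size=100 the header sent is ``Range: bytes=0-99``, which
    # fetches exactly the first 100 bytes of the object.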


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
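
    # Note: with ``overwrite=False`` the exists()/put_object pair above is a
    # check-then-act sequence, not an atomic operation, so a concurrent
    # writer could still create the key between the check and the write.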


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Cannot create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
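
    # S3 has no real directories; the "directory" written above is just a
    # zero-byte object whose key ends in "/" (``dirLike`` paths always do).
    # For example, mkdir() on s3://some-bucket/a/b/ stores an empty object
    # under the key "a/b/" (the bucket name here is hypothetical).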


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: Optional[ProgressPercentage]) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise


    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                progress = (
                    ProgressPercentage(self, msg="Downloading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                self._download_file(tmpFile, progress)
        return tmpFile.name, True


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: Optional[ProgressPercentage]) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
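
    # copy_object performs the copy server-side within S3, so no object data
    # passes through this client. A single copy_object request is capped at
    # 5 GB per the AWS documentation; objects larger than that would need a
    # multipart copy (e.g. boto3's managed ``copy``).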


    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: copy, move, auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI, so we can use a direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
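
    # A minimal usage sketch of transfer_from (bucket and file names are
    # hypothetical):
    #
    #     dest = ResourcePath("s3://some-bucket/data/file.fits")
    #     dest.transfer_from(ResourcePath("file:///tmp/file.fits"), transfer="copy")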


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Cannot walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means that it effectively
        # does not exist, and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
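
    # A minimal usage sketch of walk (bucket name hypothetical); like
    # os.walk it yields one (dirpath, dirnames, filenames) triple per
    # "directory", here filtered to FITS files:
    #
    #     root = ResourcePath("s3://some-bucket/data/")
    #     for dirpath, dirnames, filenames in root.walk(r"\.fits$"):
    #         for name in filenames:
    #             print(dirpath.join(name))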