Coverage for python/lsst/resources/s3.py: 85%

195 statements  

  1  # This file is part of lsst-resources.
  2  #
  3  # Developed for the LSST Data Management System.
  4  # This product includes software developed by the LSST Project
  5  # (https://www.lsst.org).
  6  # See the COPYRIGHT file at the top-level directory of this distribution
  7  # for details of code ownership.
  8  #
  9  # Use of this source code is governed by a 3-clause BSD-style
 10  # license that can be found in the LICENSE file.
 11
 12  from __future__ import annotations
 13
 14  import logging
 15  import re
 16  import tempfile
 17  import threading
 18
 19  __all__ = ("S3ResourcePath",)
 20
 21  from http.client import HTTPException, ImproperConnectionState
 22  from typing import TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union
 23
 24  from botocore.exceptions import ClientError
 25  from lsst.utils.timer import time_this
 26  from urllib3.exceptions import HTTPError, RequestError
 27
 28  from ._resourcePath import ResourcePath
 29  from .s3utils import bucketExists, getS3Client, s3CheckFileExists
 30
 31  if TYPE_CHECKING:  # coverage: 31 ↛ 32 (condition never true)
 32      try:
 33          import boto3
 34      except ImportError:
 35          pass
 36      from .utils import TransactionProtocol
 37
 38  # https://pypi.org/project/backoff/
 39  try:
 40      import backoff
 41  except ImportError:
 42
 43      class Backoff:
 44          @staticmethod
 45          def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
 46              return func
 47
 48          @staticmethod
 49          def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
 50              return func
 51
 52      backoff = Backoff
 53
 54
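
When the optional backoff package is missing, the Backoff stub above makes the
retry decorators collapse to the identity: on_exception returns its first
argument (the stub expo), and applying that to the decorated function hands the
function back unchanged. A minimal sketch of the equivalence, using a
hypothetical fetch() function:

    # fetch() is hypothetical. With the real backoff package this retries on
    # ConnectionError with exponential backoff for up to 60 seconds; with the
    # stub above the decorator chain simply returns fetch unchanged.
    @backoff.on_exception(backoff.expo, ConnectionError, max_time=60)
    def fetch() -> bytes:
        ...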

 55  class _TooManyRequestsException(Exception):
 56      """Private exception that can be used for 429 retry.
 57
 58      botocore refuses to handle the 429 error itself, so it issues a
 59      generic ClientError.
 60      """
 61
 62      pass
 63
 64
 65  # Settings for the "backoff" retry decorators. These retries are belt-and-
 66  # suspenders along with the retries built into Boto3, to account for
 67  # semantic differences in errors between S3-like providers.
 68  retryable_io_errors = (
 69      # http.client
 70      ImproperConnectionState,
 71      HTTPException,
 72      # urllib3.exceptions
 73      RequestError,
 74      HTTPError,
 75      # built-ins
 76      TimeoutError,
 77      ConnectionError,
 78      # private
 79      _TooManyRequestsException,
 80  )
 81
 82  # ClientError can include NoSuchKey, so a retry may not be the right
 83  # thing. This may require more consideration if it is to be used.
 84  retryable_client_errors = (
 85      # botocore.exceptions
 86      ClientError,
 87      # built-ins
 88      PermissionError,
 89  )
 90
 91  # Combine all errors into an easy package. For now client errors
 92  # are not included.
 93  all_retryable_errors = retryable_io_errors
 94  max_retry_time = 60
 95
 96
 97  log = logging.getLogger(__name__)
 98
 99
100  class ProgressPercentage:
101      """Progress bar for S3 file uploads."""
102
103      log_level = logging.DEBUG
104      """Default log level to use when issuing a message."""
105
106      def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
107          self._filename = file
108          self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
109          self._size = file.size()
110          self._seen_so_far = 0
111          self._lock = threading.Lock()
112          self._msg = msg
113
114      def __call__(self, bytes_amount: int) -> None:
115          # To simplify, assume this is hooked up to a single filename.
116          with self._lock:
117              self._seen_so_far += bytes_amount
118              percentage = (100 * self._seen_so_far) // self._size
119              log.log(
120                  self.log_level,
121                  "%s %s %s / %s (%s%%)",
122                  self._msg,
123                  self._file_for_msg,
124                  self._seen_so_far,
125                  self._size,
126                  percentage,
127              )
128
129
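
A ProgressPercentage instance is a callable that boto3's transfer machinery
invokes with the number of bytes moved in each chunk; the lock matters because
the transfer manager may issue callbacks from several threads. A hedged usage
sketch (the file, bucket, and key names are hypothetical):

    local = ResourcePath("/tmp/example.fits")  # hypothetical local file
    progress = ProgressPercentage(local, msg="Uploading:")
    getS3Client().upload_file(local.ospath, "some-bucket", "some/key.fits", Callback=progress)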

130  def _translate_client_error(err: ClientError) -> None:
131      """Translate a ClientError into a specialist error if relevant.
132
133      Parameters
134      ----------
135      err : `ClientError`
136          Exception to translate.
137
138      Raises
139      ------
140      _TooManyRequestsException
141          Raised if the `ClientError` looks like a 429 retry request.
142      """
143      if "(429)" in str(err):  # coverage: 143 ↛ 147 (condition never true)
144          # ClientError includes the error code in the message
145          # but there is no direct way to access it without looking
146          # inside the response.
147          raise _TooManyRequestsException(str(err)) from err
148      elif "(404)" in str(err):  # coverage: 148 ↛ exit (condition never false)
149          # Some systems can generate this rather than NoSuchKey.
150          raise FileNotFoundError(f"Resource not found: {err}") from err
151
152
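
The comment at lines 144-146 notes that the status code can only be reached by
looking inside the response. For reference, a sketch of that lookup on a
botocore ClientError; this module deliberately sticks to the message-based
check, which behaves the same across S3-like providers:

    def _status_code(err: ClientError) -> Optional[int]:
        # botocore attaches the parsed error response to the exception;
        # the HTTP status is stored under ResponseMetadata.
        return err.response.get("ResponseMetadata", {}).get("HTTPStatusCode")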

153  class S3ResourcePath(ResourcePath):
154      """S3 URI resource path implementation class."""
155
156      @property
157      def client(self) -> boto3.client:
158          """Client object to address remote resource."""
159          # Defer import for circular dependencies
160          return getS3Client()
161
162      @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
163      def exists(self) -> bool:
164          """Check that the S3 resource exists."""
165          if self.is_root:
166              # Only check for the bucket since the path is irrelevant.
167              return bucketExists(self.netloc)
168          exists, _ = s3CheckFileExists(self, client=self.client)
169          return exists
170
171      @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
172      def size(self) -> int:
173          """Return the size of the resource in bytes."""
174          if self.dirLike:
175              return 0
176          exists, sz = s3CheckFileExists(self, client=self.client)
177          if not exists:
178              raise FileNotFoundError(f"Resource {self} does not exist")
179          return sz
180
181      @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
182      def remove(self) -> None:
183          """Remove the resource."""
184          # https://github.com/boto/boto3/issues/507 - there is no
185          # way of knowing if the file was actually deleted except
186          # for checking all the keys again; the response is always
187          # HTTP 204 regardless.
188          try:
189              self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
190          except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
191              raise FileNotFoundError(f"No such resource: {self}") from err
192
193      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
194      def read(self, size: int = -1) -> bytes:
195          """Read the contents of the resource."""
196          args = {}
197          if size > 0:
198              args["Range"] = f"bytes=0-{size-1}"
199          try:
200              response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
201          except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:  # coverage: 201 ↛ 203
202              raise FileNotFoundError(f"No such resource: {self}") from err
203          except ClientError as err:
204              _translate_client_error(err)
205              raise
206          with time_this(log, msg="Read from %s", args=(self,)):
207              body = response["Body"].read()
208              response["Body"].close()
209          return body
210
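
The Range header assembled in read() uses HTTP byte ranges, which are inclusive
at both ends, so bytes=0-{size-1} returns exactly size bytes. A standalone
sketch of the same request against a bare client (bucket and key are
hypothetical):

    client = getS3Client()
    # Inclusive range: bytes=0-9 asks for at most the first ten bytes.
    response = client.get_object(Bucket="some-bucket", Key="some/key", Range="bytes=0-9")
    data = response["Body"].read()
    assert len(data) <= 10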

211      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
212      def write(self, data: bytes, overwrite: bool = True) -> None:
213          """Write the supplied data to the resource."""
214          if not overwrite:
215              if self.exists():
216                  raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
217          with time_this(log, msg="Write to %s", args=(self,)):
218              self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
219
220      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
221      def mkdir(self) -> None:
222          """Write a directory key to S3."""
223          if not bucketExists(self.netloc):
224              raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")
225
226          if not self.dirLike:
227              raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")
228
229          # Do not create an S3 key when the root is at the top level of a bucket.
230          if not self.path == "/":
231              self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
232
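
S3 has no true directories, so mkdir() fakes one by writing a zero-byte object
whose key ends in the "/" delimiter; later listings report such keys under
CommonPrefixes. A sketch of the resulting call (bucket and key hypothetical):

    # Creates an empty object "prefix/subdir/" that acts as a directory marker.
    getS3Client().put_object(Bucket="some-bucket", Key="prefix/subdir/")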

233      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
234      def _download_file(self, local_file: ResourcePath, progress: Optional[ProgressPercentage]) -> None:
235          """Download the remote resource to a local file.

236
237          Helper routine for _as_local to allow backoff without regenerating
238          the temporary file.
239          """
240          try:
241              self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
242          except (  # coverage: 242 ↛ 246 (exception never raised)
243              self.client.exceptions.NoSuchKey,
244              self.client.exceptions.NoSuchBucket,
245          ) as err:
246              raise FileNotFoundError(f"No such resource: {self}") from err
247          except ClientError as err:
248              _translate_client_error(err)
249              raise
250
251      def _as_local(self) -> Tuple[str, bool]:
252          """Download object from S3 and place in temporary directory.
253
254          Returns
255          -------
256          path : `str`
257              Path to local temporary file.
258          temporary : `bool`
259              Always returns `True`. This is always a temporary file.
260          """
261          with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
262              with time_this(log, msg="Downloading %s to local file", args=(self,)):
263                  progress = (
264                      ProgressPercentage(self, msg="Downloading:")
265                      if log.isEnabledFor(ProgressPercentage.log_level)
266                      else None
267                  )
268                  self._download_file(tmpFile, progress)
269          return tmpFile.name, True
270
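
Callers normally reach _as_local through the public as_local() context manager
on ResourcePath, which removes the temporary file on exit. A hedged usage
sketch (the URI and the process() consumer are hypothetical):

    uri = ResourcePath("s3://some-bucket/data/file.fits")
    with uri.as_local() as local:
        # local is a ResourcePath for the downloaded temporary file; it is
        # deleted when the context exits because it is flagged as temporary.
        process(local.ospath)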

271      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
272      def _upload_file(self, local_file: ResourcePath, progress: Optional[ProgressPercentage]) -> None:
273          """Upload a local file with backoff.
274
275          Helper method to wrap file uploading in backoff for transfer_from.
276          """
277          try:
278              self.client.upload_file(
279                  local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
280              )
281          except self.client.exceptions.NoSuchBucket as err:
282              raise NotADirectoryError(f"Target does not exist: {err}") from err
283          except ClientError as err:
284              _translate_client_error(err)
285              raise
286
287      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
288      def _copy_from(self, src: ResourcePath) -> None:
289          copy_source = {
290              "Bucket": src.netloc,
291              "Key": src.relativeToPathRoot,
292          }
293          try:
294              self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
295          except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:  # coverage: 295 ↛ 297
296              raise FileNotFoundError(f"No such resource to transfer: {self}") from err
297          except ClientError as err:
298              _translate_client_error(err)
299              raise
300
301      def transfer_from(
302          self,
303          src: ResourcePath,
304          transfer: str = "copy",
305          overwrite: bool = False,
306          transaction: Optional[TransactionProtocol] = None,
307      ) -> None:
308          """Transfer the source resource to this S3 resource.
309
310          Parameters
311          ----------
312          src : `ResourcePath`
313              Source URI.
314          transfer : `str`
315              Mode to use for transferring the resource. Supports the following
316              options: copy.
317          overwrite : `bool`, optional
318              Allow an existing file to be overwritten. Defaults to `False`.
319          transaction : `~lsst.resources.utils.TransactionProtocol`, optional
320              Currently unused.
321          """
322          # Fail early to prevent delays if remote resources are requested.
323          if transfer not in self.transferModes:
324              raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")
325
326          # Existence checks cost time so do not call them unless we know
327          # that debugging is enabled.
328          if log.isEnabledFor(logging.DEBUG):
329              log.debug(
330                  "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
331                  src,
332                  src.exists(),
333                  self,
334                  self.exists(),
335                  transfer,
336              )
337
338          # Short-circuit immediately if the URIs are identical.
339          if self == src:
340              log.debug(
341                  "Target and destination URIs are identical: %s, returning immediately."
342                  " No further action required.",
343                  self,
344              )
345              return
346
347          if not overwrite and self.exists():
348              raise FileExistsError(f"Destination path '{self}' already exists.")
349
350          if transfer == "auto":
351              transfer = self.transferDefault
352
353          timer_msg = "Transfer from %s to %s"
354          timer_args = (src, self)
355
356          if isinstance(src, type(self)):
357              # Looks like an S3 remote URI, so we can use direct copy.
358              # Note that boto3.resource.meta.copy is cleverer than the
359              # low-level copy_object.
360              with time_this(log, msg=timer_msg, args=timer_args):
361                  self._copy_from(src)
362
363          else:
364              # Use a local file and upload it.
365              with src.as_local() as local_uri:
366                  progress = (
367                      ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
368                      if log.isEnabledFor(ProgressPercentage.log_level)
369                      else None
370                  )
371                  with time_this(log, msg=timer_msg, args=timer_args):
372                      self._upload_file(local_uri, progress)
373
374          # This was an explicit move requested from a remote resource,
375          # so try to remove that resource.
376          if transfer == "move":
377              # Transactions do not work here.
378              src.remove()
379
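
transfer_from() is called on the destination: S3-to-S3 sources go through the
server-side _copy_from(), while anything else is materialized as a local file
and uploaded. A hedged sketch (both URIs are hypothetical):

    dest = ResourcePath("s3://some-bucket/data/file.fits")
    src = ResourcePath("file:///tmp/file.fits")
    # The source is already local, so this goes through _upload_file().
    dest.transfer_from(src, transfer="copy", overwrite=True)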

380      @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
381      def walk(
382          self, file_filter: Optional[Union[str, re.Pattern]] = None
383      ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
384          """Walk the directory tree, returning matching files and directories.
385
386          Parameters
387          ----------
388          file_filter : `str` or `re.Pattern`, optional
389              Regex to filter out files from the list before it is returned.
390
391          Yields
392          ------
393          dirpath : `ResourcePath`
394              Current directory being examined.
395          dirnames : `list` of `str`
396              Names of subdirectories within dirpath.
397          filenames : `list` of `str`
398              Names of all the files within dirpath.
399          """
400          # We pretend that S3 uses directories and files and not simply keys.
401          if not (self.isdir() or self.is_root):
402              raise ValueError(f"Can not walk a non-directory URI: {self}")
403
404          if isinstance(file_filter, str):  # coverage: 404 ↛ 405 (condition never true)
405              file_filter = re.compile(file_filter)
406
407          s3_paginator = self.client.get_paginator("list_objects_v2")
408
409          # Limit each query to a single "directory" to match os.walk.
410          # We could download all keys at once with no delimiter and work
411          # it out locally, but this could potentially lead to large memory
412          # usage for millions of keys. It will also make the initial call
413          # to this method potentially very slow. If making this method look
414          # like os.walk were not required, we could query all keys with
415          # pagination and return them in groups of 1000, but that would
416          # be a different interface since we can't guarantee we would get
417          # them all grouped properly across the 1000-key limit boundary.
418          prefix = self.relativeToPathRoot if not self.is_root else ""
419          prefix_len = len(prefix)
420          dirnames = []
421          filenames = []
422          files_there = False
423
424          for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
425              # All results are returned as full key names and we must
426              # convert them back to the root form. The prefix is fixed
427              # and delimited, so that is a simple trim.
428
429              # Directories are reported in the CommonPrefixes result,
430              # which reports the entire key and must be stripped.
431              found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
432              dirnames.extend(found_dirs)
433
434              found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
435              if found_files:
436                  files_there = True
437              if file_filter is not None:
438                  found_files = [f for f in found_files if file_filter.search(f)]
439
440              filenames.extend(found_files)
441
442          # Directories do not exist, so we can't test for them. If no files
443          # or directories were found, though, this means that the resource
444          # effectively does not exist, and we should match os.walk()
445          # behavior and return immediately.
446          if not dirnames and not files_there:
447              return
448          else:
449              yield self, dirnames, filenames
450
451              for dir in dirnames:
452                  new_uri = self.join(dir)
453                  yield from new_uri.walk(file_filter)
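
The Prefix/Delimiter combination used above is the standard way to emulate a
directory listing on S3. A standalone sketch of the same pattern with a bare
paginator (bucket and prefix are hypothetical):

    paginator = getS3Client().get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket="some-bucket", Prefix="data/", Delimiter="/"):
        # "Subdirectories" arrive under CommonPrefixes, files under Contents.
        subdirs = [p["Prefix"] for p in page.get("CommonPrefixes", ())]
        files = [obj["Key"] for obj in page.get("Contents", ())]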