Coverage for python/lsst/resources/s3.py: 28% (228 statements)

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import os
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from functools import cache, cached_property
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

try:
    from boto3.s3.transfer import TransferConfig  # type: ignore
except ImportError:
    TransferConfig = None

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads.

    Parameters
    ----------
    file : `ResourcePath`
        Resource that is relevant to the progress percentage. The size of
        this resource will be used to determine progress. The name will be
        used in the log messages unless overridden by ``file_for_msg``.
    file_for_msg : `ResourcePath` or `None`, optional
        Resource name to include in log messages in preference to ``file``.
    msg : `str`, optional
        Message text to be included in every progress log message.
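
    Examples
    --------
    Instances are passed to boto3 transfer calls as the ``Callback``
    argument, as ``S3ResourcePath._upload_file`` does below; a minimal
    sketch, where ``client``, ``bucket``, ``key`` and ``local_uri`` are
    illustrative names::

        progress = ProgressPercentage(local_uri, msg="Uploading:")
        client.upload_file(local_uri.ospath, bucket, key, Callback=progress)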

73 """ 

74 

75 log_level = logging.DEBUG 

76 """Default log level to use when issuing a message.""" 

77 

78 def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""): 

79 self._filename = file 

80 self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file) 

81 self._size = file.size() 

82 self._seen_so_far = 0 

83 self._lock = threading.Lock() 

84 self._msg = msg 

85 

86 def __call__(self, bytes_amount: int) -> None: 

87 # To simplify, assume this is hooked up to a single filename 

88 with self._lock: 

89 self._seen_so_far += bytes_amount 

90 percentage = (100 * self._seen_so_far) // self._size 

91 log.log( 

92 self.log_level, 

93 "%s %s %s / %s (%s%%)", 

94 self._msg, 

95 self._file_for_msg, 

96 self._seen_so_far, 

97 self._size, 

98 percentage, 

99 ) 

100 


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404 not-found response.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message but there is
        # no direct way to access it without looking inside the response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err


@cache
def _parse_string_to_maybe_bool(maybe_bool_str: str) -> bool | None:
    """Map a string to either a boolean value or None.

    Parameters
    ----------
    maybe_bool_str : `str`
        The value to parse.

    Returns
    -------
    maybe_bool : `bool` or `None`
        The parsed value.
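
    Examples
    --------
    The accepted spellings follow directly from the comparisons below:

    >>> _parse_string_to_maybe_bool("True")
    True
    >>> _parse_string_to_maybe_bool("n")
    False
    >>> _parse_string_to_maybe_bool("") is None
    True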

138 """ 

139 if maybe_bool_str.lower() in ["t", "true", "yes", "y", "1"]: 

140 maybe_bool = True 

141 elif maybe_bool_str.lower() in ["f", "false", "no", "n", "0"]: 

142 maybe_bool = False 

143 elif maybe_bool_str.lower() in ["none", ""]: 

144 maybe_bool = None 

145 else: 

146 raise ValueError(f'Value of "{maybe_bool_str}" is not True, False, or None.') 

147 

148 return maybe_bool 

149 


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class.

    Notes
    -----
    The behavior of instances of this class can be configured with the
    following environment variable:

    - ``LSST_S3_USE_THREADS``: May be True, False, or None. Sets whether
      threading is used for downloads, with a value of None deferring to
      boto's default. Users may wish to set it to False when downloads will
      be started from threads other than Python's main thread.
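
    Examples
    --------
    A minimal sketch of disabling threaded downloads via the environment.
    The value is parsed once and cached, so it must be set before the
    first transfer::

        import os

        os.environ["LSST_S3_USE_THREADS"] = "False"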

163 """ 

164 

165 use_threads: bool | None = None 

166 """Explicitly turn on or off threading in use of boto's download_fileobj. 

167 Setting this to None results in boto's default behavior.""" 

168 

169 @cached_property 

170 def _environ_use_threads(self) -> bool | None: 

171 try: 

172 use_threads_str = os.environ["LSST_S3_USE_THREADS"] 

173 except KeyError: 

174 use_threads_str = "None" 

175 

176 use_threads = _parse_string_to_maybe_bool(use_threads_str) 

177 

178 return use_threads 

    @property
    def _transfer_config(self) -> TransferConfig:
        if self.use_threads is None:
            self.use_threads = self._environ_use_threads

        if self.use_threads is None:
            transfer_config = TransferConfig()
        else:
            transfer_config = TransferConfig(use_threads=self.use_threads)

        return transfer_config

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @classmethod
    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
        # Force client to be created before creating threads.
        getS3Client()

        return super()._mexists(uris)

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing if the file was actually deleted except by checking all
        # the keys again; the response is always HTTP 204.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size - 1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for ``_as_local`` to allow backoff without
        regenerating the temporary file.
        """
        try:
            self.client.download_fileobj(
                self.netloc,
                self.relativeToPathRoot,
                local_file,
                Callback=progress,
                Config=self._transfer_config,
            )
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports "copy" and
            "move"; "auto" selects the default mode for this scheme.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
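
        Examples
        --------
        A usage sketch with illustrative URIs::

            dest = ResourcePath("s3://bucket/data/file.fits")
            dest.transfer_from(ResourcePath("file:///tmp/file.fits"), transfer="copy")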

369 """ 

370 # Fail early to prevent delays if remote resources are requested 

371 if transfer not in self.transferModes: 

372 raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}") 

373 

374 # Existence checks cost time so do not call this unless we know 

375 # that debugging is enabled. 

376 if log.isEnabledFor(logging.DEBUG): 

377 log.debug( 

378 "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)", 

379 src, 

380 src.exists(), 

381 self, 

382 self.exists(), 

383 transfer, 

384 ) 

385 

386 # Short circuit if the URIs are identical immediately. 

387 if self == src: 

388 log.debug( 

389 "Target and destination URIs are identical: %s, returning immediately." 

390 " No further action required.", 

391 self, 

392 ) 

393 return 

394 

395 if not overwrite and self.exists(): 

396 raise FileExistsError(f"Destination path '{self}' already exists.") 

397 

398 if transfer == "auto": 

399 transfer = self.transferDefault 

400 

401 timer_msg = "Transfer from %s to %s" 

402 timer_args = (src, self) 

403 

404 if isinstance(src, type(self)): 

405 # Looks like an S3 remote uri so we can use direct copy 

406 # note that boto3.resource.meta.copy is cleverer than the low 

407 # level copy_object 

408 with time_this(log, msg=timer_msg, args=timer_args): 

409 self._copy_from(src) 

410 

411 else: 

412 # Use local file and upload it 

413 with src.as_local() as local_uri: 

414 progress = ( 

415 ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:") 

416 if log.isEnabledFor(ProgressPercentage.log_level) 

417 else None 

418 ) 

419 with time_this(log, msg=timer_msg, args=timer_args): 

420 self._upload_file(local_uri, progress) 

421 

422 # This was an explicit move requested from a remote resource 

423 # try to remove that resource 

424 if transfer == "move": 

425 # Transactions do not work here 

426 src.remove() 

427 

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within ``dirpath``.
        filenames : `list` of `str`
            Names of all the files within ``dirpath``.
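
        Examples
        --------
        Mirrors `os.walk`, yielding one tuple per "directory"; a sketch
        with an illustrative bucket::

            for dirpath, dirnames, filenames in ResourcePath("s3://bucket/dir/").walk():
                print(dirpath, filenames)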

447 """ 

448 # We pretend that S3 uses directories and files and not simply keys 

449 if not (self.isdir() or self.is_root): 

450 raise ValueError(f"Can not walk a non-directory URI: {self}") 

451 

452 if isinstance(file_filter, str): 

453 file_filter = re.compile(file_filter) 

454 

455 s3_paginator = self.client.get_paginator("list_objects_v2") 

456 

457 # Limit each query to a single "directory" to match os.walk 

458 # We could download all keys at once with no delimiter and work 

459 # it out locally but this could potentially lead to large memory 

460 # usage for millions of keys. It will also make the initial call 

461 # to this method potentially very slow. If making this method look 

462 # like os.walk was not required, we could query all keys with 

463 # pagination and return them in groups of 1000, but that would 

464 # be a different interface since we can't guarantee we would get 

465 # them all grouped properly across the 1000 limit boundary. 

466 prefix = self.relativeToPathRoot if not self.is_root else "" 

467 prefix_len = len(prefix) 

468 dirnames = [] 

469 filenames = [] 

470 files_there = False 

471 

472 for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"): 

473 # All results are returned as full key names and we must 

474 # convert them back to the root form. The prefix is fixed 

475 # and delimited so that is a simple trim 

476 

477 # Directories are reported in the CommonPrefixes result 

478 # which reports the entire key and must be stripped. 

479 found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())] 

480 dirnames.extend(found_dirs) 

481 

482 found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())] 

483 if found_files: 

484 files_there = True 

485 if file_filter is not None: 

486 found_files = [f for f in found_files if file_filter.search(f)] 

487 

488 filenames.extend(found_files) 

489 

490 # Directories do not exist so we can't test for them. If no files 

491 # or directories were found though, this means that it effectively 

492 # does not exist and we should match os.walk() behavior and return 

493 # immediately. 

494 if not dirnames and not files_there: 

495 return 

496 else: 

497 yield self, dirnames, filenames 

498 

499 for dir in dirnames: 

500 new_uri = self.join(dir) 

501 yield from new_uri.walk(file_filter) 

502 

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # Cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub

    def generate_presigned_get_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited
        return self._generate_presigned_url("get_object", expiration_time_seconds)

    def generate_presigned_put_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited
        return self._generate_presigned_url("put_object", expiration_time_seconds)

    def _generate_presigned_url(self, method: str, expiration_time_seconds: int) -> str:
        return self.client.generate_presigned_url(
            method,
            Params={"Bucket": self.netloc, "Key": self.relativeToPathRoot},
            ExpiresIn=expiration_time_seconds,
        )