Coverage for python/lsst/resources/s3.py: 28%

249 statements  


# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import os
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from functools import cache, cached_property
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

try:
    from boto3.s3.transfer import TransferConfig  # type: ignore
except ImportError:
    TransferConfig = None

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)

class ProgressPercentage:
    """Progress bar for S3 file uploads.

    Parameters
    ----------
    file : `ResourcePath`
        Resource that is relevant to the progress percentage. The size of this
        resource will be used to determine progress. The name will be used
        in the log messages unless overridden by ``file_for_msg``.
    file_for_msg : `ResourcePath` or `None`, optional
        Resource name to include in log messages in preference to ``file``.
    msg : `str`, optional
        Message text to be included in every progress log message.
    """

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
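# A minimal usage sketch: instances are intended to be passed as the
# ``Callback`` argument of boto3 transfer calls (as done in _download_file
# and _upload_file below), which invoke them with the number of bytes
# transferred so far. Names here are hypothetical:
#
#     progress = ProgressPercentage(path, msg="Downloading:")
#     client.download_fileobj(bucket, key, fileobj, Callback=progress)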

def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404 not-found response.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message
        # but no direct way to access it without looking inside the
        # response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err

@cache
def _parse_string_to_maybe_bool(maybe_bool_str: str) -> bool | None:
    """Map a string to either a boolean value or None.

    Parameters
    ----------
    maybe_bool_str : `str`
        The value to parse.

    Returns
    -------
    maybe_bool : `bool` or `None`
        The parsed value.
    """
    if maybe_bool_str.lower() in ["t", "true", "yes", "y", "1"]:
        maybe_bool = True
    elif maybe_bool_str.lower() in ["f", "false", "no", "n", "0"]:
        maybe_bool = False
    elif maybe_bool_str.lower() in ["none", ""]:
        maybe_bool = None
    else:
        raise ValueError(f'Value of "{maybe_bool_str}" is not True, False, or None.')

    return maybe_bool
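# For illustration, the mappings implemented above (matching is
# case-insensitive):
#
#     _parse_string_to_maybe_bool("true")  -> True
#     _parse_string_to_maybe_bool("0")     -> False
#     _parse_string_to_maybe_bool("")      -> None
#     _parse_string_to_maybe_bool("maybe") -> ValueError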

class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class.

    Notes
    -----
    The behavior of instances of this class can be configured with the
    following environment variable:

    - LSST_S3_USE_THREADS: May be True, False, or None. Sets whether threading
      is used for downloads, with a value of None falling back to boto's
      default. Users may wish to set it to False when downloads will be
      started from threads other than Python's main thread.
    """

    use_threads: bool | None = None
    """Explicitly turn on or off threading in calls to boto's
    download_fileobj. Setting this to None results in boto's default
    behavior."""
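    # A hypothetical shell-level example of configuring this behavior via
    # the environment variable read by _environ_use_threads below:
    #
    #     export LSST_S3_USE_THREADS=False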

    @cached_property
    def _environ_use_threads(self) -> bool | None:
        try:
            use_threads_str = os.environ["LSST_S3_USE_THREADS"]
        except KeyError:
            use_threads_str = "None"

        use_threads = _parse_string_to_maybe_bool(use_threads_str)

        return use_threads

    @property
    def _transfer_config(self) -> TransferConfig:
        if self.use_threads is None:
            self.use_threads = self._environ_use_threads

        if self.use_threads is None:
            transfer_config = TransferConfig()
        else:
            transfer_config = TransferConfig(use_threads=self.use_threads)

        return transfer_config

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        return getS3Client(self._profile)

    @property
    def _profile(self) -> str | None:
        """Profile name to use for looking up S3 credentials and endpoint."""
        return self._uri.username

    @property
    def _bucket(self) -> str:
        """S3 bucket where the files are stored."""
        # Notionally the bucket is stored in the 'hostname' part of the URI.
        # However, Ceph S3 uses a "multi-tenant" syntax for bucket names in
        # the form 'tenant:bucket'. The part after the colon is parsed as the
        # port portion of the URI, and urllib throws an exception if you try
        # to read a non-integer port value. So manually split off this
        # portion of the URI.
        split = self._uri.netloc.split("@")
        num_components = len(split)
        if num_components == 2:
            # There is a profile@ portion of the URL, so take the second half.
            bucket = split[1]
        elif num_components == 1:
            # There is no profile@, so take the whole netloc.
            bucket = split[0]
        else:
            raise ValueError(f"Unexpected extra '@' in S3 URI: '{self}'")

        if not bucket:
            raise ValueError(f"S3 URI does not include bucket name: '{self}'")

        return bucket
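    # Illustrative URI forms handled above and by _profile (all names
    # hypothetical):
    #
    #     s3://bucket/key                 -> bucket "bucket"
    #     s3://profile@bucket/key         -> bucket "bucket", profile "profile"
    #     s3://profile@tenant:bucket/key  -> Ceph multi-tenant bucket
    #                                        "tenant:bucket"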

    @classmethod
    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
        # Force client to be created for each profile before creating threads.
        profiles = set[str | None]()
        for path in uris:
            if path.scheme == "s3":
                path = cast(S3ResourcePath, path)
                profiles.add(path._profile)
        for profile in profiles:
            getS3Client(profile)

        return super()._mexists(uris)

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self._bucket, self.client)
        exists, _ = s3CheckFileExists(self, bucket=self._bucket, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, bucket=self._bucket, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is always HTTP 204 (No Content).
        try:
            self.client.delete_object(Bucket=self._bucket, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size - 1}"
        try:
            response = self.client.get_object(Bucket=self._bucket, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
        return body
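    # An illustrative partial read (hypothetical URI): a positive ``size``
    # fetches only the leading bytes via an HTTP Range header rather than
    # the whole object:
    #
    #     data = ResourcePath("s3://bucket/file.dat").read(size=100)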

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self._bucket, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self._bucket, self.client):
            raise ValueError(f"Bucket {self._bucket} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Cannot create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self._bucket, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(
                self._bucket,
                self.relativeToPathRoot,
                local_file,
                Callback=progress,
                Config=self._transfer_config,
            )
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self._bucket, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: S3ResourcePath) -> None:
        copy_source = {
            "Bucket": src._bucket,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self._bucket, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, move, auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """

        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Cannot walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It would also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self._bucket, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
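    # An illustrative traversal (hypothetical URI), mirroring os.walk
    # semantics:
    #
    #     for dirpath, dirnames, filenames in ResourcePath("s3://bucket/data/").walk():
    #         for name in filenames:
    #             print(dirpath.join(name))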

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self._bucket, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # Cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub

    def generate_presigned_get_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("get_object", expiration_time_seconds)

    def generate_presigned_put_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("put_object", expiration_time_seconds)

    def _generate_presigned_url(self, method: str, expiration_time_seconds: int) -> str:
        return self.client.generate_presigned_url(
            method,
            Params={"Bucket": self._bucket, "Key": self.relativeToPathRoot},
            ExpiresIn=expiration_time_seconds,
        )
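    # For example (hypothetical URI and lifetime), a URL permitting an
    # unauthenticated HTTP GET of the object for one hour:
    #
    #     url = ResourcePath("s3://bucket/file.dat").generate_presigned_get_url(
    #         expiration_time_seconds=3600
    #     )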