Coverage for python/lsst/resources/s3.py: 89%

194 statements  

coverage.py v7.4.0, created at 2024-01-09 11:30 +0000

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads.

    Parameters
    ----------
    file : `ResourcePath`
        Resource that is relevant to the progress percentage. The size of this
        resource will be used to determine progress. The name will be used
        in the log messages unless overridden by ``file_for_msg``.
    file_for_msg : `ResourcePath` or `None`, optional
        Resource name to include in log messages in preference to ``file``.
    msg : `str`, optional
        Message text to be included in every progress log message.
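
    Examples
    --------
    A minimal sketch of how the callback is used; the local path is
    hypothetical, and boto3 normally invokes the callback once per
    transferred chunk::

        resource = ResourcePath("/tmp/file.dat")
        progress = ProgressPercentage(resource, msg="Uploading:")
        progress(8192)  # logs "Uploading: /tmp/file.dat 8192 / <size> (N%)"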

66 """ 

67 

68 log_level = logging.DEBUG 

69 """Default log level to use when issuing a message.""" 

70 

71 def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""): 

72 self._filename = file 

73 self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file) 

74 self._size = file.size() 

75 self._seen_so_far = 0 

76 self._lock = threading.Lock() 

77 self._msg = msg 

78 

79 def __call__(self, bytes_amount: int) -> None: 

80 # To simplify, assume this is hooked up to a single filename 

81 with self._lock: 

82 self._seen_so_far += bytes_amount 

83 percentage = (100 * self._seen_so_far) // self._size 

84 log.log( 

85 self.log_level, 

86 "%s %s %s / %s (%s%%)", 

87 self._msg, 

88 self._file_for_msg, 

89 self._seen_so_far, 

90 self._size, 

91 percentage, 

92 ) 

93 

94 

95def _translate_client_error(err: ClientError) -> None: 

96 """Translate a ClientError into a specialist error if relevant. 

97 

98 Parameters 

99 ---------- 

100 err : `ClientError` 

101 Exception to translate. 

102 

103 Raises 

104 ------ 

105 _TooManyRequestsError 

106 Raised if the `ClientError` looks like a 429 retry request. 
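
    Examples
    --------
    Intended to be called from an exception handler and followed by a bare
    ``raise`` so that untranslated errors propagate unchanged; ``client``,
    ``bucket`` and ``key`` are placeholders::

        try:
            response = client.get_object(Bucket=bucket, Key=key)
        except ClientError as err:
            _translate_client_error(err)
            raise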

107 """ 

108 if "(429)" in str(err): 

109 # ClientError includes the error code in the message 

110 # but no direct way to access it without looking inside the 

111 # response. 

112 raise _TooManyRequestsError(str(err)) from err 

113 elif "(404)" in str(err): 

114 # Some systems can generate this rather than NoSuchKey. 

115 raise FileNotFoundError("Resource not found: {self}") 

116 

117 

118class S3ResourcePath(ResourcePath): 

119 """S3 URI resource path implementation class.""" 

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()

    @classmethod
    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
        # Force the client to be created before creating threads.
        getS3Client()

        return super()._mexists(uris)

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is HTTP 204 every time.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        args = {}
        if size > 0:
            # Request only the first ``size`` bytes via an HTTP Range header.
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Cannot create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for `_as_local` to allow backoff without
        regenerating the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place it in a temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
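
        Examples
        --------
        Normally reached through the public `as_local` context manager,
        which also cleans up the temporary file on exit; the URI is
        hypothetical::

            uri = ResourcePath("s3://some-bucket/data/file.dat")
            with uri.as_local() as local_uri:
                print(local_uri.ospath)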

229 """ 

230 with ( 

231 tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile, 

232 time_this(log, msg="Downloading %s to local file", args=(self,)), 

233 ): 

234 progress = ( 

235 ProgressPercentage(self, msg="Downloading:") 

236 if log.isEnabledFor(ProgressPercentage.log_level) 

237 else None 

238 ) 

239 self._download_file(tmpFile, progress) 

240 return tmpFile.name, True 

241 

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for `transfer_from`.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: "copy", "move", and "auto" (which resolves
            to the default transfer mode).
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
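
        Examples
        --------
        A minimal sketch uploading a local file; both URIs are
        hypothetical::

            remote = ResourcePath("s3://some-bucket/file.dat")
            remote.transfer_from(ResourcePath("/tmp/file.dat"), transfer="copy")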

292 """ 

293 # Fail early to prevent delays if remote resources are requested 

294 if transfer not in self.transferModes: 

295 raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}") 

296 

297 # Existence checks cost time so do not call this unless we know 

298 # that debugging is enabled. 

299 if log.isEnabledFor(logging.DEBUG): 299 ↛ 310line 299 didn't jump to line 310, because the condition on line 299 was never false

300 log.debug( 

301 "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)", 

302 src, 

303 src.exists(), 

304 self, 

305 self.exists(), 

306 transfer, 

307 ) 

308 

309 # Short circuit if the URIs are identical immediately. 

310 if self == src: 

311 log.debug( 

312 "Target and destination URIs are identical: %s, returning immediately." 

313 " No further action required.", 

314 self, 

315 ) 

316 return 

317 

318 if not overwrite and self.exists(): 

319 raise FileExistsError(f"Destination path '{self}' already exists.") 

320 

321 if transfer == "auto": 

322 transfer = self.transferDefault 

323 

324 timer_msg = "Transfer from %s to %s" 

325 timer_args = (src, self) 

326 

327 if isinstance(src, type(self)): 

328 # Looks like an S3 remote uri so we can use direct copy 

329 # note that boto3.resource.meta.copy is cleverer than the low 

330 # level copy_object 

331 with time_this(log, msg=timer_msg, args=timer_args): 

332 self._copy_from(src) 

333 

334 else: 

335 # Use local file and upload it 

336 with src.as_local() as local_uri: 

337 progress = ( 

338 ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:") 

339 if log.isEnabledFor(ProgressPercentage.log_level) 

340 else None 

341 ) 

342 with time_this(log, msg=timer_msg, args=timer_args): 

343 self._upload_file(local_uri, progress) 

344 

345 # This was an explicit move requested from a remote resource 

346 # try to remove that resource 

347 if transfer == "move": 

348 # Transactions do not work here 

349 src.remove() 

350 

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree, returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
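
        Examples
        --------
        Mirrors the `os.walk` interface; the URI is hypothetical::

            root = ResourcePath("s3://some-bucket/data/")
            for dirpath, dirnames, filenames in root.walk():
                for name in filenames:
                    print(dirpath.join(name))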

370 """ 

371 # We pretend that S3 uses directories and files and not simply keys 

372 if not (self.isdir() or self.is_root): 

373 raise ValueError(f"Can not walk a non-directory URI: {self}") 

374 

375 if isinstance(file_filter, str): 375 ↛ 376line 375 didn't jump to line 376, because the condition on line 375 was never true

376 file_filter = re.compile(file_filter) 

377 

378 s3_paginator = self.client.get_paginator("list_objects_v2") 

379 

380 # Limit each query to a single "directory" to match os.walk 

381 # We could download all keys at once with no delimiter and work 

382 # it out locally but this could potentially lead to large memory 

383 # usage for millions of keys. It will also make the initial call 

384 # to this method potentially very slow. If making this method look 

385 # like os.walk was not required, we could query all keys with 

386 # pagination and return them in groups of 1000, but that would 

387 # be a different interface since we can't guarantee we would get 

388 # them all grouped properly across the 1000 limit boundary. 

389 prefix = self.relativeToPathRoot if not self.is_root else "" 

390 prefix_len = len(prefix) 

391 dirnames = [] 

392 filenames = [] 

393 files_there = False 

394 

395 for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"): 

396 # All results are returned as full key names and we must 

397 # convert them back to the root form. The prefix is fixed 

398 # and delimited so that is a simple trim 

399 

400 # Directories are reported in the CommonPrefixes result 

401 # which reports the entire key and must be stripped. 

402 found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())] 

403 dirnames.extend(found_dirs) 

404 

405 found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())] 

406 if found_files: 

407 files_there = True 

408 if file_filter is not None: 

409 found_files = [f for f in found_files if file_filter.search(f)] 

410 

411 filenames.extend(found_files) 

412 

413 # Directories do not exist so we can't test for them. If no files 

414 # or directories were found though, this means that it effectively 

415 # does not exist and we should match os.walk() behavior and return 

416 # immediately. 

417 if not dirnames and not files_there: 

418 return 

419 else: 

420 yield self, dirnames, filenames 

421 

422 for dir in dirnames: 

423 new_uri = self.join(dir) 

424 yield from new_uri.walk(file_filter) 

425 

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # Cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub

    def generate_presigned_get_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("get_object", expiration_time_seconds)

    def generate_presigned_put_url(self, *, expiration_time_seconds: int) -> str:
        # Docstring inherited.
        return self._generate_presigned_url("put_object", expiration_time_seconds)

    def _generate_presigned_url(self, method: str, expiration_time_seconds: int) -> str:
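        """Return a presigned URL for the given client operation.

        A thin wrapper over botocore's ``generate_presigned_url``;
        ``method`` is a client operation name such as ``"get_object"`` or
        ``"put_object"`` (the only values used by the callers above).
        """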

        return self.client.generate_presigned_url(
            method,
            Params={"Bucket": self.netloc, "Key": self.relativeToPathRoot},
            ExpiresIn=expiration_time_seconds,
        )