Coverage for python/lsst/resources/s3.py: 89%

188 statements  

coverage.py v7.3.2, created at 2023-11-30 11:34 +0000

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads.


    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )


def _translate_client_error(err: ClientError) -> None:
    """Translate a `ClientError` into a more specific exception if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404 missing-resource error.
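
    Examples
    --------
    A minimal sketch; the error response below is fabricated for
    illustration::

        err = ClientError(
            {"Error": {"Code": "429", "Message": "Too many requests"}},
            "GetObject",
        )
        try:
            _translate_client_error(err)
        except _TooManyRequestsError:
            ...  # eligible for backoff and retry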

95 """ 

96 if "(429)" in str(err): 

97 # ClientError includes the error code in the message 

98 # but no direct way to access it without looking inside the 

99 # response. 

100 raise _TooManyRequestsError(str(err)) from err 

101 elif "(404)" in str(err): 

102 # Some systems can generate this rather than NoSuchKey. 

103 raise FileNotFoundError("Resource not found: {self}") 

104 

105 

106class S3ResourcePath(ResourcePath): 

107 """S3 URI resource path implementation class.""" 

108 

109 @property 

110 def client(self) -> boto3.client: 

111 """Client object to address remote resource.""" 

112 # Defer import for circular dependencies 

113 return getS3Client() 

114 

115 @classmethod 

116 def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]: 

117 # Force client to be created before creating threads. 

118 getS3Client() 

119 

120 return super()._mexists(uris) 

121 

122 @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time) 

123 def exists(self) -> bool: 

124 """Check that the S3 resource exists.""" 

125 if self.is_root: 

126 # Only check for the bucket since the path is irrelevant 

127 return bucketExists(self.netloc) 

128 exists, _ = s3CheckFileExists(self, client=self.client) 

129 return exists 

130 

131 @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time) 

132 def size(self) -> int: 

133 """Return the size of the resource in bytes.""" 

134 if self.dirLike: 

135 return 0 

136 exists, sz = s3CheckFileExists(self, client=self.client) 

137 if not exists: 

138 raise FileNotFoundError(f"Resource {self} does not exist") 

139 return sz 

140 

141 @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time) 

142 def remove(self) -> None: 

143 """Remove the resource.""" 

144 # https://github.com/boto/boto3/issues/507 - there is no 

145 # way of knowing if the file was actually deleted except 

146 # for checking all the keys again, reponse is HTTP 204 OK 

147 # response all the time 

148 try: 

149 self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot) 

150 except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err: 

151 raise FileNotFoundError("No such resource: {self}") from err 

152 

153 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time) 

154 def read(self, size: int = -1) -> bytes: 

155 """Read the contents of the resource.""" 

        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size - 1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource.

        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Do not create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for `_as_local` to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
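
        Examples
        --------
        Normally reached through the public `as_local` context manager; the
        URI and ``process`` are illustrative::

            with ResourcePath("s3://some-bucket/some/key.fits").as_local() as local:
                process(local.ospath)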

219 """ 

220 with ( 

221 tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile, 

222 time_this(log, msg="Downloading %s to local file", args=(self,)), 

223 ): 

224 progress = ( 

225 ProgressPercentage(self, msg="Downloading:") 

226 if log.isEnabledFor(ProgressPercentage.log_level) 

227 else None 

228 ) 

229 self._download_file(tmpFile, progress) 

230 return tmpFile.name, True 

231 

232 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time) 

233 def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None: 

234 """Upload a local file with backoff. 

235 

236 Helper method to wrap file uploading in backoff for transfer_from. 

237 """ 

238 try: 

239 self.client.upload_file( 

240 local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress 

241 ) 

242 except self.client.exceptions.NoSuchBucket as err: 

243 raise NotADirectoryError(f"Target does not exist: {err}") from err 

244 except ClientError as err: 

245 _translate_client_error(err) 

246 raise 

247 

248 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time) 

249 def _copy_from(self, src: ResourcePath) -> None: 

250 copy_source = { 

251 "Bucket": src.netloc, 

252 "Key": src.relativeToPathRoot, 

253 } 

254 try: 

255 self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot) 

256 except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err: 256 ↛ 258line 256 didn't jump to line 258

257 raise FileNotFoundError("No such resource to transfer: {self}") from err 

258 except ClientError as err: 

259 _translate_client_error(err) 

260 raise 

261 

262 def transfer_from( 

263 self, 

264 src: ResourcePath, 

265 transfer: str = "copy", 

266 overwrite: bool = False, 

267 transaction: TransactionProtocol | None = None, 

268 ) -> None: 

269 """Transfer the current resource to an S3 bucket. 

270 

271 Parameters 

272 ---------- 

273 src : `ResourcePath` 

274 Source URI. 

275 transfer : `str` 

276 Mode to use for transferring the resource. Supports the following 

277 options: copy. 

278 overwrite : `bool`, optional 

279 Allow an existing file to be overwritten. Defaults to `False`. 

280 transaction : `~lsst.resources.utils.TransactionProtocol`, optional 

281 Currently unused. 
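
        Examples
        --------
        A minimal sketch; the URIs are illustrative::

            dest = ResourcePath("s3://some-bucket/data/file.fits")
            dest.transfer_from(ResourcePath("file:///tmp/file.fits"), transfer="copy")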

282 """ 

283 # Fail early to prevent delays if remote resources are requested 

284 if transfer not in self.transferModes: 

285 raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}") 

286 

287 # Existence checks cost time so do not call this unless we know 

288 # that debugging is enabled. 

289 if log.isEnabledFor(logging.DEBUG): 289 ↛ 300line 289 didn't jump to line 300, because the condition on line 289 was never false

290 log.debug( 

291 "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)", 

292 src, 

293 src.exists(), 

294 self, 

295 self.exists(), 

296 transfer, 

297 ) 

298 

299 # Short circuit if the URIs are identical immediately. 

300 if self == src: 

301 log.debug( 

302 "Target and destination URIs are identical: %s, returning immediately." 

303 " No further action required.", 

304 self, 

305 ) 

306 return 

307 

308 if not overwrite and self.exists(): 

309 raise FileExistsError(f"Destination path '{self}' already exists.") 

310 

311 if transfer == "auto": 

312 transfer = self.transferDefault 

313 

314 timer_msg = "Transfer from %s to %s" 

315 timer_args = (src, self) 

316 

317 if isinstance(src, type(self)): 

318 # Looks like an S3 remote uri so we can use direct copy 

319 # note that boto3.resource.meta.copy is cleverer than the low 

320 # level copy_object 

321 with time_this(log, msg=timer_msg, args=timer_args): 

322 self._copy_from(src) 

323 

324 else: 

325 # Use local file and upload it 

326 with src.as_local() as local_uri: 

327 progress = ( 

328 ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:") 

329 if log.isEnabledFor(ProgressPercentage.log_level) 

330 else None 

331 ) 

332 with time_this(log, msg=timer_msg, args=timer_args): 

333 self._upload_file(local_uri, progress) 

334 

335 # This was an explicit move requested from a remote resource 

336 # try to remove that resource 

337 if transfer == "move": 

338 # Transactions do not work here 

339 src.remove() 

340 

341 @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time) 

342 def walk( 

343 self, file_filter: str | re.Pattern | None = None 

344 ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]: 

345 """Walk the directory tree returning matching files and directories. 

346 

347 Parameters 

348 ---------- 

349 file_filter : `str` or `re.Pattern`, optional 

350 Regex to filter out files from the list before it is returned. 

351 

352 Yields 

353 ------ 

354 dirpath : `ResourcePath` 

355 Current directory being examined. 

356 dirnames : `list` of `str` 

357 Names of subdirectories within dirpath. 

358 filenames : `list` of `str` 

359 Names of all the files within dirpath. 
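
        Examples
        --------
        A minimal sketch; the URI and filter are illustrative::

            root = ResourcePath("s3://some-bucket/data/")
            for dirpath, dirnames, filenames in root.walk(r"\.fits$"):
                ...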

360 """ 

361 # We pretend that S3 uses directories and files and not simply keys 

362 if not (self.isdir() or self.is_root): 

363 raise ValueError(f"Can not walk a non-directory URI: {self}") 

364 

365 if isinstance(file_filter, str): 365 ↛ 366line 365 didn't jump to line 366, because the condition on line 365 was never true

366 file_filter = re.compile(file_filter) 

367 

368 s3_paginator = self.client.get_paginator("list_objects_v2") 

369 

370 # Limit each query to a single "directory" to match os.walk 

371 # We could download all keys at once with no delimiter and work 

372 # it out locally but this could potentially lead to large memory 

373 # usage for millions of keys. It will also make the initial call 

374 # to this method potentially very slow. If making this method look 

375 # like os.walk was not required, we could query all keys with 

376 # pagination and return them in groups of 1000, but that would 

377 # be a different interface since we can't guarantee we would get 

378 # them all grouped properly across the 1000 limit boundary. 

379 prefix = self.relativeToPathRoot if not self.is_root else "" 

380 prefix_len = len(prefix) 

381 dirnames = [] 

382 filenames = [] 

383 files_there = False 

384 

385 for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"): 

386 # All results are returned as full key names and we must 

387 # convert them back to the root form. The prefix is fixed 

388 # and delimited so that is a simple trim 

389 

390 # Directories are reported in the CommonPrefixes result 

391 # which reports the entire key and must be stripped. 

392 found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())] 

393 dirnames.extend(found_dirs) 

394 

395 found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())] 

396 if found_files: 

397 files_there = True 

398 if file_filter is not None: 

399 found_files = [f for f in found_files if file_filter.search(f)] 

400 

401 filenames.extend(found_files) 

402 

403 # Directories do not exist so we can't test for them. If no files 

404 # or directories were found though, this means that it effectively 

405 # does not exist and we should match os.walk() behavior and return 

406 # immediately. 

407 if not dirnames and not files_there: 

408 return 

409 else: 

410 yield self, dirnames, filenames 

411 

412 for dir in dirnames: 

413 new_uri = self.join(dir) 

414 yield from new_uri.walk(file_filter) 

415 

416 @contextlib.contextmanager 

417 def _openImpl( 

418 self, 

419 mode: str = "r", 

420 *, 

421 encoding: str | None = None, 

422 ) -> Iterator[ResourceHandleProtocol]: 
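        """Open the S3 resource as a file-like handle.

        A minimal usage sketch via the public `open` wrapper; the URI is
        illustrative::

            with ResourcePath("s3://some-bucket/some/key.txt").open("r") as fh:
                text = fh.read()
        """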

423 with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle: 

424 if "b" in mode: 

425 yield handle 

426 else: 

427 if encoding is None: 

428 encoding = sys.getdefaultencoding() 

429 # cast because the protocol is compatible, but does not have 

430 # BytesIO in the inheritance tree 

431 with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub: 

432 yield sub