# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading

__all__ = ("S3ResourcePath",)

from typing import IO, TYPE_CHECKING, Iterator, List, Optional, Tuple, Union, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsException,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
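
# Illustrative sketch (not part of this module): boto3 transfer calls accept
# a ``Callback`` that is invoked with the number of bytes moved in each chunk,
# which is how the transfer methods below wire this class up. The bucket and
# key names here are hypothetical:
#
#     uri = ResourcePath("s3://some-bucket/some/key.dat")
#     progress = ProgressPercentage(uri, msg="Uploading:")
#     getS3Client().upload_file("/tmp/key.dat", uri.netloc,
#                               uri.relativeToPathRoot, Callback=progress)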

def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsException
        Raised if the `ClientError` looks like a 429 retry request.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message, but there is
        # no direct way to access it without looking inside the response.
        raise _TooManyRequestsException(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err

class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz
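
    # Illustrative sketch (hypothetical URI): both checks are retried
    # automatically on transient failures via the ``backoff`` decorators
    # above:
    #
    #     uri = ResourcePath("s3://some-bucket/data/file.dat")
    #     if uri.exists():
    #         n_bytes = uri.size()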

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is always HTTP 204, whether or
        # not the key existed.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body
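
    # Illustrative sketch (hypothetical URI): a positive ``size`` turns the
    # GET into an HTTP Range request, so only the first ``size`` bytes are
    # transferred from S3:
    #
    #     uri = ResourcePath("s3://some-bucket/data/file.dat")
    #     header = uri.read(size=1024)  # bytes 0-1023 only
    #     everything = uri.read()       # the whole object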

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
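
    # Illustrative sketch (hypothetical URI): note that the existence check
    # guarding ``overwrite=False`` is a separate request and is not atomic
    # with the subsequent PUT:
    #
    #     uri = ResourcePath("s3://some-bucket/data/new.txt")
    #     uri.write(b"some bytes", overwrite=False)  # FileExistsError if present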

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: Optional[ProgressPercentage]) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                progress = (
                    ProgressPercentage(self, msg="Downloading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                self._download_file(tmpFile, progress)
        return tmpFile.name, True
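
    # Illustrative sketch: callers normally go through the ``as_local``
    # context manager inherited from `ResourcePath` (as ``transfer_from``
    # does below), which cleans up the temporary file on exit. The URI and
    # callable here are hypothetical:
    #
    #     uri = ResourcePath("s3://some-bucket/data/file.dat")
    #     with uri.as_local() as local_uri:
    #         process(local_uri.ospath)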

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: Optional[ProgressPercentage]) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: copy, move, auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
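
    # Illustrative sketch (hypothetical URIs): copying a local file into S3,
    # then moving the object to another bucket:
    #
    #     dest = ResourcePath("s3://some-bucket/data/file.dat")
    #     dest.transfer_from(ResourcePath("file:///tmp/file.dat"), transfer="copy")
    #
    #     other = ResourcePath("s3://other-bucket/data/file.dat")
    #     other.transfer_from(dest, transfer="move")  # removes the source key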

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
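
    # Illustrative sketch (hypothetical URI): ``walk`` mirrors ``os.walk``,
    # yielding one (dirpath, dirnames, filenames) triple per "directory":
    #
    #     root = ResourcePath("s3://some-bucket/data/")
    #     for dirpath, dirnames, filenames in root.walk(r"\.fits$"):
    #         for name in filenames:
    #             print(dirpath.join(name))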

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: Optional[str] = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub
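
    # Illustrative sketch: callers reach this through the ``open`` context
    # manager inherited from `ResourcePath`; text modes wrap the binary S3
    # handle in a TextIOWrapper as above. The URI here is hypothetical:
    #
    #     uri = ResourcePath("s3://some-bucket/data/table.csv")
    #     with uri.open("r", encoding="utf-8") as fh:
    #         first_line = fh.readline()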