Coverage for python/lsst/resources/s3.py: 89% (184 statements)

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading
from collections.abc import Iterator
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
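
# Note: ProgressPercentage instances are passed as the ``Callback`` argument
# to the boto3 transfer calls below (``download_fileobj``/``upload_file``),
# which invoke the callable with the number of bytes transferred since the
# previous call.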

def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404 not-found response.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message, but there is
        # no direct way to access it without looking inside the response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""
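    # Instances are normally obtained via the ResourcePath factory, e.g.
    # ``ResourcePath("s3://bucket/file.txt")``, which dispatches on the URI
    # scheme, rather than by constructing this class directly.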

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()
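
    # The backoff decorators below retry the wrapped call with exponential
    # delays whenever one of the listed exception types is raised, giving up
    # once max_retry_time seconds have elapsed.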

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted without checking
        # all the keys again, since the response is always HTTP 204.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
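            # Note that the HTTP Range header is inclusive at both ends, so
            # the request above fetches exactly the first ``size`` bytes
            # (bytes 0 through size - 1).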

        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
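            # Note that put_object sends the payload in a single request;
            # unlike the managed transfer used by _upload_file it never
            # switches to a multipart upload.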

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Cannot create a 'directory' for file-like URI {self}")
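
        # S3 has no real directories; a dir-like path is emulated by
        # writing a zero-length object under the directory-like key.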

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
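            # delete=False: the temporary file must outlive this context
            # manager, since the caller is handed the path and owns cleanup.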

            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
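        # copy_object performs the copy entirely server-side, so the object
        # data never travels through this client. A single copy_object call
        # is limited by AWS to objects of at most 5 GB; larger objects
        # require a multipart copy.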

        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, move, auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """

        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """

        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means that it effectively
        # does not exist, and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
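                    # write_through=True forwards every write straight to the
                    # underlying handle instead of buffering it in the text
                    # wrapper.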

                    yield sub