Coverage for python/lsst/resources/s3.py: 92%

166 statements  

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import logging
import re
import tempfile
import threading

__all__ = ("S3ResourcePath",)

from http.client import HTTPException, ImproperConnectionState
from typing import TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this
from urllib3.exceptions import HTTPError, RequestError

from ._resourcePath import ResourcePath
from .s3utils import bucketExists, getS3Client, s3CheckFileExists

if TYPE_CHECKING:  # coverage: never executed at runtime; TYPE_CHECKING is false outside type checkers
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol


# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff


# Settings for the "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
)

# ClientError can include NoSuchKey, so retrying is not necessarily the
# right thing to do. This may require more consideration if it is to be
# used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into an easy package. For now client errors
# are not included.
all_retryable_errors = retryable_io_errors
max_retry_time = 60
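
# Illustrative sketch (not part of the original module): the decorators on
# the methods below consume these settings unchanged, using the standard
# backoff API. If client errors were ever deemed safe to retry, the
# combined tuple could be built like this:
#
#   all_retryable_errors = retryable_io_errors + retryable_client_errors
#
#   @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
#   def fetch_with_retries() -> bytes:  # hypothetical function name
#       ...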



log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
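
# Illustrative sketch (not part of the original module; the names are
# hypothetical): an instance is passed as the boto3 transfer ``Callback``,
# exactly as _as_local() and transfer_from() do below, and boto3 invokes it
# with the number of bytes transferred in each chunk:
#
#   progress = ProgressPercentage(local_file, file_for_msg=remote_uri, msg="Uploading:")
#   client.upload_file(local_file.ospath, "demo-bucket", "some/key", Callback=progress)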



class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()


    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz
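
    # Illustrative usage sketch (not part of the original module; the URI is
    # hypothetical). Both accessors above delegate the remote lookup to
    # s3utils.s3CheckFileExists:
    #
    #   uri = S3ResourcePath("s3://demo-bucket/file.bin")
    #   if uri.exists():
    #       n_bytes = uri.size()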


    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is HTTP 204 (No Content) every
        # time, whether or not the key existed.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size - 1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body
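
    # Illustrative sketch (not part of the original module; the URI is
    # hypothetical): a positive ``size`` becomes an inclusive HTTP Range
    # header, so read(size=1024) requests "bytes=0-1023":
    #
    #   uri = S3ResourcePath("s3://demo-bucket/data/header.fits")
    #   first_kb = uri.read(size=1024)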


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
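
    # Illustrative sketch (not part of the original module; the URI is
    # hypothetical): write() is a single put_object call, so ``data`` is a
    # complete in-memory payload rather than a stream:
    #
    #   uri = S3ResourcePath("s3://demo-bucket/notes.txt")
    #   uri.write(b"hello world", overwrite=True)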


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                progress = (
                    ProgressPercentage(self, msg="Downloading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                try:
                    self.client.download_fileobj(
                        self.netloc, self.relativeToPathRoot, tmpFile, Callback=progress
                    )
                except (
                    ClientError,
                    self.client.exceptions.NoSuchKey,
                    self.client.exceptions.NoSuchBucket,
                ) as err:
                    raise FileNotFoundError(f"No such resource: {self}") from err
        return tmpFile.name, True
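
    # Illustrative sketch (not part of the original module;
    # ``do_something`` is hypothetical): callers normally reach this
    # through the public as_local() context manager, as transfer_from()
    # does below, so the temporary file is cleaned up afterwards:
    #
    #   with uri.as_local() as local_uri:
    #       do_something(local_uri.ospath)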


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: "copy", "move", and "auto".
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """

        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not make them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")


        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use a direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            with time_this(log, msg=timer_msg, args=timer_args):
                try:
                    self.client.copy_object(
                        CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot
                    )
                except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
                    raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        else:
            # Use a local file and upload it
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self.client.upload_file(
                        local_uri.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
                    )

        # This was an explicit move requested from a remote resource,
        # so try to remove that resource.
        if transfer == "move":
            # Transactions do not work here
            src.remove()
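
    # Illustrative sketch (not part of the original module; the URIs are
    # hypothetical): an S3-to-S3 transfer takes the server-side copy_object
    # branch above, while any other source is staged locally and uploaded:
    #
    #   dest = S3ResourcePath("s3://demo-bucket/copy.dat")
    #   dest.transfer_from(S3ResourcePath("s3://demo-bucket/orig.dat"), transfer="copy")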


    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree, returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):  # coverage: branch never exercised by the tests
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It would also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist in S3 so we can't test for them. If no
        # files or directories were found, though, the path effectively
        # does not exist; match os.walk() behavior and return immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
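
# Illustrative sketch (not part of the original module; the bucket is
# hypothetical): walk() mirrors os.walk(), yielding one (dirpath, dirnames,
# filenames) triple per "directory", and a string filter is compiled to a
# regex internally:
#
#   root = S3ResourcePath("s3://demo-bucket/datasets/")
#   for dirpath, dirnames, filenames in root.walk(file_filter=r"\.fits$"):
#       for name in filenames:
#           print(dirpath.join(name))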