Coverage for python/lsst/resources/s3.py: 73% (141 statements)

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import logging
import re
import tempfile

__all__ = ("S3ResourcePath",)

from http.client import HTTPException, ImproperConnectionState
from typing import TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this
from urllib3.exceptions import HTTPError, RequestError

from ._resourcePath import ResourcePath
from .s3utils import bucketExists, getS3Client, s3CheckFileExists

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
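
# Note: with the fallback stub above, @backoff.on_exception(backoff.expo, ...)
# evaluates to Backoff.expo, which returns the decorated method unchanged,
# so the retry decorators below are no-ops unless the optional "backoff"
# package is installed.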

# Settings for the "backoff" retry decorators. These retries are belt-and-
# suspenders on top of the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
)

# ClientError can include NoSuchKey, so retrying may not be the right
# thing. This may require more consideration if it is to be used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into an easy package. For now client errors
# are not included.
all_retryable_errors = retryable_io_errors
max_retry_time = 60  # seconds (passed as backoff's max_time)


log = logging.getLogger(__name__)


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists
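
    # A minimal usage sketch (hypothetical bucket and key; instances are
    # normally obtained through the ResourcePath factory rather than
    # constructed directly):
    #
    #     uri = ResourcePath("s3://some-bucket/some/key.txt")
    #     if uri.exists():
    #         n_bytes = uri.size()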


    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again; the response is always
        # HTTP 204 regardless.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
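
    # Sketch (continuing the hypothetical example above): a ranged read of
    # the first 100 bytes, then a write that refuses to clobber:
    #
    #     header = uri.read(size=100)
    #     uri.write(b"new content", overwrite=False)  # FileExistsError if it exists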

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                self.client.download_fileobj(self.netloc, self.relativeToPathRoot, tmpFile)
        return tmpFile.name, True
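
    # Sketch: callers normally go through the public as_local() context
    # manager (as transfer_from does below), which cleans up the temporary
    # file on exit:
    #
    #     with uri.as_local() as local_uri:
    #         process(local_uri.ospath)  # "process" is a hypothetical callback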

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer a source resource to this S3 resource.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy;
            # note that boto3.resource.meta.copy is cleverer than the low
            # level copy_object
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            with time_this(log, msg=timer_msg, args=timer_args):
                self.client.copy_object(
                    CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot
                )
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                # resource.meta.upload_file seems like the right thing
                # but we have a low level client
                with time_this(log, msg=timer_msg, args=timer_args):
                    with open(local_uri.ospath, "rb") as fh:
                        self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=fh)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource
        if transfer == "move":
            # Transactions do not work here
            src.remove()
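
    # Sketch (hypothetical URIs): uploading a local file into S3 by copy.
    #
    #     dest = ResourcePath("s3://some-bucket/data/file.txt")
    #     dest.transfer_from(ResourcePath("file:///tmp/file.txt"), transfer="copy")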

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter the returned files; only names matching the
            pattern are kept.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # [].
        if not dirnames and not files_there:
            yield []
        else:
            yield self, dirnames, filenames

            for dir in dirnames:
                new_uri = self.join(dir)
                yield from new_uri.walk(file_filter)
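
# Sketch (hypothetical bucket): walking a directory-like prefix and keeping
# only FITS files; the trailing slash marks the URI as directory-like. As
# noted above, an effectively empty "directory" yields a bare [] instead of
# a (dirpath, dirnames, filenames) tuple, so guard before unpacking:
#
#     root = ResourcePath("s3://some-bucket/raw/")
#     for entry in root.walk(file_filter=r"\.fits$"):
#         if entry:
#             dirpath, dirnames, filenames = entry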