Coverage for python/lsst/daf/butler/core/_butlerUri/s3.py: 78%


142 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import logging
import re
import tempfile

__all__ = ('ButlerS3URI',)

from typing import (
    TYPE_CHECKING,
    Optional,
    Any,
    Callable,
    Iterator,
    List,
    Tuple,
    Union,
)

from ..utils import time_this
from .utils import NoTransaction
from ._butlerUri import ButlerURI
from .s3utils import getS3Client, s3CheckFileExists, bucketExists

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

if TYPE_CHECKING:  # coverage: branch never taken in tests
    try:
        import boto3
    except ImportError:
        pass
    from ..datastore import DatastoreTransaction

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
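
# A minimal sketch of how the fallback behaves (illustrative only; the
# function below is hypothetical and not part of this module). With the
# real "backoff" package the decorated callable is retried with
# exponentially growing waits; with the stub above, on_exception()
# returns its first positional argument, so the decorator chain reduces
# to the identity and the callable runs exactly once, with no retries:
#
#     @backoff.on_exception(backoff.expo, (ConnectionError,), max_time=60)
#     def fetch() -> bytes:
#         ...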

# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders along with the retries built into Boto3, to
# account for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)

# ClientError can include NoSuchKey so retry may not be the right
# thing. This may require more consideration if it is to be used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)

# Combine all errors into an easy package. For now client errors
# are not included.
all_retryable_errors = retryable_io_errors
max_retry_time = 60
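
# Illustrative application of these settings (probe() is hypothetical).
# Any of the I/O errors above raised by probe() triggers exponential
# retries for up to max_retry_time seconds; a botocore ClientError,
# being excluded from all_retryable_errors, propagates immediately:
#
#     @backoff.on_exception(backoff.expo, all_retryable_errors,
#                           max_time=max_retry_time)
#     def probe() -> bool:
#         ...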


log = logging.getLogger(__name__)


class ButlerS3URI(ButlerURI):
    """S3 URI implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:  # coverage: branch never taken in tests
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:  # coverage: branch never taken in tests
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing if the file was actually deleted except for checking
        # all the keys again; the response is HTTP 204 all the time
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
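
    # A consequence of the delete semantics noted above (sketch with a
    # hypothetical bucket): deleting a missing key still returns
    # HTTP 204, so remove() only raises for a missing bucket, not a
    # missing key:
    #
    #     ButlerURI("s3://some-bucket/no/such/key").remove()  # no error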

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc,
                                              Key=self.relativeToPathRoot,
                                              **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
        return body
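
    # A ranged read fetches only the leading bytes of the object via an
    # HTTP Range header; e.g. the first kilobyte of a hypothetical
    # object:
    #
    #     uri = ButlerURI("s3://some-bucket/data/file.fits")
    #     header_bytes = uri.read(size=1024)  # GET with Range: bytes=0-1023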

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():  # coverage: branch never taken in tests
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot,
                                   Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):  # coverage: branch never taken in tests
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:  # coverage: branch never taken in tests
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket
        if not self.path == "/":  # coverage: condition never false in tests
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
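
    # S3 has no real directories; mkdir() emulates one by writing a
    # zero-length object whose key ends in "/" (sketch with a
    # hypothetical bucket):
    #
    #     d = ButlerURI("s3://some-bucket/subdir/", forceDirectory=True)
    #     d.mkdir()  # puts an empty object with key "subdir/"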

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                self.client.download_fileobj(self.netloc, self.relativeToPathRoot, tmpFile)
        return tmpFile.name, True
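
    # Callers normally reach this through the base-class as_local()
    # context manager (used by transfer_from below), which removes the
    # temporary file on exit; the URI here is hypothetical:
    #
    #     with ButlerURI("s3://some-bucket/data/file.fits").as_local() as local:
    #         do_something(local.ospath)  # do_something() is a stand-in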

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):  # coverage: branch never taken in tests
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":  # coverage: branch never taken in tests
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            with time_this(log, msg=timer_msg, args=timer_args):
                self.client.copy_object(CopySource=copy_source, Bucket=self.netloc,
                                        Key=self.relativeToPathRoot)
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                # resource.meta.upload_file seems like the right thing
                # but we have a low-level client
                with time_this(log, msg=timer_msg, args=timer_args):
                    with open(local_uri.ospath, "rb") as fh:
                        self.client.put_object(Bucket=self.netloc,
                                               Key=self.relativeToPathRoot, Body=fh)

        # This was an explicit move requested from a remote resource
        # so try to remove that resource
        if transfer == "move":  # coverage: branch never taken in tests
            # Transactions do not work here
            src.remove()
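
    # Usage sketch (hypothetical URIs): upload a local file through the
    # generic transfer API:
    #
    #     src = ButlerURI("/tmp/catalog.fits")
    #     dst = ButlerURI("s3://some-bucket/catalogs/catalog.fits")
    #     dst.transfer_from(src, transfer="copy")
    #
    # When src is itself an s3:// URI the data is copied server-side
    # with copy_object and never passes through this client.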

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(self, file_filter: Optional[Union[str, re.Pattern]] = None
             ) -> Iterator[Union[List, Tuple[ButlerURI, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ButlerURI`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys
        if not (self.isdir() or self.is_root):  # coverage: branch never taken in tests
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):  # coverage: branch never taken in tests
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator('list_objects_v2')

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # [].
        if not dirnames and not files_there:  # coverage: branch never taken in tests
            yield []
        else:
            yield self, dirnames, filenames

            for dir in dirnames:
                new_uri = self.join(dir)
                yield from new_uri.walk(file_filter)
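
# Usage sketch (hypothetical bucket): recursively list FITS files,
# mirroring the os.walk() calling convention:
#
#     root = ButlerURI("s3://some-bucket/repo/", forceDirectory=True)
#     for dirpath, dirnames, filenames in root.walk(r"\.fits$"):
#         for name in filenames:
#             print(dirpath.join(name))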