# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import logging
import re
import tempfile

__all__ = ('ButlerS3URI',)

from typing import (
    TYPE_CHECKING,
    Optional,
    Any,
    Callable,
    Iterator,
    List,
    Tuple,
    Union,
)

from .utils import NoTransaction
from ._butlerUri import ButlerURI
from .s3utils import getS3Client, s3CheckFileExists, bucketExists

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from ..datastore import DatastoreTransaction
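
# boto3 is imported above only for static type checking: it supplies the
# annotation for the ButlerS3URI.client property, while at runtime the
# client object is obtained through getS3Client().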

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff():
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
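
# With this stub, the decorator chain below degrades to a no-op:
# Backoff.on_exception(expo, ...) returns its first argument (expo), and
# applying expo as a decorator returns the wrapped method unchanged, so
# the code still runs (without retries) when "backoff" is not installed.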

# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders along with the retries built into Boto3, to account
# for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)
all_retryable_errors = retryable_client_errors + retryable_io_errors
max_retry_time = 60
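
# Illustrative use of these settings (matching the decorators below): with
# the real "backoff" package installed,
#
#     @backoff.on_exception(backoff.expo, all_retryable_errors,
#                           max_time=max_retry_time)
#     def method(self): ...
#
# retries the wrapped method with exponential backoff whenever one of the
# listed exceptions is raised, giving up after max_retry_time seconds.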


log = logging.getLogger(__name__)


class ButlerS3URI(ButlerURI):
    """S3 URI implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again; the response is
        # HTTP 204 every time
        self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc,
                                              Key=self.relativeToPathRoot,
                                              **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        body = response["Body"].read()
        response["Body"].close()
        return body
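
    # The size argument above maps to an inclusive HTTP Range header
    # ("bytes=0-(size-1)"), so a positive size fetches only the first
    # `size` bytes of the object instead of the whole body.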

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot,
                               Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        # don't create an S3 key when the root is at the top level of a bucket
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
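
    # S3 has no true directories: the put_object call above writes a
    # zero-length object whose key ends in "/" (guaranteed by the dirLike
    # check), which this class and most S3 tools treat as a directory marker.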

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, tmpFile)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, auto, move.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        log.debug(f"Transferring {src} [exists: {src.exists()}] -> "
                  f"{self} [exists: {self.exists()}] (transfer={transfer})")

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy;
            # note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
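            # (copy_object is a single server-side request; S3 caps such
            # copies at 5 GB, so larger objects would need a multipart-aware
            # managed copy such as boto3's resource.meta.copy.)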

        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                # resource.meta.upload_file seems like the right thing
                # but we have a low level client
                with open(local_uri.ospath, "rb") as fh:
                    self.client.put_object(Bucket=self.netloc,
                                           Key=self.relativeToPathRoot, Body=fh)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource
        if transfer == "move":
            # Transactions do not work here
            src.remove()

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(self, file_filter: Optional[Union[str, re.Pattern]] = None
             ) -> Iterator[Union[List, Tuple[ButlerURI, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ButlerURI`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator('list_objects_v2')

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but that could potentially lead to large memory
        # usage for millions of keys. It would also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited, so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means the URI effectively
        # does not exist and we should match os.walk() behavior and
        # yield [].
        if not dirnames and not files_there:
            yield []
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
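

# Illustrative walk() behaviour, assuming a hypothetical bucket laid out as
#
#     s3://demo-bucket/data/a.fits
#     s3://demo-bucket/data/sub/b.fits
#
# and a URI created through the ButlerURI factory:
#
#     root = ButlerURI("s3://demo-bucket/data/")
#     for dirpath, dirnames, filenames in root.walk():
#         print(dirpath, dirnames, filenames)
#
# would print something like
#
#     s3://demo-bucket/data/ ['sub/'] ['a.fits']
#     s3://demo-bucket/data/sub/ [] ['b.fits']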