Coverage for python/lsst/resources/s3.py: 86% of 196 statements
coverage.py v6.4, created at 2022-05-25 10:21 +0000
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.
from __future__ import annotations

import logging
import re
import tempfile
import threading

__all__ = ("S3ResourcePath",)

from http.client import HTTPException, ImproperConnectionState
from typing import IO, TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this
from urllib3.exceptions import HTTPError, RequestError

from ._resourcePath import ResourcePath
from .s3utils import bucketExists, getS3Client, s3CheckFileExists
if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol
# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    # Provide no-op stand-ins so that the retry decorators below become
    # pass-throughs when the optional backoff package is not installed.
    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
class _TooManyRequestsException(Exception):
    """Private exception that can be used for 429 retry.

    botocore refuses to deal with a 429 error itself so it issues a
    generic ClientError.
    """

    pass
# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders along with the retries built into Boto3, to
# account for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
    # private
    _TooManyRequestsException,
)

# A ClientError can include NoSuchKey so retry may not be the right
# thing. This may require more consideration if it is to be used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into an easy package. For now client errors
# are not included.
all_retryable_errors = retryable_io_errors
max_retry_time = 60
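
# For illustration, a minimal sketch of how these settings are applied to
# the methods below ("_fetch" is a hypothetical name): the decorator retries
# the wrapped call on any of the listed errors, sleeping for exponentially
# increasing intervals until max_retry_time seconds have elapsed.
#
#     @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
#     def _fetch() -> bytes:
#         ...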
log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
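
# For illustration, a minimal sketch of how this callback is wired into a
# boto3 transfer (the local path, bucket, and key are hypothetical). boto3
# invokes the callback with the byte count of each completed chunk:
#
#     local_uri = ResourcePath("file:///tmp/data.bin")
#     progress = ProgressPercentage(local_uri, msg="Uploading:")
#     getS3Client().upload_file(local_uri.ospath, "my-bucket", "some/key", Callback=progress)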
def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsException
        Raised if the `ClientError` looks like a 429 retry request.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message
        # but no direct way to access it without looking inside the
        # response.
        raise _TooManyRequestsException(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing if the file was actually deleted except by checking
        # all the keys again; the response is HTTP 204 every time.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size - 1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body
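
    # For illustration, a minimal sketch of a ranged read (the URI is
    # hypothetical); passing ``size`` issues an S3 Range request for the
    # first ``size`` bytes only:
    #
    #     uri = ResourcePath("s3://my-bucket/data/table.csv")
    #     header = uri.read(size=1024)  # first 1 KiB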
    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
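
    # For illustration (hypothetical URI): write bytes directly to a key,
    # refusing to clobber an existing object.
    #
    #     uri = ResourcePath("s3://my-bucket/output/result.json")
    #     uri.write(b'{"ok": true}', overwrite=False)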
    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket.
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: Optional[ProgressPercentage]) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                progress = (
                    ProgressPercentage(self, msg="Downloading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                self._download_file(tmpFile, progress)
        return tmpFile.name, True
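
    # For illustration, a minimal sketch using the public as_local context
    # manager from ResourcePath, which wraps this method (the URI and the
    # process helper are hypothetical); the temporary file is cleaned up
    # when the block exits:
    #
    #     with ResourcePath("s3://my-bucket/data.fits").as_local() as local:
    #         process(local.ospath)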
    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: Optional[ProgressPercentage]) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: "copy", "move", and "auto".
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource,
        # so try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
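
    # For illustration, a minimal sketch of an upload via transfer_from
    # (both URIs are hypothetical):
    #
    #     dest = ResourcePath("s3://my-bucket/archive/file.dat")
    #     dest.transfer_from(ResourcePath("file:///tmp/file.dat"), transfer="copy")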
    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means that it effectively
        # does not exist, and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
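
    # For illustration, a minimal sketch of walking a "directory" tree
    # (the bucket and filter are hypothetical); file_filter may be a string
    # regex, compiled above, or an already-compiled pattern:
    #
    #     root = ResourcePath("s3://my-bucket/raw/", forceDirectory=True)
    #     for dirpath, dirnames, filenames in root.walk(file_filter=r"\.fits$"):
    #         for name in filenames:
    #             print(dirpath.join(name))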