Coverage for python/lsst/resources/s3.py: 86%
197 statements
coverage.py v6.4.2, created at 2022-08-01 01:03 -0700

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import logging
import re
import tempfile
import threading

__all__ = ("S3ResourcePath",)

from http.client import HTTPException, ImproperConnectionState
from types import ModuleType
from typing import IO, TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this
from urllib3.exceptions import HTTPError, RequestError

from ._resourcePath import ResourcePath
from .s3utils import bucketExists, getS3Client, s3CheckFileExists

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = cast(ModuleType, Backoff)
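
# Illustrative sketch (not part of the upstream API): with the no-op
# fallback above, ``backoff.on_exception(backoff.expo, errors, max_time=60)``
# evaluates to ``Backoff.expo``, and applying that as a decorator returns
# the wrapped function unchanged, so the decorated methods below behave
# identically whether or not ``backoff`` is installed:
#
#     @backoff.on_exception(backoff.expo, (ConnectionError,), max_time=60)
#     def fetch() -> bytes:  # hypothetical function
#         ...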


class _TooManyRequestsException(Exception):
    """Private exception that can be used for 429 retry.

    botocore refuses to handle a 429 error itself and so issues a generic
    ClientError.
    """

    pass


# Settings for "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
    # private
    _TooManyRequestsException,
)

# ClientError can include NoSuchKey, so retrying may not be the right
# thing to do. This may require more consideration if it is to be used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into an easy package. For now, client errors
# are not included.
all_retryable_errors = retryable_io_errors
max_retry_time = 60


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
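
# Illustrative usage (bucket and key are hypothetical): boto3 transfer
# calls accept a ``Callback`` that is invoked with the byte count of each
# transferred chunk, so an instance can be passed directly, as done in
# ``_download_file`` and ``_upload_file`` below:
#
#     progress = ProgressPercentage(local_path, msg="Uploading:")
#     client.upload_file(local_path.ospath, "bucket", "key", Callback=progress)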


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a more specific error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsException
        Raised if the `ClientError` looks like a 429 retry request.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message, but there
        # is no direct way to access it without looking inside the
        # response.
        raise _TooManyRequestsException(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
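
# For reference (an assumption about botocore's message format, not an
# API guarantee): the stringified ClientError looks roughly like
# "An error occurred (429) when calling the GetObject operation: ...",
# which is why the status code can be matched in parentheses above.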


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing if the file was actually deleted except for checking
        # all the keys again; the response is HTTP 204 every time.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body
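
    # Illustrative call (bucket and key hypothetical): a bounded read is
    # issued as a ranged GET, so only the first ``size`` bytes travel
    # over the network:
    #
    #     first_kb = ResourcePath("s3://bucket/data.bin").read(size=1024)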

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: Optional[ProgressPercentage]) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                progress = (
                    ProgressPercentage(self, msg="Downloading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                self._download_file(tmpFile, progress)
        return tmpFile.name, True
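
    # Illustrative usage (URI hypothetical): callers normally go through
    # the public ``as_local`` context manager on ``ResourcePath``, which
    # delegates to ``_as_local`` and cleans up the temporary file on exit:
    #
    #     with ResourcePath("s3://bucket/catalog.fits").as_local() as local:
    #         process(local.ospath)  # hypothetical helper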

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: Optional[ProgressPercentage]) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not make them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI, so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)
        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource,
        # so try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
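
    # Illustrative usage (URIs hypothetical): a copy between two S3
    # locations takes the server-side ``_copy_from`` branch above, while
    # a local source takes the upload branch:
    #
    #     dest = ResourcePath("s3://bucket/processed/image.fits")
    #     dest.transfer_from(ResourcePath("s3://bucket/raw/image.fits"), transfer="copy")
    #     dest.transfer_from(ResourcePath("file:///tmp/image.fits"), transfer="copy", overwrite=True)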

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It would also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited, so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means that the URI
        # effectively does not exist, and we should match os.walk()
        # behavior and return immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
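
# Illustrative usage (URI and filter hypothetical): mirroring os.walk,
# each yield is a (dirpath, dirnames, filenames) triple, with
# subdirectories walked recursively:
#
#     for dirpath, dirnames, filenames in ResourcePath("s3://bucket/data/").walk(r"\.fits$"):
#         for name in filenames:
#             print(dirpath.join(name))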