Coverage for python/lsst/resources/s3.py: 92%
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import logging
import re
import tempfile
import threading

__all__ = ("S3ResourcePath",)

from http.client import HTTPException, ImproperConnectionState
from typing import TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this
from urllib3.exceptions import HTTPError, RequestError

from ._resourcePath import ResourcePath
from .s3utils import bucketExists, getS3Client, s3CheckFileExists

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
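
# Note (descriptive comment, not in the upstream source): with this stand-in,
# ``@backoff.on_exception(backoff.expo, ...)`` below degrades to a no-op
# decorator -- the wrapped method is returned unchanged -- so no client-side
# retries are attempted unless the real "backoff" package is installed.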

# Settings for "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
)

# ClientError can include NoSuchKey, so retrying may not be the right
# thing. This may require more consideration if it is to be used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into an easy package. For now client errors
# are not included.
all_retryable_errors = retryable_io_errors

# Maximum time in seconds that the backoff decorators will keep retrying.
max_retry_time = 60


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
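

# Usage note (illustrative, hypothetical names): instances of this class are
# passed to boto3 transfer calls through their ``Callback`` argument, which
# invokes the instance with the number of bytes transferred in each chunk,
# as is done by the methods further below, e.g.:
#
#     client.upload_file(local_path, bucket, key, Callback=ProgressPercentage(uri))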


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again; the response is always
        # HTTP 204 No Content.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket.
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                progress = (
                    ProgressPercentage(self, msg="Downloading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                try:
                    self.client.download_fileobj(
                        self.netloc, self.relativeToPathRoot, tmpFile, Callback=progress
                    )
                except (
                    ClientError,
                    self.client.exceptions.NoSuchKey,
                    self.client.exceptions.NoSuchBucket,
                ) as err:
                    raise FileNotFoundError(f"No such resource: {self}") from err
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the low
            # level copy_object.
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            with time_this(log, msg=timer_msg, args=timer_args):
                try:
                    self.client.copy_object(
                        CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot
                    )
                except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
                    raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self.client.upload_file(
                        local_uri.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
                    )

        # This was an explicit move requested from a remote resource;
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here
            src.remove()
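
    # Illustrative usage sketch (hypothetical bucket and file names), showing
    # an upload of a local file through the generic ResourcePath factory,
    # which dispatches "s3://" URIs to this class:
    #
    #     >>> dest = ResourcePath("s3://some-bucket/outputs/data.dat")
    #     >>> dest.transfer_from(ResourcePath("data.dat"), transfer="copy")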

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

            for dir in dirnames:
                new_uri = self.join(dir)
                yield from new_uri.walk(file_filter)
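

# Illustrative sketch (hypothetical bucket name and filter, assumes valid S3
# credentials) of walking a dir-like S3 URI through the ResourcePath factory,
# mirroring the os.walk-style interface described above:
#
#     >>> root = ResourcePath("s3://some-bucket/data/")
#     >>> for dirpath, dirnames, filenames in root.walk(r"\.yaml$"):
#     ...     print(dirpath, filenames)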