Coverage for python/lsst/resources/s3.py: 89% (191 statements)
coverage.py v6.5.0, created at 2023-02-04 02:38 -0800

# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading

__all__ = ("S3ResourcePath",)

from typing import IO, TYPE_CHECKING, Iterator, List, Optional, Tuple, Union, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsException,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: Optional[ResourcePath] = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
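

# Illustrative sketch, not part of the original module: boto3's transfer
# methods invoke the supplied Callback with the number of bytes moved in
# each chunk, so a ProgressPercentage instance can be passed directly. The
# bucket and key names below are hypothetical.
#
#     progress = ProgressPercentage(ResourcePath("file:///tmp/data.bin"))
#     getS3Client().upload_file("/tmp/data.bin", "some-bucket", "some/key", Callback=progress)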


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsException
        Raised if the `ClientError` looks like a 429 retry request.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message
        # but no direct way to access it without looking inside the
        # response.
        raise _TooManyRequestsException(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
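

# Illustrative note, assuming botocore's usual message format: a ClientError
# stringifies to something like "An error occurred (429) when calling the
# GetObject operation: ...", which is why the code above matches on the
# "(429)" substring instead of a structured status code.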


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists
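
    # Descriptive note, not in the original: backoff.on_exception retries the
    # decorated method with exponentially growing waits whenever one of the
    # listed exception types is raised, giving up once max_retry_time from
    # s3utils is exceeded. The same pattern guards most methods below.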

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again; the response is always
        # HTTP 204 regardless.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
            return body
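
    # Illustrative note, not in the original: the HTTP Range header is
    # inclusive, so read(size=10) sends "Range: bytes=0-9" and S3 returns
    # exactly the first ten bytes of the object.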

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)
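
    # Illustrative sketch with a hypothetical URI: write() issues a single
    # put_object call, so the whole payload is sent in one request:
    #
    #     ResourcePath("s3://some-bucket/hello.txt").write(b"hello world")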

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: Optional[ProgressPercentage]) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                progress = (
                    ProgressPercentage(self, msg="Downloading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                self._download_file(tmpFile, progress)
        return tmpFile.name, True
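
    # Illustrative sketch with a hypothetical URI and helper: callers normally
    # use the public as_local() context manager, which wraps _as_local and
    # removes the temporary file on exit:
    #
    #     with ResourcePath("s3://some-bucket/some/key.fits").as_local() as local:
    #         process(local.ospath)  # process() is hypothetical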

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: Optional[ProgressPercentage]) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, move, auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not make them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI, so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)

        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource,
        # so try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
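
    # Illustrative sketch with hypothetical URIs: an S3-to-S3 transfer uses
    # server-side copy_object via _copy_from, while a non-S3 source is staged
    # through a local temporary file and uploaded:
    #
    #     dest = ResourcePath("s3://some-bucket/dest/file.dat")
    #     dest.transfer_from(ResourcePath("s3://other-bucket/src/file.dat"), transfer="copy")
    #     dest.transfer_from(ResourcePath("file:///tmp/file.dat"), transfer="copy", overwrite=True)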

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree, returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It would also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist, so we can't test for them. If no files
        # or directories were found, though, this means that the path
        # effectively does not exist; we should match os.walk() behavior
        # and return immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
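
    # Illustrative sketch with a hypothetical bucket layout: walk() mirrors
    # os.walk, yielding one (dirpath, dirnames, filenames) triple per
    # "directory":
    #
    #     root = ResourcePath("s3://some-bucket/data/")
    #     for dirpath, dirnames, filenames in root.walk(file_filter=r"\.fits$"):
    #         for name in filenames:
    #             print(dirpath.join(name))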

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: Optional[str] = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # Cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub
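

# Illustrative sketch with a hypothetical URI: _openImpl backs the public
# ResourcePath.open() context manager, wrapping the raw byte handle in a
# TextIOWrapper for text modes:
#
#     with ResourcePath("s3://some-bucket/notes.txt").open("r") as fh:
#         text = fh.read()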