Coverage for python/lsst/resources/s3.py: 89% (184 statements)
coverage.py v7.3.0, created at 2023-08-31 09:33 +0000
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading
from collections.abc import Iterator
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol


log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
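

# Note: boto3's upload/download helpers invoke the supplied Callback with
# the number of bytes transferred in each chunk, so a ProgressPercentage
# instance accumulates a running total. A minimal sketch of standalone use
# (the local path is hypothetical):
#
#     progress = ProgressPercentage(ResourcePath("file:///tmp/example.dat"), msg="Uploading:")
#     progress(1024)  # logs "Uploading: ... 1024 / <size> (N%)" at DEBUG level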


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404 not found error.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message but provides
        # no direct way to access it without looking inside the response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
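

# botocore renders ClientError messages as "An error occurred (<code>) when
# calling the <operation> operation: ...", so the string matching above
# assumes an S3-compatible service that reports the numeric HTTP status
# ("429", "404") as its error code.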


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies.
        return getS3Client()
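
    # The methods below are wrapped in backoff.on_exception so that
    # retryable failures (throttling, transient connection errors) are
    # retried with exponential backoff for up to max_retry_time seconds;
    # see s3utils for the definitions of the retryable error tuples.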

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is always HTTP 204 (No Content).
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size - 1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
            response["Body"].close()
            return body
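
    # A sketch of a ranged read, assuming a hypothetical bucket and key:
    #
    #     uri = ResourcePath("s3://some-bucket/data/catalog.fits")
    #     header = uri.read(size=2880)  # GET with "Range: bytes=0-2879"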

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a
        # bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
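
    # S3 has no real directories; mkdir() emulates one by writing a
    # zero-length object under the dir-like key (which ends in "/"), which
    # is sufficient for the existence checks used elsewhere in this class.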

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True
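
    # The delete=False temporary file outlives the context manager above;
    # cleanup is expected to happen in the caller, normally via the
    # ResourcePath.as_local context manager that wraps _as_local.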

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
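
    # copy_object performs the copy entirely server-side, so no object
    # data flows through this process. Note that the low-level CopyObject
    # API is limited to 5 GB per request; boto3's managed copy transfers
    # (mentioned in transfer_from below) switch to multipart copies for
    # larger objects.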

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: copy, move, and auto (which selects this
            scheme's default transfer mode).
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not make them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)
        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource:
        # try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
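
    # A sketch of typical use, with hypothetical bucket names and paths:
    #
    #     dest = ResourcePath("s3://dest-bucket/data/file.dat")
    #     dest.transfer_from(ResourcePath("file:///tmp/file.dat"), transfer="copy")
    #
    # When src is itself an S3ResourcePath the copy stays server-side via
    # _copy_from; any other scheme is materialized locally and uploaded.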

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It would also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, though, this means that the path
        # effectively does not exist and we should match os.walk() behavior
        # and return immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
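
    # A sketch mirroring os.walk, with a hypothetical bucket layout:
    #
    #     root = ResourcePath("s3://some-bucket/datasets/")
    #     for dirpath, dirnames, filenames in root.walk(r"\.fits$"):
    #         for name in filenames:
    #             print(dirpath.join(name))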

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # Cast because the protocol is compatible but does not have
                # BytesIO in the inheritance tree.
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub
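

# _openImpl is reached through ResourcePath.open(); a sketch with a
# hypothetical bucket and key:
#
#     uri = ResourcePath("s3://some-bucket/notes.txt")
#     with uri.open("r") as fh:
#         text = fh.read()
#
# Binary modes yield the S3ResourceHandle directly; text modes wrap it in
# an io.TextIOWrapper with write_through=True so writes reach the handle
# immediately rather than sitting in the wrapper's buffer.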