Coverage for python/lsst/resources/s3.py: 89%
188 statements
coverage.py v7.3.2, created at 2023-11-30 11:34 +0000
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

__all__ = ("S3ResourcePath",)

import contextlib
import io
import logging
import re
import sys
import tempfile
import threading
from collections.abc import Iterable, Iterator
from typing import IO, TYPE_CHECKING, cast

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
from ._resourcePath import ResourcePath
from .s3utils import (
    _TooManyRequestsError,
    all_retryable_errors,
    backoff,
    bucketExists,
    getS3Client,
    max_retry_time,
    retryable_io_errors,
    s3CheckFileExists,
)

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        import boto3

    from .utils import TransactionProtocol

log = logging.getLogger(__name__)


class ProgressPercentage:
    """Progress bar for S3 file uploads."""

    log_level = logging.DEBUG
    """Default log level to use when issuing a message."""

    def __init__(self, file: ResourcePath, file_for_msg: ResourcePath | None = None, msg: str = ""):
        self._filename = file
        self._file_for_msg = str(file_for_msg) if file_for_msg is not None else str(file)
        self._size = file.size()
        self._seen_so_far = 0
        self._lock = threading.Lock()
        self._msg = msg

    def __call__(self, bytes_amount: int) -> None:
        # To simplify, assume this is hooked up to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (100 * self._seen_so_far) // self._size
            log.log(
                self.log_level,
                "%s %s %s / %s (%s%%)",
                self._msg,
                self._file_for_msg,
                self._seen_so_far,
                self._size,
                percentage,
            )
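

# Usage sketch (illustrative, not part of the module API): an instance is
# passed as the ``Callback`` argument accepted by boto3 transfer methods
# such as ``upload_file`` and ``download_fileobj``, which call it with the
# number of bytes transferred in each chunk. The bucket and paths below
# are hypothetical.
#
#     src = ResourcePath("file:///tmp/data.bin")
#     progress = ProgressPercentage(src, msg="Uploading:")
#     getS3Client().upload_file("/tmp/data.bin", "some-bucket", "data.bin", Callback=progress)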


def _translate_client_error(err: ClientError) -> None:
    """Translate a ClientError into a specialist error if relevant.

    Parameters
    ----------
    err : `ClientError`
        Exception to translate.

    Raises
    ------
    _TooManyRequestsError
        Raised if the `ClientError` looks like a 429 retry request.
    FileNotFoundError
        Raised if the `ClientError` looks like a 404 "not found" response.
    """
    if "(429)" in str(err):
        # ClientError includes the error code in the message but provides
        # no direct way to access it without looking inside the response.
        raise _TooManyRequestsError(str(err)) from err
    elif "(404)" in str(err):
        # Some systems can generate this rather than NoSuchKey.
        raise FileNotFoundError(f"Resource not found: {err}") from err
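

# The check above is purely textual: botocore embeds the service error code
# in the exception message, e.g. "An error occurred (429) when calling the
# GetObject operation: ..." on servers that report bare HTTP status codes.
# A hedged sketch of how this pairs with the retry decorators used below
# (bucket and key names are hypothetical):
#
#     try:
#         getS3Client().get_object(Bucket="some-bucket", Key="some-key")
#     except ClientError as err:
#         _translate_client_error(err)  # may raise a specialist error
#         raise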


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @classmethod
    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
        # Force client to be created before creating threads.
        getS3Client()

        return super()._mexists(uris)

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant.
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing whether the file was actually deleted except by checking
        # all the keys again; the response is always HTTP 204 regardless.
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body
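
    # Example (hedged sketch; the bucket and key are hypothetical). A
    # positive ``size`` maps to an HTTP Range header of ``bytes=0-{size-1}``,
    # so reading the first kilobyte of an object looks like:
    #
    #     rp = ResourcePath("s3://some-bucket/some/key.dat")
    #     header = rp.read(size=1024)  # requests Range: bytes=0-1023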

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _download_file(self, local_file: IO, progress: ProgressPercentage | None) -> None:
        """Download the remote resource to a local file.

        Helper routine for _as_local to allow backoff without regenerating
        the temporary file.
        """
        try:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, local_file, Callback=progress)
        except (
            self.client.exceptions.NoSuchKey,
            self.client.exceptions.NoSuchBucket,
        ) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def _as_local(self) -> tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with (
            tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile,
            time_this(log, msg="Downloading %s to local file", args=(self,)),
        ):
            progress = (
                ProgressPercentage(self, msg="Downloading:")
                if log.isEnabledFor(ProgressPercentage.log_level)
                else None
            )
            self._download_file(tmpFile, progress)
        return tmpFile.name, True
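
    # Callers normally reach this through the public ``as_local`` context
    # manager on ``ResourcePath`` (as ``transfer_from`` does below), which
    # handles cleanup of the temporary file. A hedged usage sketch with an
    # illustrative URI and a hypothetical ``process`` function:
    #
    #     with ResourcePath("s3://some-bucket/key.dat").as_local() as local_uri:
    #         process(local_uri.ospath)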

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _upload_file(self, local_file: ResourcePath, progress: ProgressPercentage | None) -> None:
        """Upload a local file with backoff.

        Helper method to wrap file uploading in backoff for transfer_from.
        """
        try:
            self.client.upload_file(
                local_file.ospath, self.netloc, self.relativeToPathRoot, Callback=progress
            )
        except self.client.exceptions.NoSuchBucket as err:
            raise NotADirectoryError(f"Target does not exist: {err}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _copy_from(self, src: ResourcePath) -> None:
        copy_source = {
            "Bucket": src.netloc,
            "Key": src.relativeToPathRoot,
        }
        try:
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource to transfer: {self}") from err
        except ClientError as err:
            _translate_client_error(err)
            raise

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: copy, move, auto.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not make them unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short-circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            with time_this(log, msg=timer_msg, args=timer_args):
                self._copy_from(src)
        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                progress = (
                    ProgressPercentage(local_uri, file_for_msg=src, msg="Uploading:")
                    if log.isEnabledFor(ProgressPercentage.log_level)
                    else None
                )
                with time_this(log, msg=timer_msg, args=timer_args):
                    self._upload_file(local_uri, progress)

        # This was an explicit move requested from a remote resource,
        # so try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()
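
    # Hedged usage sketch (bucket names are illustrative). An S3-to-S3
    # transfer uses server-side copy_object; any other source is downloaded
    # to a temporary local file first and then uploaded:
    #
    #     dest = ResourcePath("s3://dest-bucket/data.fits")
    #     dest.transfer_from(ResourcePath("s3://src-bucket/data.fits"), transfer="copy")
    #     dest.transfer_from(ResourcePath("file:///tmp/data.fits"), transfer="copy", overwrite=True)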

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree, returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found, the path effectively does not exist,
        # so we should match os.walk() behavior and return immediately.
        if not dirnames and not files_there:
            return
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
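
    # Hedged usage sketch mirroring os.walk (the URI is illustrative; a
    # trailing slash makes the path dir-like):
    #
    #     root = ResourcePath("s3://some-bucket/data/")
    #     for dirpath, dirnames, filenames in root.walk(file_filter=r"\.fits$"):
    #         for name in filenames:
    #             print(dirpath.join(name))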

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        with S3ResourceHandle(mode, log, self.client, self.netloc, self.relativeToPathRoot) as handle:
            if "b" in mode:
                yield handle
            else:
                if encoding is None:
                    encoding = sys.getdefaultencoding()
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                with io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding, write_through=True) as sub:
                    yield sub
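
    # Callers reach this implementation through the public
    # ``ResourcePath.open`` context manager; a hedged sketch with an
    # illustrative URI:
    #
    #     with ResourcePath("s3://some-bucket/notes.txt").open("r") as fh:
    #         text = fh.read()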