Coverage for python/lsst/resources/s3.py: 73%
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import logging
import re
import tempfile

__all__ = ("S3ResourcePath",)

from http.client import HTTPException, ImproperConnectionState
from typing import TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, Union

from botocore.exceptions import ClientError
from lsst.utils.timer import time_this
from urllib3.exceptions import HTTPError, RequestError

from ._resourcePath import ResourcePath
from .s3utils import bucketExists, getS3Client, s3CheckFileExists

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from .utils import TransactionProtocol

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:

    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
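
# With the stubs above, backoff.on_exception(backoff.expo, errors, max_time=N)
# evaluates to backoff.expo, and backoff.expo(method) returns the method
# unchanged, so every retry decorator in this module degrades to a no-op
# pass-through when the optional "backoff" package is not installed.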

# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders on top of the retries built into Boto3, to account
# for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState,
    HTTPException,
    # urllib3.exceptions
    RequestError,
    HTTPError,
    # built-ins
    TimeoutError,
    ConnectionError,
)

# ClientError can indicate NoSuchKey, so a blanket retry may not be the
# right thing. This set needs more consideration before it is used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError,
)

# Combine all errors into an easy package. For now client errors
# are not included.
all_retryable_errors = retryable_io_errors
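
# Maximum total time (in seconds) that backoff.on_exception will keep
# retrying before giving up.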
max_retry_time = 60


log = logging.getLogger(__name__)


class S3ResourcePath(ResourcePath):
    """S3 URI resource path implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no way of
        # knowing if the file was actually deleted except for checking
        # all the keys again; the response is always HTTP 204 (No Content).
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
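        # A positive size maps to an HTTP Range request for the first
        # `size` bytes of the object ("bytes=0-<size-1>", inclusive).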
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc, Key=self.relativeToPathRoot, **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
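        # put_object unconditionally replaces any existing object, so the
        # check above is the only overwrite guard.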
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")
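
        # A dir-like key ends in "/"; a zero-length object stored under such
        # a key is the common S3 convention for a directory placeholder.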
        # Don't create an S3 key when the root is at the top level of a bucket.
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                self.client.download_fileobj(self.netloc, self.relativeToPathRoot, tmpFile)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the
            following options: copy, move.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time, so only make them when we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI, so we can use a direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the
            # low-level copy_object.
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            with time_this(log, msg=timer_msg, args=timer_args):
                self.client.copy_object(
                    CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot
                )
        else:
            # Use a local file and upload it.
            with src.as_local() as local_uri:
                # resource.meta.upload_file seems like the right thing,
                # but we have a low-level client.
                with time_this(log, msg=timer_msg, args=timer_args):
                    with open(local_uri.ospath, "rb") as fh:
                        self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot, Body=fh)

        # This was an explicit move requested from a remote resource,
        # so try to remove that resource.
        if transfer == "move":
            # Transactions do not work here.
            src.remove()

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys.
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator("list_objects_v2")

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally, but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk were not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000-key limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False
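
        # Delimiter="/" makes list_objects_v2 group keys below the next "/"
        # into CommonPrefixes, so each page describes a single "directory"
        # level rather than the full recursive key listing.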
        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited, so that is a simple trim.

            # Directories are reported in the CommonPrefixes result,
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist in S3 so we cannot test for them. If no
        # files or directories were found, though, this means the path
        # effectively does not exist and we should match os.walk() behavior
        # and yield [].
        if not dirnames and not files_there:
            yield []
        else:
            yield self, dirnames, filenames

            for dir in dirnames:
                new_uri = self.join(dir)
                yield from new_uri.walk(file_filter)