Coverage for python/lsst/daf/butler/core/_butlerUri/s3.py : 79%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import logging
import re
import tempfile

__all__ = ('ButlerS3URI',)

from typing import (
    TYPE_CHECKING,
    Optional,
    Any,
    Callable,
    Iterator,
    List,
    Tuple,
    Union,
)

from .utils import NoTransaction
from ._butlerUri import ButlerURI
from .s3utils import getS3Client, s3CheckFileExists, bucketExists

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from ..datastore import DatastoreTransaction

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff:
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
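
# With the stub above, the retry decorators become no-ops when ``backoff``
# is not installed: ``backoff.on_exception(backoff.expo, errs, max_time=60)``
# invokes ``Backoff.on_exception`` with ``Backoff.expo`` as ``func`` and
# returns it, and applying ``Backoff.expo`` to the decorated method returns
# the method unchanged, so everything below still imports and runs, just
# without retries.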

# Settings for the "backoff" retry decorators. These retries are
# belt-and-suspenders along with the retries built into Boto3, to
# account for semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)
all_retryable_errors = retryable_client_errors + retryable_io_errors
max_retry_time = 60


log = logging.getLogger(__name__)


class ButlerS3URI(ButlerURI):
    """S3 URI implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz
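
    # Illustrative usage (a sketch; the bucket and key are hypothetical and
    # assume configured S3 credentials):
    #
    #     uri = ButlerURI("s3://my-bucket/data/file.dat")
    #     if uri.exists():
    #         n_bytes = uri.size()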

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again; the response is always
        # HTTP 204 (No Content)
        self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc,
                                              Key=self.relativeToPathRoot,
                                              **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        body = response["Body"].read()
        response["Body"].close()
        return body
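
    # A partial read maps onto an HTTP Range request. Illustrative usage
    # (hypothetical key; assumes the object exists):
    #
    #     uri = ButlerURI("s3://my-bucket/data/file.dat")
    #     head = uri.read(size=1024)  # GET with "Range: bytes=0-1023"
    #     everything = uri.read()     # full object, no Range header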

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot,
                               Body=data)
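
    # Illustrative usage (hypothetical key). Note that exists() and
    # put_object() are separate S3 calls, so overwrite=False is a
    # best-effort guard rather than an atomic check.
    #
    #     uri = ButlerURI("s3://my-bucket/data/new.dat")
    #     uri.write(b"payload", overwrite=False)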

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
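
    # Illustrative usage (a sketch; bucket and prefix are hypothetical):
    # a trailing slash makes the URI dir-like, and mkdir() then writes a
    # zero-byte "data/subdir/" key so the directory shows up in listings.
    #
    #     ButlerURI("s3://my-bucket/data/subdir/").mkdir()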

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, tmpFile)
        return tmpFile.name, True
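
    # Callers normally reach this through the public as_local() context
    # manager, as transfer_from() does below; a sketch with a hypothetical
    # key:
    #
    #     with ButlerURI("s3://my-bucket/data/file.dat").as_local() as local:
    #         process(local.ospath)  # temporary copy, removed on exit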

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        log.debug(f"Transferring {src} [exists: {src.exists()}] -> "
                  f"{self} [exists: {self.exists()}] (transfer={transfer})")

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Looks like an S3 remote uri so we can use direct copy
            # note that boto3.resource.meta.copy is cleverer than the low
            # level copy_object
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:

                # resource.meta.upload_file seems like the right thing
                # but we have a low level client
                with open(local_uri.ospath, "rb") as fh:
                    self.client.put_object(Bucket=self.netloc,
                                           Key=self.relativeToPathRoot, Body=fh)

        # This was an explicit move requested from a remote resource
        # try to remove that resource
        if transfer == "move":
            # Transactions do not work here
            src.remove()
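
    # Illustrative usage (hypothetical URIs). With an S3 source the copy is
    # performed server-side via copy_object; otherwise the source is
    # materialized locally with as_local() and re-uploaded.
    #
    #     dest = ButlerURI("s3://my-bucket/copies/file.dat")
    #     dest.transfer_from(ButlerURI("s3://other-bucket/file.dat"),
    #                        transfer="copy")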

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(self, file_filter: Optional[Union[str, re.Pattern]] = None
             ) -> Iterator[Union[List, Tuple[ButlerURI, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex used to filter the returned files; only matching file
            names are kept.

        Yields
        ------
        dirpath : `ButlerURI`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator('list_objects_v2')

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # [].
        if not dirnames and not files_there:
            yield []
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
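
    # Illustrative traversal (hypothetical bucket/prefix), mirroring
    # os.walk; note an empty root yields a bare [] rather than a 3-tuple.
    #
    #     root = ButlerURI("s3://my-bucket/data/")
    #     for entry in root.walk(r"\.fits$"):
    #         if entry:
    #             dirpath, dirnames, filenames = entry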