Coverage for python/lsst/daf/butler/core/_butlerUri/s3.py: 78%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import logging
import re
import tempfile

__all__ = ('ButlerS3URI',)

from typing import (
    TYPE_CHECKING,
    Optional,
    Any,
    Callable,
    Iterator,
    List,
    Tuple,
    Union,
)

from lsst.utils.timer import time_this
from .utils import NoTransaction
from ._butlerUri import ButlerURI
from .s3utils import getS3Client, s3CheckFileExists, bucketExists

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

if TYPE_CHECKING:
    try:
        import boto3
    except ImportError:
        pass
    from ..datastore import DatastoreTransaction

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff():
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
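
# With this stand-in, the ``@backoff.on_exception(...)`` decorators below
# simply return the wrapped method unchanged, so the extra retry layer is
# only active when the optional "backoff" package is installed.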

# Settings for the "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)

# ClientError can include NoSuchKey, so retrying may not be the right
# thing. This may require more consideration if it is to be used.
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)

# Combine all errors into an easy package. For now client errors
# are not included.
all_retryable_errors = retryable_io_errors
max_retry_time = 60


log = logging.getLogger(__name__)


class ButlerS3URI(ButlerURI):
    """S3 URI implementation class."""

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        return getS3Client()

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def exists(self) -> bool:
        """Check that the S3 resource exists."""
        if self.is_root:
            # Only check for the bucket since the path is irrelevant
            return bucketExists(self.netloc)
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def size(self) -> int:
        """Return the size of the resource in bytes."""
        if self.dirLike:
            return 0
        exists, sz = s3CheckFileExists(self, client=self.client)
        if not exists:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return sz

    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
    def remove(self) -> None:
        """Remove the resource."""
        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again; the response is always
        # HTTP 204
        try:
            self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource."""
        args = {}
        if size > 0:
            args["Range"] = f"bytes=0-{size-1}"
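            # For example, size=100 produces "Range: bytes=0-99"; the HTTP
            # byte range is inclusive, so this requests exactly 100 bytes.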
        try:
            response = self.client.get_object(Bucket=self.netloc,
                                              Key=self.relativeToPathRoot,
                                              **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        with time_this(log, msg="Read from %s", args=(self,)):
            body = response["Body"].read()
        response["Body"].close()
        return body

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the resource."""
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to %s", args=(self,)):
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot,
                                   Body=data)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def mkdir(self) -> None:
        """Write a directory key to S3."""
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        # Don't create an S3 key when the root is at the top level of a bucket
        if not self.path == "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            with time_this(log, msg="Downloading %s to local file", args=(self,)):
                self.client.download_fileobj(self.netloc, self.relativeToPathRoot, tmpFile)
        return tmpFile.name, True

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            Currently unused.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        timer_msg = "Transfer from %s to %s"
        timer_args = (src, self)

        if isinstance(src, type(self)):
            # Looks like an S3 remote URI so we can use direct copy.
            # Note that boto3.resource.meta.copy is cleverer than the low
            # level copy_object.
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            with time_this(log, msg=timer_msg, args=timer_args):
                self.client.copy_object(CopySource=copy_source, Bucket=self.netloc,
                                        Key=self.relativeToPathRoot)
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:

                # resource.meta.upload_file seems like the right thing
                # but we have a low level client
                with time_this(log, msg=timer_msg, args=timer_args):
                    with open(local_uri.ospath, "rb") as fh:
                        self.client.put_object(Bucket=self.netloc,
                                               Key=self.relativeToPathRoot, Body=fh)

        # This was an explicit move requested from a remote resource;
        # try to remove that resource
        if transfer == "move":
            # Transactions do not work here
            src.remove()
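
    # Usage sketch (the bucket, key, and local path below are illustrative):
    #
    #     dest = ButlerURI("s3://some-bucket/repo/data.fits")
    #     dest.transfer_from(ButlerURI("file:///tmp/data.fits"), transfer="copy")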

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def walk(self, file_filter: Optional[Union[str, re.Pattern]] = None
             ) -> Iterator[Union[List, Tuple[ButlerURI, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ButlerURI`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.
        """
        # We pretend that S3 uses directories and files and not simply keys
        if not (self.isdir() or self.is_root):
            raise ValueError(f"Can not walk a non-directory URI: {self}")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        s3_paginator = self.client.get_paginator('list_objects_v2')

        # Limit each query to a single "directory" to match os.walk.
        # We could download all keys at once with no delimiter and work
        # it out locally but this could potentially lead to large memory
        # usage for millions of keys. It will also make the initial call
        # to this method potentially very slow. If making this method look
        # like os.walk was not required, we could query all keys with
        # pagination and return them in groups of 1000, but that would
        # be a different interface since we can't guarantee we would get
        # them all grouped properly across the 1000 limit boundary.
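        # For example, with Prefix="a/b/" and Delimiter="/", the key
        # "a/b/c/d.txt" is reported via CommonPrefixes as "a/b/c/", while
        # "a/b/e.txt" appears directly in Contents.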
        prefix = self.relativeToPathRoot if not self.is_root else ""
        prefix_len = len(prefix)
        dirnames = []
        filenames = []
        files_there = False

        for page in s3_paginator.paginate(Bucket=self.netloc, Prefix=prefix, Delimiter="/"):
            # All results are returned as full key names and we must
            # convert them back to the root form. The prefix is fixed
            # and delimited so that is a simple trim.

            # Directories are reported in the CommonPrefixes result
            # which reports the entire key and must be stripped.
            found_dirs = [dir["Prefix"][prefix_len:] for dir in page.get("CommonPrefixes", ())]
            dirnames.extend(found_dirs)

            found_files = [file["Key"][prefix_len:] for file in page.get("Contents", ())]
            if found_files:
                files_there = True
            if file_filter is not None:
                found_files = [f for f in found_files if file_filter.search(f)]

            filenames.extend(found_files)

        # Directories do not exist so we can't test for them. If no files
        # or directories were found though, this means that it effectively
        # does not exist and we should match os.walk() behavior and return
        # [].
        if not dirnames and not files_there:
            yield []
        else:
            yield self, dirnames, filenames

        for dir in dirnames:
            new_uri = self.join(dir)
            yield from new_uri.walk(file_filter)
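

# Usage sketch for walk(); the bucket and prefix names are illustrative:
#
#     uri = ButlerURI("s3://some-bucket/some/prefix/")
#     for entry in uri.walk(file_filter=r"\.yaml$"):
#         if not entry:
#             continue  # the URI matched no files or directories
#         dirpath, dirnames, filenames = entry
#         for name in filenames:
#             print(dirpath.join(name))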