Coverage for python/lsst/daf/butler/datastores/s3Datastore.py : 65%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""S3 datastore."""

from __future__ import annotations

__all__ = ("S3Datastore", )

import logging
import os
import pathlib
import tempfile

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
    Callable,
)

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff():
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
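
# Note: when the real ``backoff`` package is unavailable, the stub above turns
# the ``backoff.on_exception`` decorators used below into no-ops, so each S3
# call is attempted exactly once and any retryable error propagates
# immediately.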

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

# Settings for the "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)
all_retryable_errors = retryable_client_errors + retryable_io_errors
max_retry_time = 60
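# max_retry_time is the total retry budget (in seconds) passed to
# backoff.on_exception below via ``max_time``; Boto3's built-in retries still
# apply within each individual attempt.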


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
        log.debug("Successfully deleted file: %s", location.uri)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A GET request returns 404 when the object does not exist only if
            # the user has s3:ListBucket permission; without that permission a
            # 403 is returned instead. In practical terms this usually means
            # that the file does not exist, but it could also mean the user
            # lacks s3:GetObject permission. It is hard to tell which case it
            # is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden GET "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists. ") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension so mypy
            # complains. We can either ignore the complaint or use a
            # temporary location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3, `Keys` only look like directories but are not. Instead we check
        # whether an *exact* full key already exists before writing; the put
        # operation is equivalent to creating the directory and the file.
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or via a temporary file if
        # _toBytes is not implemented by the formatter.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary file.", location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
                log.debug("Successfully wrote file to %s via a temporary file.", location.uri)

        if self._transaction is None:
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # URI is needed to resolve what ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file or
        # s3, the target will always be s3). The file must already exist at
        # the given path. Schemeless URIs are assumed to obey os.path rules.
        # This is equivalent to the os.path.exists(fullPath) check in
        # PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"

            # Work out the name we want this ingested file to have
            # inside the datastore.
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            if srcUri.scheme == "file":
                # Source is on local disk.
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot, Body=f)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc,
                                 tgtLocation.relativeToPathRoot)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is always
                    # HTTP 204.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)

        # The file should exist in the bucket by now.
        _, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                    bucket=tgtLocation.netloc,
                                    client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)
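
# Illustrative sketch: a Butler repository would typically select this
# datastore through its configuration, with defaults taken from
# ``datastores/s3Datastore.yaml``. The keys and values below are assumptions
# for illustration only, not an authoritative example:
#
#     datastore:
#       cls: lsst.daf.butler.datastores.s3Datastore.S3Datastore
#       root: s3://example-bucket/example-repo  # hypothetical bucket/prefix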