Coverage for python/lsst/daf/butler/datastores/s3Datastore.py : 75%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import tempfile

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
    Callable
)

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff():
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
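
# Note: with the stand-in class above, the retry decorators used below become
# no-ops when the optional "backoff" package is unavailable:
# on_exception(expo, ...) simply hands back expo, which in turn returns the
# decorated method unchanged, so no retries are attempted.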

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:  # coverage: condition never true
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

# Settings for "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)
all_retryable_errors = retryable_client_errors + retryable_io_errors
max_retry_time = 60
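
# A minimal sketch (hypothetical function name, not part of this module) of
# how these settings combine; the decorated S3Datastore methods below are the
# real usage:
#
#     @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
#     def fetch_object(client, bucket, key):
#         return client.get_object(Bucket=bucket, Key=key)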


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):  # coverage: condition never true
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            True if the location can be found, false otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
        log.debug("Successfully deleted file: %s", location.uri)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # head_object returns 404 when the object does not exist only if
            # the user has s3:ListBucket permission. If list permission does
            # not exist a 403 is returned instead. In practical terms this
            # usually means that the file does not exist, but it could also
            # mean the user lacks GetObject permission. It is hard to tell
            # which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileExistsError is raised, but this
            # should be updated to PermissionError like in s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are reraised too, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:  # coverage: condition never true
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:  # coverage: branch not taken
            # The formatter might not always have an extension so mypy
            # complains. We can either ignore the complaint or use a
            # temporary location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
        # `Keys` instead only look like directories, but are not. We check if
        # an *exact* full key already exists before writing instead. The
        # insert key operation is equivalent to creating the dir and the file.
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file if
        # _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.", location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
                log.debug("Successfully wrote file to %s via a temporary directory.", location.uri)

        if self._transaction is None:  # coverage: condition never true
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)
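        # (On transaction rollback, the undo callback registered above calls
        # delete_object with this bucket and key, removing the artifact that
        # was just uploaded.)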

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":  # coverage: condition never false
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):  # coverage: condition never true
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file or
        # s3, the target will always be s3). The file has to exist at the
        # source location. Two schemeless URIs are assumed to obey os.path
        # rules. Equivalent to the os.path.exists(fullPath) check in
        # PosixDatastore.
        srcUri = ButlerURI(path)
        if not srcUri.exists():  # coverage: condition never true
            raise FileNotFoundError(f"Resource at {srcUri} does not exist")

        if transfer is None:  # coverage: condition never true
            rootUri = ButlerURI(self.root)
            if not srcUri.relative_to(rootUri):
                raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
                                   f"within datastore ({rootUri})")
        return path

    def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            # The source file is already in the datastore but we have to work
            # out the path relative to the root of the datastore, because,
            # unlike for file-to-file ingest, we can get absolute URIs here.
            rootUri = ButlerURI(self.root, forceDirectory=True)
            pathInStore = srcUri.relative_to(rootUri)
            if pathInStore is None:  # coverage: condition never true
                raise RuntimeError(f"Unexpectedly learned that {srcUri} is not within datastore {rootUri}")
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            # Work out the name we want this ingested file to have
            # inside the datastore.
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            # Convert that to a ButlerURI and transfer the resource to S3.
            targetUri = ButlerURI(tgtLocation.uri)
            targetUri.transfer_from(srcUri, transfer=transfer)

        # The file should exist in the bucket by now.
        _, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                    bucket=tgtLocation.netloc,
                                    client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)