Coverage for python/lsst/daf/butler/datastores/s3Datastore.py : 69%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import tempfile

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
    Callable
)

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff():
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
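
# When the optional ``backoff`` package is not installed, the stub above is
# bound to the name ``backoff``; its ``expo`` and ``on_exception`` methods are
# simple pass-throughs, so the decorated methods below run without any
# client-side retries.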

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:  # coverage: condition was never true at run time
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

# Settings for "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)
all_retryable_errors = retryable_client_errors + retryable_io_errors
max_retry_time = 60
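
# The error tuples and ``max_retry_time`` above are consumed by the
# ``@backoff.on_exception`` decorators on the S3Datastore methods below:
# failed calls are retried with exponential backoff (``backoff.expo``) until
# ``max_retry_time`` seconds have elapsed in total.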


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):  # coverage: condition was never true
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            True if the location can be found, false otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
        log.debug("Successfully deleted file: %s", location.uri)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # head_object returns 404 when the object does not exist only when
            # the user has s3:ListBucket permission. If list permission does
            # not exist a 403 is returned. In practical terms this usually
            # means that the file does not exist, but it could also mean the
            # user lacks GetObject permission. It's hard to tell which case it
            # is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileExistsError is raised, but this
            # should be updated to PermissionError like in s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:  # coverage: condition was never true
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a tempfile (when the formatter does not support to/fromBytes).
        # This is the S3 equivalent of the PosixDatastore formatter.read
        # try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:  # coverage: branch to the next except clause never taken
            # Formatters might not always have an extension so mypy complains.
            # We can either ignore the complaint or use a temporary location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
        # `Keys` instead only look like directories, but are not. We check if
        # an *exact* full key already exists before writing instead. The
        # insert key operation is equivalent to creating the dir and the file.
        if s3CheckFileExists(location, client=self.client)[0]:  # coverage: condition was never true
            # Assume that by this point if registry thinks the file should
            # not exist then the file should not exist and therefore we can
            # overwrite it. This can happen if a put was interrupted by
            # an external interrupt. The only time this could be problematic is
            # if the file template is incomplete and multiple dataset refs
            # result in identical filenames.
            # Eventually we should remove the check completely (it takes
            # non-zero time for network).
            log.warning("Object %s exists in datastore for ref %s", location.uri, ref)

        # Upload the file directly from bytes, or by using a temporary file if
        # _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.", location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
                log.debug("Successfully wrote file to %s via a temporary directory.", location.uri)

        if self._transaction is None:  # coverage: condition was never true
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve what ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":  # coverage: condition was never false
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):  # coverage: condition was never true
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file or
        # s3, the target will always be s3). The source file has to exist.
        # Two schemeless URIs are assumed to obey os.path rules. Equivalent to
        # the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if not srcUri.exists():  # coverage: condition was never true
            raise FileNotFoundError(f"Resource at {srcUri} does not exist")

        if transfer is None:  # coverage: condition was never true
            rootUri = ButlerURI(self.root)
            if not srcUri.relative_to(rootUri):
                raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
                                   f"within datastore ({rootUri})")
        return path

    def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            # The source file is already in the datastore, but we have to work
            # out the path relative to the root of the datastore, because,
            # unlike file-to-file ingest, we can get absolute URIs here.
            rootUri = ButlerURI(self.root, forceDirectory=True)
            pathInStore = srcUri.relative_to(rootUri)
            if pathInStore is None:  # coverage: condition was never true
                raise RuntimeError(f"Unexpectedly learned that {srcUri} is not within datastore {rootUri}")
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            # Work out the name we want this ingested file to have
            # inside the datastore.
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            # Convert that to a ButlerURI and transfer the resource to S3.
            targetUri = ButlerURI(tgtLocation.uri)
            targetUri.transfer_from(srcUri, transfer=transfer)

        # The file should exist in the bucket by now.
        _, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                    bucket=tgtLocation.netloc,
                                    client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)