Coverage for python/lsst/daf/butler/datastores/s3Datastore.py : 63%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""S3 datastore."""

__all__ = ("S3Datastore", )

import boto3
import logging
import os
import pathlib
import tempfile

from typing import Optional, Type, Any

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import s3CheckFileExists, bucketExists
from lsst.daf.butler.core.utils import transactional

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)
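
        # Note (assumption about the deployment, not set by this class):
        # boto3 resolves credentials, region and endpoint through its
        # standard lookup chain (environment variables, shared config files,
        # instance-profile credentials).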
        self.client = boto3.client("s3")
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            return False
        return s3CheckFileExists(location, client=self.client)[0]

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # head_object returns 404 when the object does not exist only if
            # the user has s3:ListBucket permission. If the list permission is
            # missing, a 403 is returned instead. In practical terms this
            # usually means that the file does not exist, but it could also
            # mean the user lacks s3:GetObject permission. It is hard to tell
            # which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are also reraised, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset, component=getInfo.component)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.file.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.file.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter for Dataset {ref.id}: {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
        # `Keys` only look like directories, but are not. Instead we check
        # whether an *exact* full key already exists before writing; inserting
        # the key is equivalent to creating both the directory and the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file if
        # _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
                log.debug("Wrote file to %s via a temporary file.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # URI is needed to resolve which ingest case we are dealing with.
        info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
        self._register_datasets([(ref, info)])

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring inherited from base class
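        # Link-based transfers are not possible for an object store, so when
        # the caller asks for "auto" we fall back to an unconditional copy.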
        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file or
        # s3, the target will always be s3). The file has to exist at the
        # source location. Schemeless URIs are assumed to obey os.path rules.
        # This is equivalent to the os.path.exists(fullPath) check in
        # PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path
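
    # NOTE: a hypothetical ingest sketch, assuming the usual gen3
    # `Butler.ingest` API; the file name, run and `ref` below are
    # illustrative placeholders, not values defined by this module::
    #
    #     from lsst.daf.butler import Butler, FileDataset
    #
    #     butler = Butler("s3://some-bucket/butler.yaml", run="ingest/run")
    #     butler.ingest(FileDataset(path="/tmp/local_file.fits", refs=ref),
    #                   transfer="copy")
    #
    # With transfer="copy" (or "move") the local file is uploaded to the
    # bucket; transfer=None is only valid when the source already lives
    # inside the repository root (see _standardizeIngestPath above).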

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc, Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 Bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is always
                    # HTTP 204.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                rootUri = ButlerURI(self.root)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=None)

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

        if not s3CheckFileExists(location, client=self.client)[0]:
            raise FileNotFoundError(f"No such file: {location.uri}")

        if self._can_remove_dataset_artifact(ref):
            # https://github.com/boto/boto3/issues/507 - there is no way of
            # knowing if the file was actually deleted.
            self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

        # Remove rows from registries.
        self._remove_from_registry(ref)