Coverage for python/lsst/daf/butler/datastores/s3Datastore.py : 61%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""S3 datastore."""

__all__ = ("S3Datastore", )

import boto3
import logging
import os
import pathlib
import tempfile

from typing import Optional, Type, Any

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import s3CheckFileExists, bucketExists
from lsst.daf.butler.core.utils import transactional

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        self.client = boto3.client("s3")
        if not bucketExists(self.locationFactory.netloc):  # coverage: condition was never true
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")
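            # For reference only (a hedged sketch, not executed by this
            # datastore): creating the missing bucket with boto3 would look
            # roughly like the call below, with region, ACL and any
            # LocationConstraint chosen by the deployment, which is exactly
            # the configuration this class avoids deciding on.
            #
            #     self.client.create_bucket(
            #         Bucket=self.locationFactory.netloc,
            #         ACL="private",
            #         CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
            #     )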
    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return s3CheckFileExists(location, client=self.client)[0]

    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for the size comparison.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # head_object returns 404 when the object does not exist only if
            # the user has s3:ListBucket permission. If list permission does
            # not exist, a 403 is returned. In practical terms this usually
            # means that the file does not exist, but it could also mean the
            # user lacks s3:GetObject permission. It is hard to tell which
            # case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists. ") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # other errors are re-raised as well, but less descriptively
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:  # coverage: condition was never true
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # download the data as bytes
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # toBytes/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset, component=getInfo.component)
        except NotImplementedError:  # coverage: never jumped to the "except Exception" clause
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.file.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.file.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter for dataset {ref.id}: {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)
    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
        # `Keys` instead only look like directories, but are not. We check if
        # an *exact* full key already exists before writing instead. The
        # insert key operation is equivalent to creating the dir and the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or via a temporary file if
        # toBytes is not implemented by the formatter.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
            log.debug("Wrote file to %s via a temporary directory.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # URI is needed to resolve which ingest case we are dealing with.
        info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
        self._register_datasets([(ref, info)])
    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring inherited from base class.
        if transfer != "auto":  # coverage: condition was never false
            return transfer
        return "copy"
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):  # coverage: condition was never true
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file or
        # s3, the target will always be s3). The file has to exist at the
        # given location. Schemeless URIs are assumed to obey os.path rules.
        # Equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:  # coverage: condition was never false
            if not os.path.exists(srcUri.ospath):  # coverage: condition was never true
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:  # coverage: condition was never true
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path
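
        # Illustrative only (hypothetical paths, not part of the class): the
        # source forms accepted above are a plain or "file://" path and an
        # "s3://" URI, e.g.
        #
        #     datastore._standardizeIngestPath("data/raw.fits", transfer="copy")
        #     datastore._standardizeIngestPath("file:///tmp/raw.fits", transfer="move")
        #     datastore._standardizeIngestPath("s3://other-bucket/raw.fits", transfer="copy")
        #
        # Any other scheme raises NotImplementedError, and transfer=None
        # additionally requires the source to already be inside the
        # repository root.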
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        rootUri = ButlerURI(self.root)
        if transfer is None:
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer in ("move", "copy"), "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":  # coverage: condition was never false
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc, Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":  # coverage: condition was never true
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is always
                    # HTTP 204, whether or not the key existed.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=None)
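

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of s3Datastore.py): the boto3 calls that
# S3Datastore.put() and get() rely on, exercised against a fake S3 endpoint
# provided by ``moto`` (the ``mock_s3`` decorator here; newer moto releases
# expose ``mock_aws`` instead). Bucket and key names are made up.
# ---------------------------------------------------------------------------
import boto3 as _boto3
from moto import mock_s3


@mock_s3
def _demo_round_trip():
    client = _boto3.client("s3", region_name="us-east-1")
    client.create_bucket(Bucket="demo-bucket")

    # put(): formatter.toBytes() followed by put_object
    payload = b"serialized dataset bytes"
    client.put_object(Bucket="demo-bucket", Key="run/dataset.bin", Body=payload)

    # get(): get_object, the ContentLength integrity check, then Body.read()
    # for formatter.fromBytes()
    response = client.get_object(Bucket="demo-bucket", Key="run/dataset.bin")
    assert response["ContentLength"] == len(payload)
    return response["Body"].read()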