Coverage for python/lsst/daf/butler/datastores/s3Datastore.py : 61%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""S3 datastore."""

__all__ = ("S3Datastore", )
import boto3
import logging
import os
import pathlib
import tempfile

from typing import Optional, Type, Any

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import s3CheckFileExists, bucketExists

log = logging.getLogger(__name__)
class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """
    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        self.client = boto3.client("s3")
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")
    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return s3CheckFileExists(location, client=self.client)[0]
    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
    def _read_artifact_into_memory(self, getInfo, ref, isComponent=False):
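        # Fetch the serialized dataset from S3 and deserialize it with the
        # formatter recorded for this dataset, returning the in-memory object.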
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for the download)
        # we might as well use the response metadata (Content-Length) for the
        # size comparison instead. s3CheckFileExists would just duplicate
        # GET/LIST charges in this case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # S3 returns 404 for a missing object only when the user has
            # s3:ListBucket permission. If list permission does not exist a
            # 403 is returned. In practical terms this usually means that the
            # file does not exist, but it could also mean the user lacks
            # s3:GetObject permission. It is hard to tell which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden GET "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are also re-raised, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Deserialize the downloaded bytes into the appropriate object
        # directly, or via a temporary file when the formatter does not
        # implement fromBytes. This is the S3 equivalent of the
        # PosixDatastore formatter.read try/except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.file.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.file.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)
    def _write_in_memory_to_artifact(self, inMemoryDataset, ref):
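        # Serialize the in-memory dataset with its formatter and upload the
        # result to S3, registering an undo action so that a failed ingest
        # can remove the uploaded object again.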
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
        # `Keys` instead only look like directories, but are not. We check if
        # an *exact* full key already exists before writing instead. The
        # insert key operation is equivalent to creating the dir and the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file
        # if _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
            log.debug("Wrote file to %s via a temporary file.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)
    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer
        # Link transfer modes are not supported for S3, so "auto" degrades
        # to a plain copy.
        return "copy"
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be a file
        # or s3; the target is always s3). The file has to exist at the given
        # location. Schemeless URIs are assumed to obey os.path rules. This is
        # equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        rootUri = ButlerURI(self.root)
        if transfer is None:
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc, Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 Bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is always
                    # HTTP 204.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)