Coverage for python/lsst/daf/butler/datastores/s3Datastore.py : 52%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import pathlib
import tempfile

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
)

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer). An illustrative ingest
    sketch is given in the comment below.
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not
            # exist. Calling s3 client.create_bucket is possible but also
            # requires ACL LocationConstraints, Permissions and other
            # configuration parameters, so for now we do not create a bucket
            # if one is missing. Further discussion can make this happen
            # though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            True if the location can be found, false otherwise.
        """
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the response metadata for the size comparison.
        # s3CheckFileExists would just duplicate GET/LIST charges in this
        # case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # head_object returns 404 when the object does not exist only if
            # the user has s3:ListBucket permission. Without the list
            # permission a 403 is returned instead. In practical terms this
            # usually means that the file does not exist, but it could also
            # mean the user lacks s3:GetObject permission. It's hard to tell
            # which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests currently demand that FileNotFoundError is raised,
            # but this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3 `Keys` only look like directories, but are not. We therefore
        # check whether an *exact* full key already exists before writing.
        # The key insert operation is equivalent to creating both the
        # directory and the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or via a temporary file if
        # _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
            log.debug("Wrote file to %s via a temporary file.", location.uri)

        if self._transaction is None:
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3; the target will always be s3). The file has to exist at the
        # specified location. Schemeless URIs are assumed to obey os.path
        # rules. This is equivalent to the os.path.exists(fullPath) check in
        # PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot, Body=f)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 Bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is
                    # always HTTP 204.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                rootUri = ButlerURI(self.root)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)