Coverage for python/lsst/daf/butler/datastores/s3Datastore.py: 62%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import pathlib
import tempfile

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
)

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:  # coverage: branch never taken at run time
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """
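
    # A minimal construction sketch (hypothetical values; in practice the
    # bridge manager is supplied by the Registry/Butler machinery rather than
    # built by hand):
    #
    #     datastore = S3Datastore("datastores/s3Datastore.yaml", bridgeManager,
    #                             butlerRoot="s3://my-butler-bucket/repo")
    #
    # Reads and writes then go through the FileLikeDatastore API that the
    # methods below implement.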

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):  # coverage: branch never taken
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")
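
    # Because the datastore refuses to create a missing bucket, the bucket
    # must be provisioned beforehand. A minimal sketch with plain boto3
    # (bucket name and region are placeholders; real deployments also need
    # ACLs and access policies):
    #
    #     import boto3
    #     boto3.client("s3").create_bucket(
    #         Bucket="my-butler-bucket",
    #         CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
    #     )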

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists
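
    # For reference, s3CheckFileExists returns an (exists, size) tuple, which
    # is why only the first element is used above. An illustrative call with
    # placeholder bucket/key values:
    #
    #     exists, size = s3CheckFileExists(path="repo/data/file.fits",
    #                                      bucket="my-butler-bucket",
    #                                      client=getS3Client())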

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
        log.debug("Successfully deleted file: %s", location.uri)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for the download),
        # we might as well use the response metadata for the size comparison.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned when the object does not exist only if the
            # user has s3:ListBucket permission. Without the list permission
            # a 403 is returned instead. In practical terms this usually means
            # that the file does not exist, but it could also mean the user
            # lacks GetObject permission. It is hard to tell which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but this
            # should be updated to PermissionError like in s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden GET "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:  # coverage: branch never taken
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support fromBytes).
        # This is the S3 equivalent of the PosixDatastore formatter.read
        # try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:  # coverage: never fell through to the next except clause
            # The formatter might not always have an extension so mypy
            # complains. We can either ignore the complaint or use a temporary
            # location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)
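
    # For reference, the boto3 get_object response used above is a plain
    # dict; the fields this method relies on look roughly like this
    # (illustrative values):
    #
    #     response["ContentLength"]   # int, size of the object in bytes
    #     response["Body"]            # botocore StreamingBody; .read() -> bytes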

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
        # `Keys` only look like directories, but are not. Instead we check
        # whether an *exact* full key already exists before writing; inserting
        # the key is equivalent to creating both the directory and the file.
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or via a temporary file if
        # _toBytes is not implemented by the formatter.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.", location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
            log.debug("Successfully wrote file to %s via a temporary directory.", location.uri)

        if self._transaction is None:  # coverage: branch never taken
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)
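
    # The undo registration above means that, if the enclosing transaction is
    # rolled back, delete_object is invoked with the same Bucket/Key keyword
    # arguments, removing the partially written artifact. Conceptually (a
    # sketch, not the actual transaction implementation):
    #
    #     undo = (self.client.delete_object,
    #             dict(Bucket=location.netloc, Key=location.relativeToPathRoot))
    #     ...
    #     func, kwargs = undo
    #     func(**kwargs)   # called only when the transaction is rolled back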

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":  # coverage: condition never false; the "copy" fallback never exercised
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):  # coverage: branch never taken
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (source can be file or s3,
        # target will always be s3). The file has to exist at the source
        # location. Schemeless URIs are assumed to obey os.path rules. This is
        # equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:  # coverage: condition never false
            if not os.path.exists(srcUri.ospath):  # coverage: branch never taken
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:  # coverage: branch never taken
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path
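
    # Illustrative scheme handling assumed by the checks above (paths are
    # placeholders; an empty scheme means the path is treated as a plain
    # os.path-style local path):
    #
    #     ButlerURI("s3://my-butler-bucket/repo/data/file.fits").scheme   # "s3"
    #     ButlerURI("file:///home/user/data/file.fits").scheme            # "file"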

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"

            # Work out the name we want this ingested file to have
            # inside the datastore.
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            if srcUri.scheme == "file":  # coverage: condition never false
                # Source is on local disk.
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot, Body=f)
                if transfer == "move":  # coverage: branch never taken
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 Bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc,
                                 tgtLocation.relativeToPathRoot)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is
                    # HTTP 204 regardless.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)
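
    # A sketch of the transfer=None ("direct" ingest) path computation above,
    # using placeholder locations (root "s3://my-butler-bucket/repo", source
    # object "s3://my-butler-bucket/repo/data/file.fits"):
    #
    #     import pathlib
    #     p = pathlib.PurePosixPath("repo/data/file.fits")  # srcUri.relativeToPathRoot
    #     str(p.relative_to("repo"))                        # rootUri.relativeToPathRoot -> "data/file.fits"
    #
    # so pathInStore becomes "data/file.fits" relative to the datastore root.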