# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""S3 datastore."""

__all__ = ("S3Datastore", )

import boto3
import logging
import os
import pathlib
import tempfile

from typing import Optional, Type, Any

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import s3CheckFileExists, bucketExists

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, bridgeManager, butlerRoot=None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = boto3.client("s3")
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not
            # exist. Calling s3 client.create_bucket is possible but also
            # requires ACL LocationConstraints, Permissions and other
            # configuration parameters, so for now we do not create a bucket
            # if one is missing. Further discussion can make this happen
            # though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            True if the location can be found, false otherwise.
        """
        # s3CheckFileExists returns an (exists, size) tuple; only the
        # existence flag is wanted here.
        return s3CheckFileExists(location, client=self.client)[0]

    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

    def _read_artifact_into_memory(self, getInfo, ref, isComponent=False):
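        """Read the artifact at the given location into an in-memory dataset.

        The object is downloaded from S3 as bytes and deserialized with the
        formatter's ``fromBytes`` method, falling back to a temporary file
        and ``Formatter.read`` when reading from bytes is not implemented.

        Parameters
        ----------
        getInfo
            Bundle of location, formatter and stored file information for
            the artifact, as prepared by the base class.
        ref : `DatasetRef`
            Reference to the dataset being read.
        isComponent : `bool`, optional
            If `True`, read only the component named in ``getInfo``.

        Returns
        -------
        inMemoryDataset : `object`
            The requested dataset.
        """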

        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for the
        # download) we might as well use the HEADER metadata for the size
        # comparison instead. s3CheckFileExists would just duplicate
        # GET/LIST charges in this case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned only when the object does not exist and the
            # user has s3:ListBucket permission; without list permission a
            # 403 is returned instead. In practical terms this usually means
            # that the file does not exist, but it could also mean the user
            # lacks s3:GetObject permission. It is hard to tell which case
            # it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests currently demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden GET "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file when the formatter does not support
        # to/fromBytes. This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.file.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.file.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset, ref):
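        """Write an in-memory dataset to the S3 bucket as a new artifact.

        The dataset is serialized with the formatter's ``toBytes`` method
        and uploaded directly, falling back to a temporary file and
        ``upload_file`` when serializing to bytes is not implemented.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated dataset.

        Returns
        -------
        info : `StoredFileInfo`
            Metadata describing the stored artifact.

        Raises
        ------
        FileExistsError
            Raised if an object already exists at the target key.
        """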

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3 `Keys` instead only look like directories, but are not. We
        # check if an *exact* full key already exists before writing
        # instead. The insert key operation is equivalent to creating the
        # dir and the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or via a temporary file if
        # _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
            log.debug("Wrote file to %s via a temporary file.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3, the target will always be s3). The source file has to
        # exist. Schemeless URIs are assumed to obey os.path rules. This is
        # equivalent to the os.path.exists(fullPath) check in
        # PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        # The repository root is needed both for the no-transfer case and
        # for computing the in-store path of an s3->s3 transfer.
        rootUri = ButlerURI(self.root)
        if transfer is None:
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc, Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is
                    # HTTP 204 every time.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)