# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""S3 datastore."""

__all__ = ("S3Datastore", )

import boto3
import logging
import os
import pathlib
import tempfile

from typing import Optional, Type, Any

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import s3CheckFileExists, bucketExists
from lsst.daf.butler.core.utils import transactional

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """
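    # A minimal construction sketch (illustrative only; the config file name
    # and bucket below are assumptions, not values shipped with this module):
    # given a datastore configuration whose root points at an existing bucket
    # (for example ``s3://some-bucket/repo``) in a YAML file modelled on
    # ``datastores/s3Datastore.yaml``, a caller that already holds a
    # ``Registry`` could construct the datastore with
    # ``S3Datastore(config="myS3Datastore.yaml", registry=registry)``.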

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        self.client = boto3.client("s3")
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
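            # A hedged sketch of what enabling bucket creation might look
            # like here; the region and ACL below are illustrative
            # assumptions, not values used anywhere in this module:
            #
            #     self.client.create_bucket(
            #         Bucket=self.locationFactory.netloc,
            #         ACL="private",
            #         CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
            #     )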

            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        # s3CheckFileExists returns (exists, size); only the flag is needed.
        return s3CheckFileExists(location, client=self.client)[0]

    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for the download)
        # we might as well use the response metadata for the size comparison.
        # s3CheckFileExists would just duplicate GET/LIST charges in this
        # case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned only when the object does not exist and the
            # user has s3:ListBucket permission. Without the list permission
            # a 403 is returned instead. In practical terms this usually
            # means that the file does not exist, but it could also mean the
            # user lacks s3:GetObject permission; it is hard to tell which
            # case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests currently demand that FileNotFoundError is raised,
            # but this should be updated to PermissionError as in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes; reading the streaming body pulls the
        # entire object into memory.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file when the formatter does not support reading
        # from bytes. This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset, component=getInfo.component)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.file.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.file.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=getInfo.component is not None)

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3, keys only look like directories but are not; instead we check
        # whether an *exact* full key already exists before writing. The key
        # insert operation is equivalent to creating both the directory and
        # the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or via a temporary file if
        # _toBytes is not implemented by the formatter.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
            log.debug("Wrote file to %s via a temporary file.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # A URI is needed to resolve which ingest case we are dealing with.
        info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
        self._register_datasets([(ref, info)])

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer
        # "auto" resolves to "copy" because link-based transfer modes are not
        # possible with an object store.
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 or s3->s3 (the source can be a local
        # file or an s3 key; the target is always s3). The file has to exist
        # at the given location. Schemeless URIs are assumed to obey os.path
        # rules, e.g. "data/file.fits" is treated as a local path. This is
        # the equivalent of the os.path.exists(fullPath) check in
        # PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        # The datastore root is needed in both the no-transfer and the s3->s3
        # branches below.
        rootUri = ButlerURI(self.root)
        if transfer is None:
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer in ("move", "copy"), "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc, Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is
                    # HTTP 204 every time.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should now exist in the bucket.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=None)