# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""S3 datastore."""

__all__ = ("S3Datastore", )

import boto3
import logging
import os
import pathlib
import tempfile

from typing import Optional, Type

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import s3CheckFileExists, bucketExists
from lsst.daf.butler.core.utils import transactional

log = logging.getLogger(__name__)

class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.
    IOError
        If the S3 bucket named by the root location does not exist.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be `None` if no defaults are specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        self.client = boto3.client("s3")
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not
            # exist. Calling s3 client.create_bucket is possible but also
            # requires ACL LocationConstraints, Permissions and other
            # configuration parameters, so for now we do not create a bucket
            # if one is missing. Further discussion can make this happen
            # though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")
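
    # A minimal sketch (not part of this datastore) of how the bucket could
    # be pre-created with boto3 instead; the bucket name and region are
    # placeholders, and a real deployment would also need to settle the ACL
    # and permission questions mentioned in the comment above:
    #
    #     client = boto3.client("s3")
    #     client.create_bucket(
    #         Bucket="my-butler-bucket",
    #         CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
    #     )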

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            return False
        return s3CheckFileExists(location, client=self.client)[0]
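
    # Note: s3CheckFileExists appears to return an ``(exists, size)`` tuple
    # (it is unpacked that way in _extractIngestInfo below), which is why the
    # result is indexed with ``[0]`` wherever only the existence flag matters.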

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for the download)
        # we might as well use the response metadata for the size comparison.
        # s3CheckFileExists would just duplicate GET/LIST charges in this
        # case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A request for a missing object returns 404 only when the user
            # has s3:ListBucket permission. If list permission is missing a
            # 403 is returned instead. In practical terms this usually means
            # that the file does not exist, but it could also mean the user
            # lacks s3:GetObject permission. It is hard to tell which case it
            # is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden GET "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file when the formatter does not support
        # to/from bytes. This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset, component=getInfo.component)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.file.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.file.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter for Dataset {ref.id}: {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)
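
    # Illustrative sketch of the formatter hooks the direct read/write paths
    # rely on (a hypothetical pickle-based formatter, not the real
    # lsst.daf.butler implementation):
    #
    #     class PickleFormatter(Formatter):
    #         extension = ".pickle"
    #
    #         def toBytes(self, inMemoryDataset):
    #             return pickle.dumps(inMemoryDataset)
    #
    #         def fromBytes(self, serializedDataset, component=None):
    #             return pickle.loads(serializedDataset)
    #
    # A formatter that only implements read()/write() raises
    # NotImplementedError from toBytes/fromBytes, which is what triggers the
    # temporary-file fallback in get() above and in put() below.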

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3 `Keys` only look like directories, but are not. We instead check
        # whether an *exact* full key already exists before writing; the
        # insert key operation is equivalent to creating the directory and
        # the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file
        # if _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
            log.debug("Wrote file to %s via a temporary file.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
        self._register_datasets([(ref, info)])

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3, the target will always be s3). The file has to exist at the
        # source location. Schemeless URIs are assumed to obey os.path rules.
        # Equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path
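
    # Examples of source paths this accepts (illustrative values only; the
    # bucket and file names are made up):
    #
    #     "/data/raw/image.fits"                   # schemeless, treated as local
    #     "file:///data/raw/image.fits"            # local file URI
    #     "s3://some-bucket/data/raw/image.fits"   # object already in S3
    #
    # Any other scheme raises NotImplementedError.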

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        rootUri = ButlerURI(self.root)
        if transfer is None:
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer in ("move", "copy"), "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                self.client.upload_file(Bucket=tgtLocation.netloc, Key=tgtLocation.relativeToPathRoot,
                                        Filename=srcUri.ospath)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is
                    # HTTP 204 every time.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=None)

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

        if not s3CheckFileExists(location, client=self.client)[0]:
            raise FileNotFoundError(f"No such file: {location.uri}")

        if self._can_remove_dataset_artifact(ref):
            # https://github.com/boto/boto3/issues/507 - there is no way of
            # knowing if the file was actually deleted.
            self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

        # Remove rows from registries.
        self._remove_from_registry(ref)
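
# Illustrative round trip through this datastore (a sketch only; the config
# and registry objects, the in-memory dataset, and the DatasetRef are assumed
# to come from the surrounding Butler machinery):
#
#     datastore = S3Datastore(config, registry)
#     datastore.put(inMemoryDataset, ref)
#     assert datastore.exists(ref)
#     retrieved = datastore.get(ref)
#     datastore.remove(ref)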