# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import pathlib
import tempfile

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
)

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
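
    Examples
    --------
    A datastore is normally constructed for you by `Butler` from the
    repository configuration rather than instantiated directly. The
    following is a minimal sketch, assuming a repository whose datastore
    root is an existing S3 bucket; the bucket, run, and dataset type names
    are hypothetical, and ``inMemoryDataset`` and ``dataId`` stand in for a
    Python object and its data ID::

        from lsst.daf.butler import Butler

        butler = Butler("s3://my-bucket/repo/butler.yaml", run="my_run")
        butler.put(inMemoryDataset, "my_dataset_type", dataId)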

    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not
            # exist. Calling s3 client.create_bucket is possible, but it
            # also requires ACL LocationConstraints, Permissions and other
            # configuration parameters, so for now we do not create a bucket
            # if one is missing. Further discussion could make this happen.
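            #
            # For illustration only, a hypothetical bucket creation would
            # look roughly like the following (the region is an assumption
            # and would have to come from configuration):
            #
            #     self.client.create_bucket(
            #         Bucket=self.locationFactory.netloc,
            #         CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
            #     )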

            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return s3CheckFileExists(location, client=self.client)[0]

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for the size comparison
        # instead; s3CheckFileExists would just duplicate GET/LIST charges
        # in this case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                               Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # S3 returns 404 for a missing object only when the caller has
            # s3:ListBucket permission; without it a 403 is returned instead.
            # In practical terms a 403 usually means that the file does not
            # exist, but it could also mean the user lacks GetObject
            # permission. It is hard to tell which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileExistsError is raised, but this
            # should be updated to PermissionError like in s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file when the formatter does not support
        # to/fromBytes. This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
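        # For example, a formatter whose storage format is JSON can rebuild
        # the object from the downloaded bytes directly, while a formatter
        # wrapping a reader that only accepts filenames (a hypothetical
        # FITS reader, say) has to go through the temporary file below.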

        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3 keys only look like directories, but are not; instead we check
        # whether an *exact* full key already exists before writing. Putting
        # the key is equivalent to creating both the directory and the file.
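        # For example (hypothetical key), an object "a/b/c.fits" can be
        # written even though no "a/" or "a/b/" objects exist; only the
        # exact key matters.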

        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file
        # if _toBytes is not implemented by the formatter.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
            log.debug("Wrote file to %s via a temporary file.", location.uri)

        if self._transaction is None:
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
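        # If the enclosing transaction is later rolled back, the registered
        # delete_object call removes the object that was just uploaded,
        # undoing the put.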

        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
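        # "auto" resolves to "copy" for this datastore; any explicit mode
        # requested by the caller (for example a hypothetical ingest call
        # with transfer="move") is passed through unchanged.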

        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be a
        # local file or S3, the target will always be S3). The file has to
        # exist at the given path. Schemeless URIs are assumed to obey
        # os.path rules. This is equivalent to the os.path.exists(fullPath)
        # check in PosixDatastore.
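        # For example (hypothetical paths), "/data/raw.fits" and
        # "file:///data/raw.fits" are local sources, while
        # "s3://other-bucket/raw.fits" is an S3 source.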

        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
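            # For example (hypothetical URIs), a source of
            # "s3://bucket/repo/a/b/file.fits" with a root of
            # "s3://bucket/repo" yields a pathInStore of "a/b/file.fits".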

            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"
            if srcUri.scheme == "file":
                # Source is on local disk.
                template = self.templates.getTemplate(ref)
                location = self.locationFactory.fromPath(template.format(ref))
                tgtPathInStore = formatter.predictPathFromLocation(location)
                tgtLocation = self.locationFactory.fromPath(tgtPathInStore)
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot, Body=f)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc, relpath)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is
                    # HTTP 204 every time.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
                rootUri = ButlerURI(self.root)
                p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
                relativeToDatastoreRoot = str(p.relative_to(rootUri.relativeToPathRoot))
                tgtLocation = self.locationFactory.fromPath(relativeToDatastoreRoot)

        # The file should exist on the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)