
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import pathlib
import tempfile

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
)

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists
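# Helper expectations, as inferred from their use below (not a statement of
# the s3utils API contract): getS3Client() returns a boto3-style S3 client,
# s3CheckFileExists() returns an (exists, size) tuple, and bucketExists()
# returns a bool.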

51 

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).

82 """ 

83 

84 defaultConfigFile = "datastores/s3Datastore.yaml" 

85 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

86 absolute path. Can be None if no defaults specified. 

87 """ 

88 

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")


    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            True if the location can be found, false otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
        log.debug("Successfully deleted file: %s", location.uri)


    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this
        # case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned when an object does not exist only if the
            # user has the s3:ListBucket permission; if list permission is
            # missing, a 403 is returned instead. In practical terms this
            # usually means that the file does not exist, but it could also
            # mean the user lacks the s3:GetObject permission. It is hard to
            # tell which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests currently demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised too, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file (when the formatter does not support
        # to/from bytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
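        # For reference, the two formatter calls assumed here (a sketch, not
        # the full Formatter API) are:
        #     result = formatter.fromBytes(data, component=...)
        #     result = formatter.read(component=...)  # via _fileDescriptor.location
        # fromBytes may raise NotImplementedError, which selects the
        # temporary-file fallback below.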

        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension so mypy
            # complains. We can either ignore the complaint or use a
            # temporary location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)


    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
        # `Keys` instead only look like directories, but are not. We check if
        # an *exact* full key already exists before writing instead. The
        # insert key operation is equivalent to creating the directory and
        # the file.
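        # For illustration (general S3 behaviour, not anything specific to
        # this datastore): writing Key "a/b/c.fits" does not require "a/" or
        # "a/b/" to exist first, e.g.
        #     client.put_object(Bucket="bucket", Key="a/b/c.fits", Body=data)
        # creates the whole "path" in a single operation.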

        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file
        # if _toBytes is not implemented by the formatter.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.", location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
                log.debug("Successfully wrote file to %s via a temporary directory.", location.uri)

        if self._transaction is None:
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)


    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        return "copy"


    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3, the target will always be s3). The file has to exist at the
        # source location. Scheme-less URIs are assumed to obey os.path
        # rules, equivalent to the os.path.exists(fullPath) check in
        # PosixDatastore.
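        # Illustrative assumption about ButlerURI behaviour relied on below:
        #     ButlerURI("relative/file.fits").scheme   -> "" (scheme-less)
        #     ButlerURI("s3://bucket/key.fits").scheme -> "s3"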

        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path


    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"

            # Work out the name we want this ingested file to have
            # inside the datastore.
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            if srcUri.scheme == "file":
                # The source is on local disk.
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot, Body=f)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # The source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc,
                                 tgtLocation.relativeToPathRoot)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is HTTP
                    # 204 every time.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)

        # The file should exist in the bucket by now.
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)