# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import pathlib
import tempfile

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
)

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """
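
    # A minimal usage sketch (illustrative only): normally a Butler
    # constructs and drives this class from its datastore configuration.
    # An existing bucket, a `DatastoreConfig`, a registry bridge manager,
    # and a `DatasetRef` are assumed here:
    #
    #     datastore = S3Datastore(config, bridgeManager)
    #     datastore.put(inMemoryDataset, ref)   # serialize and upload
    #     retrieved = datastore.get(ref)        # download and deserialize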

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. This decision could be revisited after further
            # discussion.
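            #
            # For illustration only (not done by this datastore), a missing
            # bucket could be created out of band with something along the
            # lines of:
            #
            #     getS3Client().create_bucket(
            #         Bucket=self.locationFactory.netloc,
            #         CreateBucketConfiguration={"LocationConstraint": "us-west-2"},
            #     )
            #
            # with the region and access policy chosen to match the site.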

            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned when the object does not exist only if the
            # user has s3:ListBucket permission. If list permission does not
            # exist a 403 is returned. In practical terms this usually means
            # that the file does not exist, but it could also mean the user
            # lacks s3:GetObject permission. It is hard to tell which case it
            # is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden GET "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # other errors are re-raised as well, but less descriptively
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # download the data as bytes
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension so mypy
            # complains. We can either ignore the complaint or use a
            # temporary location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3 `Keys` instead only look like directories, but are not. We check
        # if an *exact* full key already exists before writing instead. The
        # insert key operation is equivalent to creating the dir and the file.
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # upload the file directly from bytes or by using a temporary file if
        # toBytes is not implemented
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
                log.debug("Wrote file to %s via a temporary file.", location.uri)

        if self._transaction is None:
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3, the target will always be s3). The file has to exist at the
        # source location. Schemeless URIs are assumed to obey os.path rules.
        # Equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"

            # Work out the name we want this ingested file to have
            # inside the datastore
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            if srcUri.scheme == "file":
                # source is on local disk.
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot, Body=f)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # source is another S3 Bucket
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc,
                                 tgtLocation.relativeToPathRoot)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is HTTP
                    # 204 regardless.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)
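                    # A possible follow-up check (sketch only, not part of
                    # this implementation): re-query the source key and warn
                    # if it is still present, e.g.
                    #
                    #     if s3CheckFileExists(srcUri, client=self.client)[0]:
                    #         log.warning("Source %s still present after move", srcUri)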

        # the file should exist in the bucket by now
        exists, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                         bucket=tgtLocation.netloc,
                                         client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)