# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore",)

import logging
import os
import pathlib
import tempfile

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
    Callable
)

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff():
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
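# Illustrative sketch, not part of daf_butler: with the real "backoff"
# package installed, the decorators used below retry a call with exponential
# waits when one of the listed exceptions is raised, e.g.
#
#     @backoff.on_exception(backoff.expo, ConnectionError, max_time=60)
#     def fetch():
#         ...  # any call that may fail transiently
#
# With the fallback class above, backoff.on_exception(...) simply returns its
# first argument (the expo placeholder), which in turn returns the decorated
# method unchanged, so the methods below execute exactly once with no retries.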

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

# Settings for the "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)
all_retryable_errors = retryable_client_errors + retryable_io_errors
max_retry_time = 60


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be `None` if no defaults are specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
        log.debug("Successfully deleted file: %s", location.uri)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                               Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned when the object does not exist only if the
            # user has s3:ListBucket permission; without list permission a
            # 403 is returned instead. In practical terms this usually means
            # that the file does not exist, but it could also mean the user
            # lacks GetObject permission. It is hard to tell which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileNotFoundError is raised, but
            # this should be updated to PermissionError like in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden GET "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path,
                                                                   response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension so mypy
            # complains. We can either ignore the complaint or use a
            # temporary location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)
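    # Illustrative note, not part of daf_butler (the bucket and key names
    # below are hypothetical): botocore reports S3 failures through
    # ClientError, with the HTTP status tucked into the response metadata.
    # That is how the 403/404 distinction above is made, e.g.
    #
    #     try:
    #         client.get_object(Bucket="some-bucket", Key="missing-key")
    #     except ClientError as err:
    #         status = err.response["ResponseMetadata"]["HTTPStatusCode"]
    #         # 404: object missing (only visible with s3:ListBucket);
    #         # 403: object missing or insufficient permissions.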

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3, `Keys` only look like directories but are not. Instead we check
        # whether an *exact* full key already exists before writing. The
        # insert-key operation is equivalent to creating the directory and
        # the file.
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or via a temporary file if
        # toBytes is not implemented by the formatter.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.", location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
                log.debug("Successfully wrote file to %s via a temporary directory.", location.uri)

        if self._transaction is None:
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3; the target will always be s3). The file has to exist at the
        # given location. Schemeless URIs are assumed to obey os.path rules.
        # Equivalent to the os.path.exists(fullPath) check in PosixDatastore.
        srcUri = ButlerURI(path)
        if srcUri.scheme == 'file' or not srcUri.scheme:
            if not os.path.exists(srcUri.ospath):
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        elif srcUri.scheme == 's3':
            if not s3CheckFileExists(srcUri, client=self.client)[0]:
                raise FileNotFoundError(f"File at '{srcUri}' does not exist.")
        else:
            raise NotImplementedError(f"Scheme type {srcUri.scheme} not supported.")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if srcUri.scheme == "file":
                raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'. "
                                   "Ingesting local data to S3Datastore without upload "
                                   "to S3 is not allowed.")
            elif srcUri.scheme == "s3":
                if not srcUri.path.startswith(rootUri.path):
                    raise RuntimeError(f"'{srcUri}' is not inside repository root '{rootUri}'.")
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            rootUri = ButlerURI(self.root)
            p = pathlib.PurePosixPath(srcUri.relativeToPathRoot)
            pathInStore = str(p.relative_to(rootUri.relativeToPathRoot))
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            assert transfer == "move" or transfer == "copy", "Should be guaranteed by _standardizeIngestPath"

            # Work out the name we want this ingested file to have
            # inside the datastore.
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            if srcUri.scheme == "file":
                # Source is on local disk.
                with open(srcUri.ospath, 'rb') as f:
                    self.client.put_object(Bucket=tgtLocation.netloc,
                                           Key=tgtLocation.relativeToPathRoot, Body=f)
                if transfer == "move":
                    os.remove(srcUri.ospath)
            elif srcUri.scheme == "s3":
                # Source is another S3 bucket.
                relpath = srcUri.relativeToPathRoot
                copySrc = {"Bucket": srcUri.netloc, "Key": relpath}
                self.client.copy(copySrc, self.locationFactory.netloc,
                                 tgtLocation.relativeToPathRoot)
                if transfer == "move":
                    # https://github.com/boto/boto3/issues/507 - there is no
                    # way of knowing if the file was actually deleted except
                    # for checking all the keys again; the response is HTTP
                    # 204 every time.
                    self.client.delete_object(Bucket=srcUri.netloc, Key=relpath)

        # The file should exist in the bucket by now.
        _, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                    bucket=tgtLocation.netloc,
                                    client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)
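
# Illustrative usage sketch, not part of daf_butler; the bucket and key names
# are hypothetical and the client comes from getS3Client as used throughout
# this module. It mirrors the direct byte-stream path taken by
# _write_in_memory_to_artifact and _read_artifact_into_memory when the
# formatter supports toBytes/fromBytes:
#
#     client = getS3Client()
#     client.put_object(Bucket="my-bucket", Key="run/dataset.json",
#                       Body=b'{"example": true}')
#     payload = client.get_object(Bucket="my-bucket",
#                                 Key="run/dataset.json")["Body"].read()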