# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""S3 datastore."""

__all__ = ("S3Datastore", )

import logging
import os
import tempfile

from botocore.exceptions import ClientError
from http.client import ImproperConnectionState, HTTPException
from urllib3.exceptions import RequestError, HTTPError

from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Type,
    Union,
    Callable
)

# https://pypi.org/project/backoff/
try:
    import backoff
except ImportError:
    class Backoff():
        @staticmethod
        def expo(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

        @staticmethod
        def on_exception(func: Callable, *args: Any, **kwargs: Any) -> Callable:
            return func

    backoff = Backoff
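
# If the optional backoff package is unavailable, the stand-in above returns
# the wrapped function unchanged, so the retry decorators below become
# no-ops and no automatic retries are performed.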

from lsst.daf.butler import (
    ButlerURI,
    DatasetRef,
    Formatter,
    Location,
    StoredFileInfo,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.s3utils import getS3Client, s3CheckFileExists, bucketExists

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

# Settings for "backoff" retry decorators. These retries are belt-and-
# suspenders along with the retries built into Boto3, to account for
# semantic differences in errors between S3-like providers.
retryable_io_errors = (
    # http.client
    ImproperConnectionState, HTTPException,
    # urllib3.exceptions
    RequestError, HTTPError,
    # built-ins
    TimeoutError, ConnectionError)
retryable_client_errors = (
    # botocore.exceptions
    ClientError,
    # built-ins
    PermissionError)
all_retryable_errors = retryable_client_errors + retryable_io_errors
max_retry_time = 60
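# max_retry_time is handed to the backoff decorators as ``max_time``, the
# maximum total time in seconds to keep retrying before giving up.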


class S3Datastore(FileLikeDatastore):
    """Basic S3 Object Storage backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    S3Datastore supports non-link transfer modes for file-based ingest:
    `"move"`, `"copy"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/s3Datastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
        super().__init__(config, bridgeManager, butlerRoot)

        self.client = getS3Client()
        if not bucketExists(self.locationFactory.netloc):
            # PosixDatastore creates the root directory if one does not exist.
            # Calling s3 client.create_bucket is possible but also requires
            # ACL LocationConstraints, Permissions and other configuration
            # parameters, so for now we do not create a bucket if one is
            # missing. Further discussion can make this happen though.
            raise IOError(f"Bucket {self.locationFactory.netloc} does not exist!")

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        log.debug("Checking if file exists: %s", location.uri)
        exists, _ = s3CheckFileExists(location, client=self.client)
        return exists

    @backoff.on_exception(backoff.expo, retryable_client_errors, max_time=max_retry_time)
    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        log.debug("Deleting file: %s", location.uri)
        self.client.delete_object(Bucket=location.netloc, Key=location.relativeToPathRoot)
        log.debug("Successfully deleted file: %s", location.uri)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for download) we
        # might as well use the HEADER metadata for size comparison instead.
        # s3CheckFileExists would just duplicate GET/LIST charges in this
        # case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # head_object returns 404 when the object does not exist only if
            # the user has s3:ListBucket permission. If list permission does
            # not exist a 403 is returned. In practical terms this usually
            # means that the file does not exist, but it could also mean the
            # user lacks GetObject permission. It is hard to tell which case
            # it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests right now demand FileExistsError is raised, but this
            # should be updated to PermissionError like in s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(f"Dataset with Id {ref.id} not accessible at "
                                        f"expected location {location}. Forbidden HEAD "
                                        "operation error occurred. Verify s3:ListBucket "
                                        "and s3:GetObject permissions are granted for "
                                        "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, response["ContentLength"],
                                                                   storedFileInfo.file_size))

        # Download the data as bytes.
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly,
        # or via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension, so mypy
            # complains. We can either ignore the complaint or use a
            # temporary location.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    @backoff.on_exception(backoff.expo, all_retryable_errors, max_time=max_retry_time)
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In
        # S3 `Keys` instead only look like directories, but are not. We check
        # if an *exact* full key already exists before writing instead. The
        # insert key operation is equivalent to creating the dir and the
        # file.
        if s3CheckFileExists(location, client=self.client)[0]:
            # Assume that by this point if the registry thinks the file
            # should not exist then the file should not exist and therefore
            # we can overwrite it. This can happen if a put was interrupted
            # by an external interrupt. The only time this could be
            # problematic is if the file template is incomplete and multiple
            # dataset refs result in identical filenames.
            # Eventually we should remove the check completely (it takes
            # non-zero time for network).
            log.warning("Object %s exists in datastore for ref %s", location.uri, ref)

        # Upload the file directly from bytes or by using a temporary file
        # if _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.", location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot, Body=f)
                log.debug("Successfully wrote file to %s via a temporary directory.", location.uri)

        if self._transaction is None:
            raise RuntimeError("Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below.
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with.
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        return "copy"

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        if transfer not in (None, "move", "copy"):
            raise NotImplementedError(f"Transfer mode {transfer} not supported.")
        # Ingest can occur from file->s3 and s3->s3 (the source can be file
        # or s3, the target will always be s3). The file has to exist at the
        # given source location. Schemeless URIs are assumed to obey os.path
        # rules. Equivalent to the os.path.exists(fullPath) check in
        # PosixDatastore.
        srcUri = ButlerURI(path)
        if not srcUri.exists():
            raise FileNotFoundError(f"Resource at {srcUri} does not exist")

        if transfer is None:
            rootUri = ButlerURI(self.root)
            if not srcUri.relative_to(rootUri):
                raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not "
                                   f"within datastore ({rootUri})")
        return path

    def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        srcUri = ButlerURI(path)
        if transfer is None:
            # The source file is already in the datastore but we have to
            # work out the path relative to the root of the datastore,
            # because unlike file-to-file ingest we can get absolute URIs
            # here.
            rootUri = ButlerURI(self.root, forceDirectory=True)
            pathInStore = srcUri.relative_to(rootUri)
            if pathInStore is None:
                raise RuntimeError(f"Unexpectedly learned that {srcUri} is not within datastore {rootUri}")
            tgtLocation = self.locationFactory.fromPath(pathInStore)
        else:
            # Work out the name we want this ingested file to have
            # inside the datastore.
            tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)

            # Convert that to a ButlerURI and transfer the resource to S3.
            targetUri = ButlerURI(tgtLocation.uri)
            targetUri.transfer_from(srcUri, transfer=transfer)

        # The file should exist in the bucket by now.
        _, size = s3CheckFileExists(path=tgtLocation.relativeToPathRoot,
                                    bucket=tgtLocation.netloc,
                                    client=self.client)

        return StoredFileInfo(formatter=formatter, path=tgtLocation.pathInStore,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=None)