Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import os 

25import os.path 

26import requests 

27import tempfile 

28import logging 

29 

30__all__ = ('ButlerHttpURI', ) 

31 

32from requests.adapters import HTTPAdapter 

33from requests.packages.urllib3.util.retry import Retry 

34 

35from typing import ( 

36 TYPE_CHECKING, 

37 Optional, 

38 Tuple, 

39 Union, 

40) 

41 

42from .utils import NoTransaction 

43from ._butlerUri import ButlerURI 

44from ..location import Location 

45 

46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true

47 from ..datastore import DatastoreTransaction 

48 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Default timeout for all HTTP requests, in seconds
TIMEOUT = 20

53 

54 

def getHttpSession() -> requests.Session:
    """Create a requests.Session pre-configured with environment variable data.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set, or if the variable
        required by the selected authentication method is not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is neither X509 nor TOKEN.
    FileNotFoundError
        Raised if TOKEN authentication is selected and the token file
        does not exist.

    Notes
    -----
    The following environment variables are used:

    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
      certificates are stored if you intend to use HTTPS to
      communicate with the endpoint. If it is not set, the session keeps
      the default certificate verification of the requests library.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN") from None

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The proxy file is used both as client certificate and as key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle is None:
        # Do NOT assign None to session.verify: requests treats a falsy
        # value as "skip certificate verification", which would silently
        # turn off TLS checks instead of making HTTPS requests fail.
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS certificates will be verified against the default CA bundle. "
                    "If your endpoint uses another certificate authority, please "
                    "export this variable.")
    else:
        session.verify = ca_bundle
    log.debug("Session configured and ready.")

    return session

118 

119 

def useExpect100() -> bool:
    """Tell whether the "Expect: 100-Continue" header must be sent.

    Returns
    -------
    useExpect100 : `bool`
        True if LSST_BUTLER_WEBDAV_EXPECT100 is present in the environment,
        False otherwise.
    """
    # Some endpoints (dCache for example) need this header to handle
    # request redirection.
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        log.debug("Expect: 100-Continue header enabled.")
    return enabled

133 

134 

def isTokenAuth() -> bool:
    """Tell whether bearer-token authentication is selected.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    auth_method = os.environ.get('LSST_BUTLER_WEBDAV_AUTH')
    if auth_method is None:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")
    return auth_method == "TOKEN"

152 

153 

def refreshToken(session: requests.Session) -> None:
    """Refresh the session token.

    Set or update the 'Authorization' header of the session,
    configure bearer token authentication, with the value fetched
    from LSST_BUTLER_WEBDAV_TOKEN_FILE

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the token file does not exist.
    """
    # Keep the try body minimal: only the environment lookup can raise the
    # KeyError we want to translate.
    try:
        token_path = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set") from None

    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    with open(token_path, "r") as fh:
        # Drop every whitespace character, not only "\n": a token file with
        # Windows line endings would otherwise leave a "\r" in the header
        # value, which is not a valid header character.
        bearer_token = "".join(fh.read().split())

    session.headers.update({'Authorization': 'Bearer ' + bearer_token})

176 

177 

def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Tell whether a remote HTTP resource exists and report its size.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` if none is given.

    Returns
    -------
    exists : `bool`
        True if resource exists, False otherwise.
    size : `int`
        Size of the resource, if it exists, in bytes, otherwise -1
    """
    if session is None:
        session = getHttpSession()

    url = _getFileURL(path)
    log.debug("Checking if file exists: %s", url)

    response = session.head(url, timeout=TIMEOUT)
    if response.status_code != 200:
        return (False, -1)
    # The size is taken from the Content-Length header of the HEAD response.
    return (True, int(response.headers['Content-Length']))

205 

206 

def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Delete a remote HTTP resource.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` if none is given.

    Raises
    ------
    FileNotFoundError
        Raised if the DELETE request returns a status code other than
        200, 202 or 204.
    """
    if session is None:
        session = getHttpSession()

    url = _getFileURL(path)
    log.debug("Removing file: %s", url)

    response = session.delete(url, timeout=TIMEOUT)
    if response.status_code in (200, 202, 204):
        return
    raise FileNotFoundError(f"Unable to delete resource {url}; status code: {response.status_code}")

233 

234 

def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Tell whether the Webdav repository at a given URL exists.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` if none is given.

    Returns
    -------
    exists : `bool`
        True if a HEAD request on the URL returns status 200, False
        otherwise.
    """
    if session is None:
        session = getHttpSession()

    url = _getFileURL(path)
    log.debug("Checking if folder exists: %s", url)

    response = session.head(url, timeout=TIMEOUT)
    return response.status_code == 200

259 

260 

def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    isWebdav : `bool`
        True if the response to an OPTIONS request carries a ``DAV``
        header, False if it doesn't.

    Notes
    -----
    The OPTIONS request is bounded by ``TIMEOUT``, like every other request
    in this module, so an unresponsive endpoint raises a `requests`
    timeout exception instead of blocking indefinitely.
    """
    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    filepath = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    # Without an explicit timeout requests waits forever on a silent server.
    r = requests.options(filepath, verify=ca_bundle, timeout=TIMEOUT)
    return 'DAV' in r.headers

286 

287 

def finalurl(r: requests.Response) -> str:
    """Work out where requests must be sent, following a 307 redirect.

    If the endpoint answered with a 307 redirect, the target given in the
    ``Location`` header is returned, otherwise the URL of the response
    itself. Knowing the final destination before a PUT avoids sending the
    data once to the endpoint and then a second time after the redirect
    is received, which would waste bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent.
    """
    if r.status_code != 307:
        return r.url

    redirect_target = r.headers['Location']
    log.debug("Request redirected to %s", redirect_target)
    return redirect_target

312 

313 

def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Convert a resource description into its URL string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    filepath : `str`
        The fully qualified URL of the resource.
    """
    # A Location already carries its URI; anything else is handed to the
    # ButlerURI constructor first.
    uri = path.uri if isinstance(path, Location) else ButlerURI(path)
    return uri.geturl()

332 

333 

class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    One `requests.Session` is stored on the class and shared by all
    instances. It is configured the first time the ``session`` property is
    read, based on the endpoint of the URI that triggered that first access.
    """

    # Placeholder session, replaced on first access to ``session``.
    _session = requests.Session()
    _sessionInitialized = False

    # Number of bytes requested per chunk when streaming a download to disk.
    _DOWNLOAD_CHUNK_SIZE = 1024 * 1024

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        if ButlerHttpURI._sessionInitialized:
            # The token file may have changed since the session was
            # created, so reload it on every access.
            if isTokenAuth():
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        if isWebdavEndpoint(baseURL):
            log.debug("%s looks like a Webdav endpoint.", baseURL)
            s = getHttpSession()
        else:
            s = requests.Session()

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the URL returns status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            0 for a directory-like URI, otherwise the value of the
            Content-Length header of a HEAD response.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code == 200:
            return int(r.headers['Content-Length'])
        else:
            raise FileNotFoundError(f"Resource {self} does not exist")

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Raises
        ------
        ValueError
            Raised if this URI is not directory-like, or if the MKCOL
            request fails with a status other than 405.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # We need to test the absence of the parent directory,
            # but also if parent URL is different from self URL,
            # otherwise we could be stuck in a recursive loop
            # where self == parent
            parent = self.parent()
            if not parent.exists() and parent.geturl() != self.geturl():
                parent.mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
            if r.status_code == 405:
                log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
            elif r.status_code != 201:
                raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request returns a status code other than
            200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        if r.status_code != 200:
            raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            # iter_content() defaults to 1-byte chunks, which makes large
            # downloads extremely slow; ask for sizeable chunks instead.
            for chunk in r.iter_content(chunk_size=self._DOWNLOAD_CHUNK_SIZE):
                tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty if a positive ``size`` was requested and
            the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        stream = size > 0
        r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # Default to b"" so an empty body does not leak StopIteration.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # A streamed response that is only partially consumed keeps its
            # connection checked out until it is closed explicitly.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request returns a status code other than
            201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Find out where the data must really go before sending it.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes listed in ``transferModes``; "auto" is replaced by
            ``transferDefault``.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the
            destination already exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the
            request returns a status code other than 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Each exists() call is a remote request: only pay for them when
        # the message is really going to be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # True when the source was sent through a local copy, in which case
        # a "move" still has to delete the source afterwards.
        uploaded = False
        if isinstance(src, type(self)):
            if transfer == "move":
                log.debug("Running move via MOVE HTTP request.")
                r = self.session.request("MOVE", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
            else:
                log.debug("Running copy via COPY HTTP request.")
                r = self.session.request("COPY", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    log.debug("Uploading URI %s to %s via local file", src, self)
                    r = self.session.put(dest_url, data=f, timeout=TIMEOUT)
            uploaded = True

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a remote resource
        # try to remove that resource. A successful MOVE request has
        # already removed the source, so deleting it again would fail.
        if transfer == "move" and uploaded:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL.

        This is used to detect if redirection is enabled before sending actual
        data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        # Redirects are not followed so that the caller can read the
        # redirect target from the response.
        return self.session.put(self.geturl(), data=None, headers=headers,
                                allow_redirects=False, timeout=TIMEOUT)