Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import os 

25import os.path 

26import requests 

27import tempfile 

28import logging 

29 

30__all__ = ('ButlerHttpURI', ) 

31 

32from requests.adapters import HTTPAdapter 

33from requests.packages.urllib3.util.retry import Retry 

34 

35from typing import ( 

36 TYPE_CHECKING, 

37 Optional, 

38 Tuple, 

39 Union, 

40) 

41 

42from .utils import NoTransaction 

43from ._butlerUri import ButlerURI 

44from ..location import Location 

45 

if TYPE_CHECKING:
    # Needed only by annotations; this import is skipped at runtime.
    from ..datastore import DatastoreTransaction

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Default timeout for all HTTP requests, in seconds
TIMEOUT = 20

53 

54 

def getHttpSession() -> requests.Session:
    """Build a `requests.Session` configured from environment variables.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set, or if the variable
        required by the selected authentication method is not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is neither X509 nor TOKEN.

    Notes
    -----
    The session is driven by the following environment variables:

    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to the proxy
      certificate, used both as client certificate and as key.
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which contains
      the bearer token used to authenticate requests.
    - LSST_BUTLER_WEBDAV_CA_BUNDLE: location of the CA certificates used
      for HTTPS. A warning is logged when it is not set.

    Requests are retried up to 3 times, with back-off, on status codes
    429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    # Each scheme gets its own adapter instance carrying the retry policy.
    for scheme_prefix in ("http://", "https://"):
        session.mount(scheme_prefix, HTTPAdapter(max_retries=retry_policy))

    log.debug("Creating new HTTP session...")

    try:
        auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")

    # Reject unknown methods up front so only the two valid ones remain.
    if auth_method not in ("X509", "TOKEN"):
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    if auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            certificate = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set")
        # The same file is given as both certificate and private key.
        session.cert = (certificate, certificate)
    else:
        log.debug("... using bearer-token authentication.")
        refreshToken(session)

    try:
        session.verify = os.environ['LSST_BUTLER_WEBDAV_CA_BUNDLE']
    except KeyError:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
        # NOTE(review): with `verify` left as None, some versions of requests
        # may not enforce certificate verification at all -- confirm.
        session.verify = None

    log.debug("Session configured and ready.")

    return session

119 

120 

def useExpect100() -> bool:
    """Tell whether the "Expect: 100-Continue" header must be sent.

    Returns
    -------
    useExpect100 : `bool`
        True if LSST_BUTLER_WEBDAV_EXPECT100 is set, False otherwise.
        Only the presence of the variable matters, not its value.
    """
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        # This header is required for request redirection, in dCache for example
        log.debug("Expect: 100-Continue header enabled.")
    return enabled

134 

135 

def isTokenAuth() -> bool:
    """Tell whether bearer-token authentication is selected.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    try:
        auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")

    return auth_method == "TOKEN"

153 

154 

def refreshToken(session: requests.Session) -> None:
    """Set or update the 'Authorization' header of the session with the
    bearer token read from the file named by
    LSST_BUTLER_WEBDAV_TOKEN_FILE.

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the token file does not exist.
    """
    try:
        token_file = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set")

    if not os.path.isfile(token_file):
        raise FileNotFoundError(f"No token file: {token_file}")

    with open(token_file, "r") as stream:
        raw_token = stream.read()

    # Newlines are dropped so the token fits on a single header line.
    authorization = 'Bearer ' + raw_token.replace('\n', '')
    session.headers.update({'Authorization': authorization})

175 

176 

def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Probe a remote HTTP resource with a HEAD request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if the HEAD request answered with status 200, False otherwise.
    size : `int`
        Value of the ``Content-Length`` response header, in bytes, if the
        resource exists, otherwise -1
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Checking if file exists: %s", url)

    response = http.head(url, timeout=TIMEOUT)
    if response.status_code != 200:
        return (False, -1)
    return (True, int(response.headers['Content-Length']))

204 

205 

def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Delete a remote HTTP resource.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` when omitted.

    Raises
    ------
    FileNotFoundError
        Raised if the DELETE request does not answer with status 200, 202
        or 204, whatever the actual cause of the failure.
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Removing file: %s", url)
    response = http.delete(url, timeout=TIMEOUT)

    accepted_codes = (200, 202, 204)
    if response.status_code in accepted_codes:
        return
    raise FileNotFoundError(f"Unable to delete resource {url}; status code: {response.status_code}")

227 

228 

def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Probe the Webdav repository at a given URL with a HEAD request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if the HEAD request answered with status 200, False for any
        other status.
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Checking if folder exists: %s", url)
    response = http.head(url, timeout=TIMEOUT)
    return response.status_code == 200

253 

254 

def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    The check sends an OPTIONS request directly through `requests`, not
    through a configured session, and looks for a ``DAV`` header in the
    response.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.
    """
    ca_bundle = None
    try:
        ca_bundle = os.environ['LSST_BUTLER_WEBDAV_CA_BUNDLE']
    except KeyError:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    filepath = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    # Bound the request like every other HTTP call in this module: without
    # an explicit timeout an unresponsive server would block the caller
    # indefinitely.
    r = requests.options(filepath, verify=ca_bundle, timeout=TIMEOUT)
    return 'DAV' in r.headers

280 

281 

def finalurl(r: requests.Response) -> str:
    """Return the URL to which data for this request must be sent.

    When the endpoint answers with a 307 redirect, the target of the
    redirection is returned; otherwise the URL of the response itself.
    This is needed when using PUT operations, to avoid starting
    to send the data to the endpoint, before having to send it again once
    the 307 redirect response is received, and thus wasting bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent.
    """
    response_url = r.url
    if r.status_code != 307:
        # No temporary redirect: keep talking to the same URL.
        return response_url

    redirect_target = r.headers['Location']
    log.debug("Request redirected to %s", redirect_target)
    return redirect_target

304 

305 

def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Convert a location-like object to its URL string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    filepath : `str`
        The URL of the resource, as returned by ``geturl()``.
    """
    # A Location carries its URI; anything else is wrapped in a ButlerURI.
    uri = path.uri if isinstance(path, Location) else ButlerURI(path)
    return uri.geturl()

324 

325 

class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    One `requests.Session` is stored on the class and shared by all
    instances. It is configured on first access of `session`.
    """
    # Placeholder session, replaced by a configured one on first use.
    _session = requests.Session()
    # Set to True once `_session` has been configured by `session`.
    _sessionInitialized = False

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource.

        On first access the base URL is checked to be a Webdav endpoint
        and a configured session is created and cached on the class. On
        later accesses the cached session is returned, after re-reading
        the bearer token when token authentication is in use.

        Raises
        ------
        RuntimeError
            Raised if the base URL is not a Webdav endpoint.
        """
        if ButlerHttpURI._sessionInitialized:
            if isTokenAuth():
                # Re-read the token file on every access so that a token
                # updated on disk is picked up by the cached session.
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        if isWebdavEndpoint(baseURL):
            log.debug("%s looks like a Webdav endpoint.", baseURL)
            s = getHttpSession()
        else:
            raise RuntimeError(f"Only Webdav endpoints are supported; got base URL '{baseURL}'.")

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the resource answers with status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the resource in bytes.

        Returns
        -------
        size : `int`
            0 for a dir-like URI, otherwise the value of the
            ``Content-Length`` header of a HEAD request.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not answer with status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code == 200:
            return int(r.headers['Content-Length'])
        else:
            raise FileNotFoundError(f"Resource {self} does not exist")

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Raises
        ------
        ValueError
            Raised if the URI is file-like, or if the MKCOL request fails
            with a status other than 405.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # Create the parent first when it is missing. The URL
            # comparison prevents an endless recursion when the parent of
            # this URI is the URI itself; it is evaluated first because it
            # needs no HTTP request.
            parent = self.parent()
            if parent.geturl() != self.geturl() and not parent.exists():
                parent.mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
            if r.status_code != 201:
                if r.status_code == 405:
                    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
                else:
                    raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request does not answer with status 200,
            202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                # iter_content() defaults to 1-byte chunks; ask for larger
                # chunks so the download is not written byte by byte.
                for chunk in r.iter_content(chunk_size=65536):
                    tmpFile.write(chunk)
        finally:
            # A streamed response holds its connection until it is closed
            # or fully consumed; release it on the error path as well.
            r.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The whole content when ``size`` is not positive, otherwise the
            first chunk yielded when iterating the response with
            ``chunk_size=size`` (empty if the body is empty).

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        stream = size > 0
        r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # Default to empty bytes so an empty body does not surface as
            # a StopIteration.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # Only part of a streamed body is consumed; close the response
            # so its connection is not left dangling.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request does not answer with status 201, 202
            or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Resolve a possible redirect first so data is sent only once.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes listed in ``transferModes``.
        overwrite : `bool`, optional
            If `True` an existing destination may be replaced. If `False`
            (the default) the transfer fails when the destination exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the
            transfer request does not answer with status 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # The existence checks cost remote requests: only run them when
        # the message is actually going to be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # Set when the server relocated the source itself through a MOVE
        # request, in which case there is nothing left to remove afterwards.
        movedByServer = False

        if isinstance(src, type(self)):
            if transfer == "move":
                r = self.session.request("MOVE", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
                movedByServer = True
                log.debug("Running move via MOVE HTTP request.")
            else:
                r = self.session.request("COPY", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
                log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    r = self.session.put(dest_url, data=f, timeout=TIMEOUT)
            log.debug("Uploading URI %s to %s via local file", src, self)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move carried out as an upload: the source
        # still exists and must be removed. After a server-side MOVE the
        # source is already gone and removing it again would fail.
        if transfer == "move" and not movedByServer:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL. This is used to detect
        if redirection is enabled before sending actual data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint. Redirects are not followed,
            so a redirect answer is returned as is.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        return self.session.put(self.geturl(), data=None, headers=headers,
                                allow_redirects=False, timeout=TIMEOUT)