Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import os 

25import os.path 

26import requests 

27import tempfile 

28import logging 

29 

30__all__ = ('ButlerHttpURI', ) 

31 

32from requests.adapters import HTTPAdapter 

33from requests.packages.urllib3.util.retry import Retry 

34 

35from typing import ( 

36 TYPE_CHECKING, 

37 Optional, 

38 Tuple, 

39 Union, 

40) 

41 

42from .utils import NoTransaction 

43from ._butlerUri import ButlerURI 

44from ..location import Location 

45 

46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true

47 from ..datastore import DatastoreTransaction 

48 

49log = logging.getLogger(__name__) 

50 

51 

def getHttpSession() -> requests.Session:
    """Create a requests.Session pre-configured with environment variable data

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set, or if the variable
        required by the selected authentication method is not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is neither X509 nor TOKEN.

    Notes
    -----
    The following environment variables are used:
    - LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
        certificates are stored if you intend to use HTTPS to
        communicate with the endpoint. If it is unset or empty a warning
        is logged and the session's default certificate verification
        setting is left untouched.
    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
        Possible values are X509 and TOKEN
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
        certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
        contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
        "Expect: 100-Continue" header in all requests. This is required
        on certain endpoints where requests redirection is made.
    """

    # Retry transient server-side failures, for both plain and secure HTTP.
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        # "from None": the bare lookup failure adds nothing to the message.
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN") from None

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The same proxy file is used as both certificate and key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle:
        session.verify = ca_bundle
    else:
        # Do NOT assign None/"" to session.verify: requests treats a falsy
        # value as "skip certificate verification", which would silently
        # turn TLS verification off instead of failing as the message says.
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")

    # This header is required for request redirection, in dCache for example
    if "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ:
        log.debug("Expect: 100-Continue header enabled.")
        session.headers.update({'Expect': '100-continue'})

    log.debug("Session configured and ready.")

    return session

122 

123 

def isTokenAuth() -> bool:
    """Report whether bearer-token authentication is selected.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    auth_method = os.environ.get('LSST_BUTLER_WEBDAV_AUTH')
    if auth_method is None:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")
    return auth_method == "TOKEN"

141 

142 

def refreshToken(session: requests.Session) -> None:
    """Set or update the 'Authorization' header of the session,
    configure bearer token authentication, with the value fetched
    from LSST_BUTLER_WEBDAV_TOKEN_FILE

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE does not point to a file.
    """
    # Only the environment lookup is guarded, so that a KeyError coming from
    # anywhere else is never mis-reported as a missing variable.
    try:
        token_path = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set") from None

    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    # Context manager guarantees the token file handle is closed.
    with open(token_path, 'r') as token_file:
        # Newlines are dropped so the token is usable as a header value.
        bearer_token = token_file.read().replace('\n', '')

    session.headers.update({'Authorization': 'Bearer ' + bearer_token})

162 

163 

def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Probe a remote HTTP resource with a HEAD request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is created with
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if the HEAD request answered with status 200, False otherwise.
    size : `int`
        Value of the ``Content-Length`` response header when the resource
        exists, otherwise -1.
    """
    http = getHttpSession() if session is None else session
    url = _getFileURL(path)

    log.debug("Checking if file exists: %s", url)

    response = http.head(url)
    if response.status_code != 200:
        return (False, -1)
    return (True, int(response.headers['Content-Length']))

191 

192 

def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Delete a remote HTTP resource.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is created with
        `getHttpSession` when omitted.

    Raises
    ------
    FileNotFoundError
        Raised when the DELETE request answers with anything other than
        status 200, 202 or 204.
    """
    http = getHttpSession() if session is None else session
    url = _getFileURL(path)

    log.debug("Removing file: %s", url)
    response = http.delete(url)
    if response.status_code in (200, 202, 204):
        return
    raise FileNotFoundError(f"Unable to delete resource {url}; status code: {response.status_code}")

214 

215 

def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Probe a Webdav repository URL with a HEAD request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is created with
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if the HEAD request answered with status 200, False otherwise.
    """
    http = getHttpSession() if session is None else session
    url = _getFileURL(path)

    log.debug("Checking if folder exists: %s", url)
    response = http.head(url)
    return response.status_code == 200

240 

241 

def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Detect Webdav support on a remote HTTP endpoint.

    An OPTIONS request is sent to the endpoint and the response headers
    are inspected for a ``DAV`` entry.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    isWebdav : `bool`
        True if the response carries a ``DAV`` header, False if it doesn't.
    """
    bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    url = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    response = requests.options(url, verify=bundle)
    return 'DAV' in response.headers

267 

268 

def finalurl(r: requests.Response) -> str:
    """Return the URL that follow-up requests should target.

    When the endpoint answered with a 307 redirect, the target is taken
    from its ``Location`` header; otherwise it is the URL of the response
    itself. Knowing the target up front lets a PUT send its payload once,
    straight to the final endpoint, instead of sending it again after
    the redirect and wasting bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent.
    """
    if r.status_code != 307:
        return r.url

    redirected = r.headers['Location']
    log.debug("Request redirected to %s", redirected)
    return redirected

291 

292 

def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Convert a location-like object into a URL string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    filepath : `str`
        Result of ``geturl()`` on the URI of ``path``.
    """
    # A Location already carries its URI; anything else goes through ButlerURI.
    uri = path.uri if isinstance(path, Location) else ButlerURI(path)
    return uri.geturl()

311 

312 

class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    One `requests.Session` is shared by every instance through class
    attributes. It is configured lazily, on the first access to
    ``session``, from the endpoint of whichever instance asks first.
    """

    # Session shared by all instances; plain until ``session`` configures it.
    _session = requests.Session()
    # True once ``session`` has decided which session to share.
    _sessionInitialized = False
    # Whether the shared session is refreshed from the environment (token).
    # Defaults to True so a session flagged as initialized from outside this
    # class keeps the historical token-refresh behaviour.
    _sessionFromEnv = True

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        if ButlerHttpURI._sessionInitialized:
            # The token file may have been rewritten since the last request.
            if ButlerHttpURI._sessionFromEnv and isTokenAuth():
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        if isWebdavEndpoint(baseURL):
            log.debug("%s looks like a Webdav endpoint.", baseURL)
            s = getHttpSession()
            fromEnv = True
        else:
            # Previously ``s`` was left unbound here and the property crashed.
            # A plain endpoint gets the unconfigured class-level session.
            log.debug("%s does not look like a Webdav endpoint: using a plain session.", baseURL)
            s = ButlerHttpURI._session
            fromEnv = False

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionFromEnv = fromEnv
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the URL answers with status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl())

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            0 for a dir-like URI, otherwise the ``Content-Length`` reported
            by a HEAD request.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not answer with status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl())
        if r.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(r.headers['Content-Length'])

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Raises
        ------
        ValueError
            Raised if the URI is file-like, or if the MKCOL request answers
            with a status other than 201 or 405.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # We need to test the absence of the parent directory,
            # but also if parent URL is different from self URL,
            # otherwise we could be stuck in a recursive loop
            # where self == parent. The URL comparison comes first because
            # it needs no network round trip.
            parent = self.parent()
            if parent.geturl() != self.geturl() and not parent.exists():
                parent.mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl())
            if r.status_code != 201:
                if r.status_code == 405:
                    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
                else:
                    raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request answers with a status other than
            200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl())
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True)
        if r.status_code != 200:
            raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            # iter_content() defaults to 1-byte chunks; ask for 64 KiB blocks
            # so large files are not written one byte at a time.
            for chunk in r.iter_content(chunk_size=65536):
                tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The whole body when ``size`` is not positive, otherwise the
            first chunk of at most ``size`` bytes (empty if the body is
            empty).

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream when a partial read was asked for.
        stream = size > 0
        r = self.session.get(self.geturl(), stream=stream)
        if r.status_code != 200:
            raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
        if not stream:
            return r.content
        try:
            # Default of b"" avoids a StopIteration on an empty body.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # The rest of the body is not wanted: release the connection.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request answers with a status other than
            201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Resolve a possible redirect first so the payload is sent only once.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the destination
            already exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the request
            answers with a status other than 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # The message needs two existence checks: skip them unless debug
        # output is actually going to be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(f"Transferring {src} [exists: {src.exists()}] -> "
                      f"{self} [exists: {self.exists()}] (transfer={transfer})")

        # Honour the overwrite flag, which used to be ignored.
        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Both ends are HTTP resources: let the server do the work.
            if transfer == "move":
                r = self.session.request("MOVE", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running move via MOVE HTTP request.")
            else:
                r = self.session.request("COPY", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            local_src, is_temporary = src.as_local()
            try:
                with open(local_src, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    r = self.session.put(dest_url, data=f)
            finally:
                # Clean up the temporary copy even when the upload raised.
                if is_temporary:
                    os.remove(local_src)
            log.debug("Running transfer from a local copy of the file.")

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL. This is used to detect
        if redirection is enabled before sending actual data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        # Redirects are not followed so the caller can inspect the answer.
        return self.session.put(self.geturl(), data=None,
                                headers={"Content-Length": "0"}, allow_redirects=False)