Coverage for python/lsst/resources/http.py: 16%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

200 statements  

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14import functools 

15import logging 

16import os 

17import os.path 

18import tempfile 

19 

20import requests 

21 

22__all__ = ("HttpResourcePath",) 

23 

24from typing import TYPE_CHECKING, Optional, Tuple, Union 

25 

26from lsst.utils.timer import time_this 

27from requests.adapters import HTTPAdapter 

28from requests.packages.urllib3.util.retry import Retry 

29 

30from ._resourcePath import ResourcePath 

31 

32if TYPE_CHECKING: 32 ↛ 33line 32 didn't jump to line 33, because the condition on line 32 was never true

33 from .utils import TransactionProtocol 

34 

35log = logging.getLogger(__name__) 

36 

37# Default timeout for all HTTP requests, in seconds 

38TIMEOUT = 20 

39 

40 

def getHttpSession() -> requests.Session:
    """Create a requests.Session pre-configured with environment variable data.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if the selected authentication method needs an environment
        variable (proxy certificate or token file) that is not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is set to something other than
        X509 or TOKEN.

    Notes
    -----
    The following environment variables are consulted:
    - (OPTIONAL) LSST_BUTLER_WEBDAV_CA_BUNDLE: the location of additional
      CA certificates to trust when using HTTPS to communicate with the
      endpoint. If it is not set, the default certificate verification of
      ``requests`` stays enabled.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_AUTH: which authentication method to
      use. Possible values are X509 and TOKEN. If it is not set, an
      unauthenticated session is returned.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    ca_bundle = os.environ.get("LSST_BUTLER_WEBDAV_CA_BUNDLE")
    if ca_bundle:
        session.verify = ca_bundle
    else:
        # Leave session.verify at its default (True). Assigning None or an
        # empty string here would make requests skip server certificate
        # verification altogether.
        log.debug(
            "Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
            "If you would like to trust additional CAs, please consider "
            "exporting this variable."
        )

    env_auth_method = os.environ.get("LSST_BUTLER_WEBDAV_AUTH")
    if env_auth_method is None:
        log.debug("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, no authentication configured.")
        log.debug("Unauthenticated session configured and ready.")
        return session

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ["LSST_BUTLER_WEBDAV_PROXY_CERT"]
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The same file is passed as both the certificate and the key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    log.debug("Authenticated session configured and ready.")
    return session

106 

107 

def useExpect100() -> bool:
    """Report whether the "Expect: 100-Continue" header should be sent.

    Returns
    -------
    useExpect100 : `bool`
        True if LSST_BUTLER_WEBDAV_EXPECT100 is present in the environment
        (whatever its value), False otherwise.
    """
    # Some endpoints, dCache for example, need this header in order to
    # redirect requests.
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        log.debug("Expect: 100-Continue header enabled.")
    return enabled

121 

122 

def isTokenAuth() -> bool:
    """Return the status of bearer-token authentication.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise
        (including when the variable is not set at all).

    Notes
    -----
    An unset LSST_BUTLER_WEBDAV_AUTH is a supported configuration:
    `getHttpSession` returns an unauthenticated session in that case.
    This function therefore reports `False` rather than raising, so that
    code checking the authentication mode of such a session keeps working.
    """
    return os.environ.get("LSST_BUTLER_WEBDAV_AUTH") == "TOKEN"

141 

142 

def refreshToken(session: requests.Session) -> None:
    """Refresh the session token.

    Set or update the 'Authorization' header of the session,
    configure bearer token authentication, with the value fetched
    from LSST_BUTLER_WEBDAV_TOKEN_FILE

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE does not point to an
        existing file.
    """
    try:
        token_path = os.environ["LSST_BUTLER_WEBDAV_TOKEN_FILE"]
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set") from None

    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    with open(token_path, "r") as fh:
        # Join all lines, dropping line endings and any whitespace around
        # each line, so that stray spaces or tabs in the file do not end up
        # in the header value.
        bearer_token = "".join(line.strip() for line in fh)

    session.headers.update({"Authorization": "Bearer " + bearer_token})

165 

166 

@functools.lru_cache
def isWebdavEndpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.

    Notes
    -----
    The result is memoized per ``path`` argument, so the endpoint is only
    queried the first time a given value is passed.
    """
    # An empty value is treated like an unset variable: passing it through
    # as ``verify`` would turn certificate verification off.
    ca_bundle = os.environ.get("LSST_BUTLER_WEBDAV_CA_BUNDLE") or None
    if ca_bundle is None:
        log.warning(
            "Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
            "some HTTPS requests will fail. If you intend to use HTTPS, please "
            "export this variable."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    # Bound the request like every other request in this module; without a
    # timeout an unresponsive server would block the caller indefinitely.
    r = requests.options(str(path), verify=ca_bundle, timeout=TIMEOUT)
    return "DAV" in r.headers

196 

197 

def finalurl(r: requests.Response) -> str:
    """Calculate the final URL, including redirects.

    Check whether the remote HTTP endpoint redirects to a different
    endpoint, and return the final destination of the request.
    This is needed when using PUT operations, to avoid starting
    to send the data to the endpoint, before having to send it again once
    the redirect response is received, and thus wasting bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent. This is the
        URL of the response itself unless the response is a 307 or 308
        redirect carrying a ``Location`` header.
    """
    from urllib.parse import urljoin

    destination_url = r.url
    # 307 and 308 are the redirect codes that keep the request method and
    # body, which is what a redirected PUT needs.
    if r.status_code in (307, 308):
        location = r.headers.get("Location")
        if location:
            # The Location header may be a relative reference; resolve it
            # against the URL that was requested.
            destination_url = urljoin(r.url, location)
            log.debug("Request redirected to %s", destination_url)
    return destination_url

222 

223 

class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    The `requests.Session` used to talk to the remote server is stored on
    the class, so it is shared by all instances of the same class.
    """

    _session = requests.Session()
    _sessionInitialized = False
    _is_webdav: Optional[bool] = None

    # Number of bytes requested per chunk when streaming a download to a
    # local file.
    _DOWNLOAD_CHUNK_SIZE = 1024 * 1024

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        cls = type(self)
        if cls._sessionInitialized:
            # With bearer-token authentication the token file is re-read on
            # every access and the Authorization header is updated.
            if isTokenAuth():
                refreshToken(cls._session)
            return cls._session

        s = getHttpSession()
        cls._session = s
        cls._sessionInitialized = True
        return s

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is None:
            self._is_webdav = isWebdavEndpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request on the resource returns status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the ``Content-Length`` header returned by a HEAD
            request, or 0 for a directory-like URI.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(r.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is file-like, or if the server refuses to
            create the directory.
        """
        # Only available on WebDAV backends
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # Create the missing parent first. The URL comparison guards
        # against endless recursion when the parent is the resource itself.
        parent = self.parent()
        if parent.geturl() != self.geturl() and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
        if r.status_code == 201:
            return
        if r.status_code == 405:
            log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
        else:
            raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request does not return status 200, 202
            or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        # The response is streamed, so use it as a context manager to make
        # sure the connection is released even if the download fails.
        with self.session.get(self.geturl(), stream=True, timeout=TIMEOUT) as r:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                with time_this(log, msg="Downloading %s to local file", args=(self,)):
                    # iter_content() defaults to one byte per chunk, so an
                    # explicit chunk size is required for a usable download
                    # speed.
                    for chunk in r.iter_content(chunk_size=self._DOWNLOAD_CHUNK_SIZE):
                        tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read. When ``size`` is positive this is the first
            chunk of at most ``size`` bytes delivered by the server, or
            empty bytes if the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            # Using the response as a context manager releases the
            # connection when only part of a streamed body is consumed.
            with self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT) as r:
                if r.status_code != 200:
                    raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
                if not stream:
                    return r.content
                # An empty body yields no chunk at all; return empty bytes
                # instead of letting StopIteration escape.
                return next(r.iter_content(chunk_size=size), b"")

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request does not return status 201, 202
            or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Find out where the data must be sent before sending any of it.
        dest_url = finalurl(self._emptyPut())
        with time_this(log, msg="Write data to remote %s", args=(self,)):
            r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the destination
            already exists. If `True` an existing destination is replaced.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the server
            does not return status 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if the source is also an HTTP resource but the endpoint
            does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        server_side = isinstance(src, type(self))
        if server_side:
            # Only available on WebDAV backends
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                method = "MOVE" if transfer == "move" else "COPY"
                r = self.session.request(
                    method, src.geturl(), headers={"Destination": self.geturl()}, timeout=TIMEOUT
                )
                if method == "MOVE":
                    log.debug("Running move via MOVE HTTP request.")
                else:
                    log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        r = self.session.put(dest_url, data=f, timeout=TIMEOUT)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # An explicit move that went through a local copy leaves the source
        # in place, so remove it here. A WebDAV MOVE has already removed the
        # source on the server; deleting it again would fail.
        if transfer == "move" and not server_side:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL.

        This is used to detect if redirection is enabled before sending actual
        data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        # Redirects are not followed so that the caller can inspect the
        # redirect response itself.
        return self.session.put(
            self.geturl(), data=None, headers=headers, allow_redirects=False, timeout=TIMEOUT
        )