Coverage for python/lsst/resources/http.py: 15%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

211 statements  

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14import functools 

15import logging 

16import os 

17import os.path 

18import tempfile 

19 

20import requests 

21 

22__all__ = ("HttpResourcePath",) 

23 

24from typing import TYPE_CHECKING, Optional, Tuple, Union 

25 

26from lsst.utils.timer import time_this 

27from requests.adapters import HTTPAdapter 

28from urllib3.util.retry import Retry 

29 

30from ._resourcePath import ResourcePath 

31 

32if TYPE_CHECKING: 32 ↛ 33line 32 didn't jump to line 33, because the condition on line 32 was never true

33 from .utils import TransactionProtocol 

34 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Default timeout for all HTTP requests, in seconds
TIMEOUT = 20

39 

40 

def getHttpSession() -> requests.Session:
    """Create a requests.Session pre-configured with environment variable data.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if X509 authentication is requested but
        LSST_BUTLER_WEBDAV_PROXY_CERT is not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is set to a value other than
        X509 or TOKEN.

    Notes
    -----
    The following environment variables are consulted:

    - LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
      certificates are stored if you intend to use HTTPS to
      communicate with the endpoint. If unset, the default certificate
      verification of ``requests`` is left untouched.
    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN. If unset, an unauthenticated
      session is returned.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    # Only override the session default when a bundle is actually configured.
    # Assigning None (or an empty string) to ``session.verify`` is falsy and
    # can switch certificate verification off instead of keeping the default.
    ca_bundle = os.environ.get("LSST_BUTLER_WEBDAV_CA_BUNDLE")
    if ca_bundle:
        session.verify = ca_bundle
    else:
        log.debug(
            "Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
            "If you would like to trust additional CAs, please consider "
            "exporting this variable."
        )

    try:
        env_auth_method = os.environ["LSST_BUTLER_WEBDAV_AUTH"]
    except KeyError:
        log.debug("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, no authentication configured.")
        log.debug("Unauthenticated session configured and ready.")
        return session

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ["LSST_BUTLER_WEBDAV_PROXY_CERT"]
        except KeyError:
            # The original lookup failure adds nothing to the message below.
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The proxy certificate file is used as both certificate and key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    log.debug("Authenticated session configured and ready.")
    return session

106 

107 

def useExpect100() -> bool:
    """Tell whether the "Expect: 100-Continue" header must be sent.

    Returns
    -------
    useExpect100 : `bool`
        True if LSST_BUTLER_WEBDAV_EXPECT100 is present in the environment
        (whatever its value), False otherwise.
    """
    # This header is required for request redirection, in dCache for example
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        log.debug("Expect: 100-Continue header enabled.")
    return enabled

121 

122 

def isTokenAuth() -> bool:
    """Return the status of bearer-token authentication.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise
        (including when the variable is not set at all).

    Notes
    -----
    An unset LSST_BUTLER_WEBDAV_AUTH is a supported configuration that
    yields an unauthenticated session, so it is reported as "not token
    authentication" rather than raised as an error.
    """
    return os.environ.get("LSST_BUTLER_WEBDAV_AUTH") == "TOKEN"

141 

142 

def refreshToken(session: requests.Session) -> None:
    """Load the bearer token from disk into the session headers.

    The token is read from the file named by LSST_BUTLER_WEBDAV_TOKEN_FILE
    and stored, with newlines removed, in the session's 'Authorization'
    header as a bearer token.

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the configured path is not an existing file.
    """
    token_path = os.environ.get("LSST_BUTLER_WEBDAV_TOKEN_FILE")
    if token_path is None:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set")
    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    with open(token_path, "r") as token_file:
        # Newlines anywhere in the file are dropped, not only a trailing one
        bearer_token = "".join(token_file.read().split("\n"))

    session.headers.update({"Authorization": "Bearer " + bearer_token})

165 

166 

@functools.lru_cache
def isWebdavEndpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    The result is memoized per ``path`` value, so each distinct URL is
    probed at most once per process.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.
    """
    ca_bundle = os.environ.get("LSST_BUTLER_WEBDAV_CA_BUNDLE")
    if not ca_bundle:
        log.warning(
            "Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
            "some HTTPS requests will fail. If you intend to use HTTPS, please "
            "export this variable."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    # Without a configured bundle keep default verification (True): a falsy
    # ``verify`` value can disable certificate checks. The timeout stops a
    # silent server from blocking the caller indefinitely, in line with every
    # other request issued by this module.
    r = requests.options(str(path), verify=ca_bundle if ca_bundle else True, timeout=TIMEOUT)
    # A WebDAV-capable server advertises its compliance classes in "DAV"
    return "DAV" in r.headers

196 

197 

def finalurl(r: requests.Response) -> str:
    """Work out where data for this request should really be sent.

    If the response is a 307 redirect, the target given in its "Location"
    header is returned; otherwise the URL of the response itself is
    returned. Resolving the target up front lets PUT operations send their
    payload once, directly to the final endpoint, instead of sending it
    again after the redirect is received.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent.
    """
    if r.status_code != 307:
        return r.url

    redirect_target = r.headers["Location"]
    log.debug("Request redirected to %s", redirect_target)
    return redirect_target

222 

223 

224# Tuple (path, block_size) pointing to the location of a local directory 

225# to save temporary files and the block size of the underlying file system 

226_TMPDIR: Optional[Tuple[str, int]] = None 

227 

228 

229def _get_temp_dir() -> Tuple[str, int]: 

230 """Return the temporary directory path and block size. 

231 

232 This function caches its results in _TMPDIR. 

233 """ 

234 global _TMPDIR 

235 if _TMPDIR: 

236 return _TMPDIR 

237 

238 # Use the value of environment variables 'LSST_BUTLER_TMPDIR' or 

239 # 'TMPDIR', if defined. Otherwise use current working directory 

240 tmpdir = os.getcwd() 

241 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")): 

242 if dir and os.path.isdir(dir): 

243 tmpdir = dir 

244 break 

245 

246 # Compute the block size as 256 blocks of typical size 

247 # (i.e. 4096 bytes) or 10 times the file system block size, 

248 # whichever is higher. This is a reasonable compromise between 

249 # using memory for buffering and the number of system calls 

250 # issued to read from or write to temporary files 

251 fsstats = os.statvfs(tmpdir) 

252 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))) 

253 

254 

class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource."""

    # Session shared through the class; replaced on first use by the
    # environment-configured session returned by getHttpSession().
    _session = requests.Session()
    _sessionInitialized = False
    # Memo of the WebDAV capability probe; None until first checked.
    _is_webdav: Optional[bool] = None

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        cls = type(self)
        if not cls._sessionInitialized:
            cls._session = getHttpSession()
            cls._sessionInitialized = True
            return cls._session

        # An unauthenticated session (LSST_BUTLER_WEBDAV_AUTH unset) is a
        # supported configuration, so a missing variable must not make
        # reuse of the session fail.
        try:
            token_auth = isTokenAuth()
        except KeyError:
            token_auth = False
        if token_auth:
            # Re-read the token file on every access to pick up renewals.
            refreshToken(cls._session)
        return cls._session

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is None:
            self._is_webdav = isWebdavEndpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the URL answers with status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            0 for directory-like URIs, otherwise the value of the
            "Content-Length" header of a HEAD response.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not answer with status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(r.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is file-like, or if the server rejects the
            request with a status other than 201 or 405.
        """
        # Only available on WebDAV backends
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # Create missing ancestors first. The URL comparison guards
        # against endless recursion where self == parent; it is tested
        # first because it needs no network round trip.
        parent = self.parent()
        if parent.geturl() != self.geturl() and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
        if r.status_code == 201:
            return
        if r.status_code == 405:
            log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
        else:
            raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request does not answer with status
            200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")

            tmpdir, buffering = _get_temp_dir()
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                try:
                    with time_this(
                        log,
                        msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
                        args=(self, r.headers.get("Content-Length"), tmpFile.name, buffering),
                    ):
                        for chunk in r.iter_content(chunk_size=buffering):
                            tmpFile.write(chunk)
                except BaseException:
                    # The file is created with delete=False, so a failed
                    # download would otherwise leave a partial file behind.
                    tmpFile.close()
                    os.remove(tmpFile.name)
                    raise
        finally:
            # A streamed response holds its connection until it is closed
            # or fully consumed; always release it.
            r.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The requested content; empty if the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Stream only for partial reads so that the whole body is not
        # downloaded when just the first ``size`` bytes are wanted.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # The default avoids StopIteration on an empty body.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # A partially consumed streamed response keeps its connection
            # checked out of the pool until it is closed.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request does not answer with status
            201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Resolve a possible redirect first so the payload is sent once.
        dest_url = finalurl(self._emptyPut())
        with time_this(log, msg="Write data to remote %s", args=(self,)):
            r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the destination
            already exists. If `True` an existing destination is replaced.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the server
            does not answer with status 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if ``src`` is of the same type as this resource but the
            endpoint does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Honour the caller's choice: an existing destination is only an
        # error when overwriting has not been requested.
        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # Whether the source must still be deleted by us once the data has
        # arrived. A server-side MOVE already removes it.
        remove_source = False

        if isinstance(src, type(self)):
            # Only available on WebDAV backends
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                if transfer == "move":
                    r = self.session.request(
                        "MOVE", src.geturl(), headers={"Destination": self.geturl()}, timeout=TIMEOUT
                    )
                    log.debug("Running move via MOVE HTTP request.")
                else:
                    r = self.session.request(
                        "COPY", src.geturl(), headers={"Destination": self.geturl()}, timeout=TIMEOUT
                    )
                    log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        r = self.session.put(dest_url, data=f, timeout=TIMEOUT)
            remove_source = transfer == "move"

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move implemented as an upload of a copy:
        # the original still exists, so try to remove it.
        if remove_source:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL.

        This is used to detect if redirection is enabled before sending actual
        data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        # Redirects are not followed so that the caller can inspect a 307
        # response and its "Location" header.
        return self.session.put(
            self.geturl(), data=None, headers=headers, allow_redirects=False, timeout=TIMEOUT
        )
496 )