Coverage for python/lsst/resources/http.py: 18%

230 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-04 02:17 -0700

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14import functools 

15import logging 

16import os 

17import os.path 

18import random 

19import stat 

20import tempfile 

21 

22import requests 

23 

24__all__ = ("HttpResourcePath",) 

25 

26from typing import TYPE_CHECKING, BinaryIO, Optional, Tuple, Union 

27 

28from lsst.utils.timer import time_this 

29from requests.adapters import HTTPAdapter 

30from requests.auth import AuthBase 

31from urllib3.util.retry import Retry 

32 

33from ._resourcePath import ResourcePath 

34 

35if TYPE_CHECKING: 35 ↛ 36line 35 didn't jump to line 36, because the condition on line 35 was never true

36 from .utils import TransactionProtocol 

37 

38log = logging.getLogger(__name__) 

39 

40 

41# Default timeouts for all HTTP requests, in seconds. 

42DEFAULT_TIMEOUT_CONNECT = 60 

43DEFAULT_TIMEOUT_READ = 300 

44 

45# Allow for network timeouts to be set in the environment. 

46TIMEOUT = ( 

47 int(os.environ.get("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)), 

48 int(os.environ.get("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)), 

49) 

50 

51# Should we send a "Expect: 100-continue" header on PUT requests? 

52# The "Expect: 100-continue" header is used by some servers (e.g. dCache) 

53# as an indication that the client knows how to handle redirects to 

54# the specific server that will actually receive the data for PUT 

55# requests. 

56_SEND_EXPECT_HEADER_ON_PUT = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ 

57 

58 

@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    The result is memoized per ``path`` value, so the server is queried at
    most once for each distinct argument.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.
    """
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    if ca_cert_bundle is None:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
            "some HTTPS requests may fail if remote server presents a "
            "certificate issued by an unknown certificate authority."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Verify the server certificate against the configured bundle when there
    # is one, otherwise against the default trust store.
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True

    # Bound the probe with the module-wide timeouts, like every other request
    # in this module; without a timeout an unresponsive server would block
    # the caller indefinitely.
    resp = requests.options(str(path), verify=verify, timeout=TIMEOUT)

    # The presence of a "DAV" header in the OPTIONS response is what is used
    # to classify the endpoint as WebDAV.
    return "DAV" in resp.headers

86 

87 

88# Tuple (path, block_size) pointing to the location of a local directory 

89# to save temporary files and the block size of the underlying file system. 

90_TMPDIR: Optional[Tuple[str, int]] = None 

91 

92 

93def _get_temp_dir() -> Tuple[str, int]: 

94 """Return the temporary directory path and block size. 

95 

96 This function caches its results in _TMPDIR. 

97 """ 

98 global _TMPDIR 

99 if _TMPDIR: 

100 return _TMPDIR 

101 

102 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or 

103 # 'TMPDIR', if defined. Otherwise use current working directory. 

104 tmpdir = os.getcwd() 

105 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")): 

106 if dir and os.path.isdir(dir): 

107 tmpdir = dir 

108 break 

109 

110 # Compute the block size as 256 blocks of typical size 

111 # (i.e. 4096 bytes) or 10 times the file system block size, 

112 # whichever is higher. This is a reasonable compromise between 

113 # using memory for buffering and the number of system calls 

114 # issued to read from or write to temporary files. 

115 fsstats = os.statvfs(tmpdir) 

116 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))) 

117 

118 

class BearerTokenAuth(AuthBase):
    """Authentication hook adding a bearer token 'Authorization' header to
    each request.

    Parameters
    ----------
    token : `str`
        Either the token itself or the path to a local file containing the
        token. When it is the path of an existing file, that file must be
        protected so that only its owner can access it.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file which is not
        protected for access only by its owner.
    """

    def __init__(self, token: str):
        # Modification time of the token file the last time it was read;
        # a negative value forces the first read.
        self._mtime: float = -1.0
        self._path = None
        self._token = token if token else None
        if self._token is None or not os.path.isfile(token):
            # Either there is no token at all or the value is the token
            # itself: there is no file to load.
            return

        token_path = os.path.abspath(token)
        self._path = token_path
        if not _is_protected(token_path):
            raise PermissionError(
                f"Bearer token file at {self._path} must be protected for access only by its owner"
            )
        self._refresh()

    def _refresh(self) -> None:
        """Reload the token from its file, if there is one and it was
        modified after the last time it was read.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # The file has not changed since we last loaded it.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to the request when a token is
        available and return the request.
        """
        if not self._token:
            return req

        # Pick up a token file modified since the previous request.
        self._refresh()
        req.headers["Authorization"] = f"Bearer {self._token}"
        return req

162 

163 

class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # Sessions created so far, keyed by the root URI of their endpoint.
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ResourcePath, persist: bool = True) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            If `True`, the session is configured to keep one connection to
            the front end server of the endpoint for reuse. Connections to
            any other server reached through the session (e.g. back end
            servers) are configured with a pool size of zero regardless of
            this flag. The flag is only used when the session for the
            endpoint is first created; an already cached session is
            returned unchanged.

        Returns
        -------
        session : `requests.Session`
            The session shared by all the paths under the root URI of
            ``rpath``.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        session = self._sessions.get(root_uri)
        if session is None:
            # No session exists yet for this endpoint: create and cache one.
            session = self._make_session(rpath, persist)
            self._sessions[root_uri] = session
        return session

    def _make_session(self, rpath: ResourcePath, persist: bool) -> requests.Session:
        """Make a new session configured from values from the environment.

        Raises
        ------
        PermissionError
            Raised if the bearer token file or the client private key file
            is not protected for access only by its owner.
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is initialized.
        """
        root_uri = str(rpath.root_uri())
        session = requests.Session()
        log.debug("Creating new HTTP session for endpoint %s (persist connection=%s)...", root_uri, persist)

        # Retry policy shared by both adapters. The random component added to
        # the backoff factor varies the delay between retries from one
        # session to another.
        retry_policy = Retry(
            total=3,
            connect=3,
            read=3,
            status=3,
            backoff_factor=5.0 + random.random(),
            status_forcelist=[429, 500, 502, 503, 504],
        )

        def new_adapter(pool_maxsize: int) -> HTTPAdapter:
            # Adapters only differ in the number of connections they keep.
            return HTTPAdapter(
                pool_connections=1, pool_maxsize=pool_maxsize, pool_block=False, max_retries=retry_policy
            )

        # Persist a single connection to the front end server, if required.
        session.mount(root_uri, new_adapter(1 if persist else 0))

        # Do not persist connections to back end servers, which may vary from
        # request to request: systematically persisting connections to those
        # servers may exhaust their capabilities when there are thousands of
        # simultaneous clients.
        # NOTE(review): confirm that a pool size of 0 really prevents the
        # underlying connection pool from keeping connections; queue-based
        # pools commonly treat a maximum size of 0 as "unbounded".
        session.mount(f"{rpath.scheme}://", new_adapter(0))

        # Authenticate the server against a specific CA certificate bundle
        # when one is configured, otherwise against the default one.
        ca_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        session.verify = ca_bundle if ca_bundle else True
        if not ca_bundle:
            log.debug(
                "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
                "if you would need to verify the remote server's certificate "
                "issued by specific certificate authorities please consider "
                "initializing this variable."
            )

        # A bearer token, when configured, takes precedence over any client
        # certificate.
        token = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if token:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Otherwise use a client certificate and its private key, which must
        # be provided together.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")

        if client_cert and not client_key:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key and not client_cert:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        if not client_cert:
            # Neither of them was provided.
            log.debug(
                "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
                "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
            )
            return session

        if not _is_protected(client_key):
            raise PermissionError(
                f"Private key file at {client_key} must be protected for access only by its owner"
            )
        log.debug("... using client certificate authentication.")
        session.cert = (client_cert, client_key)
        return session

307 

308 

class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    In order to configure the behavior of the object, one environment variable
    is inspected:

    - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), each upload
        is preceded by a PUT request with an empty body carrying an
        "Expect: 100-continue" header. This header is required by some
        servers to detect if the client knows how to handle redirections.
        In case of redirection, the body of the PUT request is sent to the
        redirected location and not to the front end server.
    """

    # Cached result of the WebDAV detection; None until first requested.
    _is_webdav: Optional[bool] = None

    # Use a session exclusively for PUT requests and another session for
    # all other requests. PUT requests may be redirected and in that case
    # the server may close the persisted connection. If that is the case
    # only the connection persisted for PUT requests will be closed and
    # the other persisted connection will be kept alive and reused for
    # other requests.
    _sessions_store = SessionStore()
    _put_sessions_store = SessionStore()

    @property
    def session(self) -> requests.Session:
        """Client session to address remote resource for all HTTP methods but
        PUT.
        """
        if hasattr(self, "_session"):
            return self._session

        self._session: requests.Session = self._sessions_store.get(self)
        return self._session

    @property
    def put_session(self) -> requests.Session:
        """Client session for uploading data to the remote resource."""
        if hasattr(self, "_put_session"):
            return self._put_session

        self._put_session: requests.Session = self._put_sessions_store.get(self)
        return self._put_session

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = _is_webdav_endpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request for the resource is answered with
            status 200, `False` for any other status.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        return resp.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the "Content-Length" header of the response to a HEAD
            request, or 0 for directory-like resources.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request is not answered with status 200.
        """
        if self.dirLike:
            return 0

        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        if resp.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(resp.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Missing parent directories are created first.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is file-like or if the server refuses to
            create the directory.
        """
        # Creating directories is only available on WebDAV backends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # We need to test the absence of the parent directory,
        # but also if parent URL is different from self URL,
        # otherwise we could be stuck in a recursive loop
        # where self == parent.
        if not self.parent().exists() and self.parent().geturl() != self.geturl():
            self.parent().mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        resp = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
        if resp.status_code == 201:
            return

        if resp.status_code == 405:
            # The server does not allow MKCOL on this URL, which is treated
            # as "the directory is already there".
            log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
        else:
            raise ValueError(f"Can not create directory {self}, status code: {resp.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request is not answered with status 200,
            202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        resp = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if resp.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {resp.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status 200.
        """
        resp = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if resp.status_code != 200:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status code: {resp.status_code}"
                )

            tmpdir, buffering = _get_temp_dir()
            # The file must outlive this method (delete=False): the caller
            # is told it is temporary through the second returned value.
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                with time_this(
                    log,
                    msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
                    args=(self, resp.headers.get("Content-Length"), tmpFile.name, buffering),
                ):
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
        finally:
            # A streamed response holds on to its connection until the body
            # is fully consumed or the response is closed: close it so the
            # connection is also given up when the status is unexpected or
            # the download fails half way.
            resp.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The whole contents of the resource or, when ``size`` is
            positive, the first chunk of at most ``size`` bytes. Empty if
            the resource has no contents.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream the body when a limited number of bytes is requested,
        # so that the rest of the resource is not downloaded.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            resp = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if resp.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {resp.status_code}")
            if not stream:
                return resp.content
            # An empty body yields no chunk at all: return empty bytes
            # instead of letting StopIteration escape to the caller.
            return next(resp.iter_content(chunk_size=size), b"")
        finally:
            # The body of a streamed response is deliberately not read to
            # the end, so the response must be closed explicitly for its
            # connection to be given up.
            resp.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the server does not report the upload as successful.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to remote %s (%d bytes)", args=(self, len(data))):
            self._do_put(data=data)

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            transfer modes supported by this class.
        overwrite : `bool`, optional
            Allow an existing destination to be replaced. By default an
            existing destination makes the transfer fail.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the server
            does not report the transfer as successful.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if the source is also an HTTP resource and the endpoint
            does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit if the URIs are identical immediately.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if self.exists() and not overwrite:
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Both ends are HTTP resources: ask the server to do the
            # transfer. Only available on WebDAV backends.
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                method = "MOVE" if transfer == "move" else "COPY"
                log.debug("%s from %s to %s", method, src.geturl(), self.geturl())
                resp = self.session.request(
                    method, src.geturl(), headers={"Destination": self.geturl()}, timeout=TIMEOUT
                )
                if resp.status_code not in [201, 202, 204]:
                    raise ValueError(f"Can not transfer file {self}, status code: {resp.status_code}")
        else:
            # Use local file and upload it.
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        self._do_put(data=f)

            # This was an explicit move requested from a remote resource
            # try to remove that resource.
            if transfer == "move":
                # Transactions do not work here
                src.remove()

    def _do_put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request taking into account redirection.

        Parameters
        ----------
        data : `bytes` or `BinaryIO`
            The contents to upload, either in memory or as an open binary
            file.

        Raises
        ------
        ValueError
            Raised if the server does not answer the upload with one of the
            statuses 200, 201, 202 or 204.
        """
        final_url = self.geturl()
        if _SEND_EXPECT_HEADER_ON_PUT:
            # Do a PUT request with an empty body and retrieve the final
            # destination URL returned by the server.
            headers = {"Content-Length": "0", "Expect": "100-continue"}
            resp = self.put_session.put(
                final_url, data=None, headers=headers, allow_redirects=False, timeout=TIMEOUT
            )
            if resp.is_redirect or resp.is_permanent_redirect:
                final_url = resp.headers["Location"]
                log.debug("PUT request to %s redirected to %s", self.geturl(), final_url)

        # Send data to its final destination.
        resp = self.put_session.put(final_url, data=data, timeout=TIMEOUT)

        # A server may answer a successful PUT which replaces an existing
        # resource with 200 (OK) as well as with 204 (No Content), so 200
        # must not be reported as a failure.
        if resp.status_code not in (200, 201, 202, 204):
            raise ValueError(f"Can not write file {self}, status code: {resp.status_code}")

576 

577 

578def _is_protected(filepath: str) -> bool: 

579 """Return true if the permissions of file at filepath only allow for access 

580 by its owner. 

581 

582 Parameters 

583 ---------- 

584 filepath : `str` 

585 Path of a local file. 

586 """ 

587 if not os.path.isfile(filepath): 

588 return False 

589 mode = stat.S_IMODE(os.stat(filepath).st_mode) 

590 owner_accessible = bool(mode & stat.S_IRWXU) 

591 group_accessible = bool(mode & stat.S_IRWXG) 

592 other_accessible = bool(mode & stat.S_IRWXO) 

593 return owner_accessible and not group_accessible and not other_accessible