Coverage for python/lsst/resources/http.py: 17%

246 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-02-01 02:02 -0800

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import os 

21import os.path 

22import random 

23import stat 

24import tempfile 

25from typing import TYPE_CHECKING, BinaryIO, Iterator, Optional, Tuple, Union, cast 

26 

27import requests 

28from lsst.utils.timer import time_this 

29from requests.adapters import HTTPAdapter 

30from requests.auth import AuthBase 

31from urllib3.util.retry import Retry 

32 

33from ._resourceHandles import ResourceHandleProtocol 

34from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle 

35from ._resourcePath import ResourcePath 

36 

37if TYPE_CHECKING: 37 ↛ 38line 37 didn't jump to line 38, because the condition on line 37 was never true

38 from .utils import TransactionProtocol 

39 

log = logging.getLogger(__name__)


# Default timeouts for all HTTP requests, in seconds.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300

# Network timeouts may be overridden from the environment. The tuple is
# ordered (connect, read) and is passed as the ``timeout`` argument of the
# HTTP requests issued by this module.
TIMEOUT = (
    int(os.getenv("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)),
    int(os.getenv("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)),
)

# Should we send a "Expect: 100-continue" header on PUT requests?
# The "Expect: 100-continue" header is used by some servers (e.g. dCache)
# as an indication that the client knows how to handle redirects to
# the specific server that will actually receive the data for PUT
# requests. Only the presence of the variable matters, not its value.
_SEND_EXPECT_HEADER_ON_PUT = os.environ.get("LSST_HTTP_PUT_SEND_EXPECT_HEADER") is not None

59 

60 

@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    The result is memoized per distinct ``path`` value, so the remote
    server is only queried the first time a given path is checked.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.
    """
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    if ca_cert_bundle is None:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
            "some HTTPS requests may fail if remote server presents a "
            "certificate issued by an unknown certificate authority."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Fall back to the default trust store when no bundle is configured
    # (this also covers the variable being set to an empty string).
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True

    # Always bound the request with the module timeouts: without a timeout
    # an unresponsive server would block this call indefinitely.
    resp = requests.options(str(path), verify=verify, timeout=TIMEOUT)

    # The endpoint is considered WebDAV-capable when the response to the
    # OPTIONS request carries a "DAV" header.
    return "DAV" in resp.headers

88 

89 

# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system.
_TMPDIR: Optional[Tuple[str, int]] = None


def _get_temp_dir() -> Tuple[str, int]:
    """Return the temporary directory path and block size.

    The first call computes the result and stores it in the module-level
    ``_TMPDIR``; later calls return that stored tuple unchanged.

    Returns
    -------
    tmpdir : `str`
        Directory in which temporary files are to be created.
    block_size : `int`
        Buffer size, in bytes, to use when reading from or writing to
        files in that directory.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # The current working directory is the fallback when neither
    # 'LSST_RESOURCES_TMPDIR' nor 'TMPDIR' names an existing directory.
    fallback = os.getcwd()
    candidates = (os.getenv(name) for name in ("LSST_RESOURCES_TMPDIR", "TMPDIR"))
    tmpdir = next(
        (candidate for candidate in candidates if candidate and os.path.isdir(candidate)),
        fallback,
    )

    # Use the larger of 256 blocks of typical size (i.e. 4096 bytes) and
    # 10 times the file system block size. This is a reasonable compromise
    # between using memory for buffering and the number of system calls
    # issued to read from or write to temporary files.
    fs_block_size = os.statvfs(tmpdir).f_bsize
    _TMPDIR = (tmpdir, max(10 * fs_block_size, 256 * 4096))
    return _TMPDIR

119 

120 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself. If empty, no header is
        ever attached.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file which
        `_is_protected` reports as not protected.
    """

    def __init__(self, token: str):
        # Value of the token, path of the token file (if the token was
        # given as a file) and modification time of that file when it was
        # last read.
        self._token: Optional[str] = None
        self._path: Optional[str] = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            self._mtime = mtime
            with open(self._path) as f:
                self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to the request, if a token is set.

        Parameters
        ----------
        req : `requests.PreparedRequest`
            Request about to be sent.

        Returns
        -------
        req : `requests.PreparedRequest`
            The same request, with the header added when a token is
            available.
        """
        # Refresh before testing the token: if the token file was empty the
        # last time it was read, the cached token is an empty string and
        # the file would otherwise never be read again once populated.
        # This is a no-op when the token was not given as a file.
        self._refresh()
        if self._token:
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

164 

165 

class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # Maps the root URI of an endpoint to the session used to reach it.
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ResourcePath, persist: bool = True) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            If `True`, make the network connection with the front end server
            of the endpoint persistent. Connections to the back-end servers
            are never persisted. Only taken into account when the session
            for the endpoint is first created.

        Returns
        -------
        session : `requests.Session`
            The session cached for the root URI of ``rpath``, created on
            first use.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        endpoint = str(rpath.root_uri())
        try:
            return self._sessions[endpoint]
        except KeyError:
            pass

        # We don't have yet a session for this endpoint: create a new one.
        new_session = self._make_session(rpath, persist)
        self._sessions[endpoint] = new_session
        return new_session

    def _make_session(self, rpath: ResourcePath, persist: bool) -> requests.Session:
        """Make a new session configured from values from the environment.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL of a resource at the endpoint the session is created for.
        persist : `bool`
            Whether to keep one connection to the front end server open.

        Returns
        -------
        session : `requests.Session`
            Newly created and configured session.

        Raises
        ------
        PermissionError
            Raised if the bearer token file or the client private key file
            is not protected for access only by its owner.
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is set.
        """
        session = requests.Session()
        endpoint = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s (persist connection=%s)...", endpoint, persist)

        # Same retry policy for both adapters. The random component of the
        # backoff factor is drawn once per session.
        retries = Retry(
            total=3,
            connect=3,
            read=3,
            backoff_factor=5.0 + random.random(),
            status=3,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        # Persist a single connection to the front end server, if required.
        front_end_adapter = HTTPAdapter(
            pool_connections=1,
            pool_maxsize=1 if persist else 0,
            pool_block=False,
            max_retries=retries,
        )
        session.mount(endpoint, front_end_adapter)

        # Prevent persisting connections to back-end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        back_end_adapter = HTTPAdapter(
            pool_connections=1, pool_maxsize=0, pool_block=False, max_retries=retries
        )
        session.mount(f"{rpath.scheme}://", back_end_adapter)

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        ca_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        if ca_bundle:
            session.verify = ca_bundle
        else:
            session.verify = True
            log.debug(
                "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
                "if you would need to verify the remote server's certificate "
                "issued by specific certificate authorities please consider "
                "initializing this variable."
            )

        # Bearer token authentication takes precedence over client
        # certificates: when a token is configured the certificate
        # variables are not even inspected.
        token = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if token:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")

        if client_cert and not client_key:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key and not client_cert:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        if not client_cert:
            # Neither variable is set: no client authentication at all.
            log.debug(
                "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
                "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
            )
            return session

        if not _is_protected(client_key):
            raise PermissionError(
                f"Private key file at {client_key} must be protected for access only by its owner"
            )
        log.debug("... using client certificate authentication.")
        session.cert = (client_cert, client_key)
        return session

309 

310 

class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    In order to configure the behavior of the object, one environment variable
    is inspected:

    - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
        "Expect: 100-Continue" header will be added to all HTTP PUT requests.
        This header is required by some servers to detect if the client
        knows how to handle redirections. In case of redirection, the body
        of the PUT request is sent to the redirected location and not to
        the front end server.
    """

    _is_webdav: Optional[bool] = None
    _sessions_store = SessionStore()
    _put_sessions_store = SessionStore()

    # Use a session exclusively for PUT requests and another session for
    # all other requests. PUT requests may be redirected and in that case
    # the server may close the persisted connection. If that is the case
    # only the connection persisted for PUT requests will be closed and
    # the other persisted connection will be kept alive and reused for
    # other requests.

    @property
    def session(self) -> requests.Session:
        """Client session to address remote resource for all HTTP methods but
        PUT.
        """
        if hasattr(self, "_session"):
            return self._session

        self._session: requests.Session = self._sessions_store.get(self)
        return self._session

    @property
    def put_session(self) -> requests.Session:
        """Client session for uploading data to the remote resource."""
        if hasattr(self, "_put_session"):
            return self._put_session

        self._put_session: requests.Session = self._put_sessions_store.get(self)
        return self._put_session

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = _is_webdav_endpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request (following redirects) answers with
            status 200.
        """
        url = self.geturl()
        log.debug("Checking if resource exists: %s", url)
        resp = self.session.head(url, timeout=TIMEOUT, allow_redirects=True)
        return resp.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the "Content-Length" header of the response to a HEAD
            request, or 0 for directory-like resources.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not answer with status 200.
        """
        if self.dirLike:
            return 0

        resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True)
        if resp.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(resp.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Missing parent directories are created first.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is file-like, or if the server refuses to
            create the directory.
        """
        # Creating directories is only available on WebDAV backends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        url = self.geturl()

        # Create the parent first if it is missing. The parent URL must
        # differ from our own URL, otherwise we could be stuck in a
        # recursive loop where self == parent. The URL comparison is done
        # first since it is free, whereas the existence check costs a
        # request to the server.
        parent = self.parent()
        if parent.geturl() != url and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", url)
        resp = self.session.request("MKCOL", url, timeout=TIMEOUT)
        if resp.status_code == 201:
            return
        if resp.status_code == 405:
            log.debug("Can not create directory: %s may already exist: skipping.", url)
        else:
            raise ValueError(f"Can not create directory {self}, status code: {resp.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the server answers the DELETE request with a status
            other than 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        resp = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if resp.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {resp.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the server does not answer the GET request with
            status 200.
        """
        resp = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if resp.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {resp.status_code}")

            tmpdir, buffering = _get_temp_dir()
            tmp_name: Optional[str] = None
            try:
                with tempfile.NamedTemporaryFile(
                    suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
                ) as tmpFile:
                    tmp_name = tmpFile.name
                    with time_this(
                        log,
                        msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
                        args=(self, resp.headers.get("Content-Length"), tmpFile.name, buffering),
                    ):
                        for chunk in resp.iter_content(chunk_size=buffering):
                            tmpFile.write(chunk)
            except BaseException:
                # The file is created with delete=False, so a failed or
                # interrupted download would otherwise leave a partial
                # file behind. Clean up is best effort: the original
                # error is the one to report.
                if tmp_name is not None:
                    with contextlib.suppress(OSError):
                        os.remove(tmp_name)
                raise
            return tmp_name, True
        finally:
            # The response is streamed: close it explicitly so that the
            # underlying connection is released on every path, including
            # the error ones where the body has not been consumed.
            resp.close()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty if the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the server does not answer the GET request with
            status 200.
        """
        url = self.geturl()
        log.debug("Reading from remote resource: %s", url)

        # Only stream the body when a partial read was requested, so that
        # the whole resource is not downloaded needlessly.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            resp = self.session.get(url, stream=stream, timeout=TIMEOUT)
        try:
            if resp.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {resp.status_code}")
            if not stream:
                return resp.content
            # Default to empty bytes: an empty body yields no chunk at all
            # and would otherwise leak a StopIteration to the caller.
            return next(resp.iter_content(chunk_size=size), b"")
        finally:
            # A streamed response whose body is only partially consumed
            # keeps its connection checked out until it is closed.
            resp.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the server rejects the upload.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to remote %s (%d bytes)", args=(self, len(data))):
            self._do_put(data=data)

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the contents of another resource to this resource.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of
            ``self.transferModes``. ``"auto"`` is replaced by
            ``self.transferDefault``.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails if this resource
            already exists.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the server
            rejects the transfer.
        FileExistsError
            Raised if this resource exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if ``src`` is also an HTTP resource and the endpoint does
            not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit if the URIs are identical immediately.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        # Test the flag first: the existence check costs a request to the
        # server and is pointless when overwriting is allowed.
        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Both ends are HTTP resources: ask the server to copy or move
            # the resource itself. Only available on WebDAV backends.
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                method = "MOVE" if transfer == "move" else "COPY"
                log.debug("%s from %s to %s", method, src.geturl(), self.geturl())
                resp = self.session.request(
                    method, src.geturl(), headers={"Destination": self.geturl()}, timeout=TIMEOUT
                )
                if resp.status_code not in [201, 202, 204]:
                    raise ValueError(f"Can not transfer file {self}, status code: {resp.status_code}")
        else:
            # Use local file and upload it.
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        self._do_put(data=f)

            # This was an explicit move requested from a remote resource
            # try to remove that resource.
            if transfer == "move":
                # Transactions do not work here
                src.remove()

    def _do_put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request taking into account redirection.

        Parameters
        ----------
        data : `BinaryIO` or `bytes`
            Body of the request.

        Raises
        ------
        ValueError
            Raised if the server answers the upload with a status other
            than 200, 201, 202 or 204.
        """
        final_url = self.geturl()
        if _SEND_EXPECT_HEADER_ON_PUT:
            # Do a PUT request with an empty body and retrieve the final
            # destination URL returned by the server.
            headers = {"Content-Length": "0", "Expect": "100-continue"}
            resp = self.put_session.put(
                final_url, data=None, headers=headers, allow_redirects=False, timeout=TIMEOUT
            )
            if resp.is_redirect or resp.is_permanent_redirect:
                final_url = resp.headers["Location"]
                log.debug("PUT request to %s redirected to %s", self.geturl(), final_url)

        # Send data to its final destination.
        resp = self.put_session.put(final_url, data=data, timeout=TIMEOUT)
        if resp.status_code not in [200, 201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {resp.status_code}")

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: Optional[str] = None,
    ) -> Iterator[ResourceHandleProtocol]:
        """Open the resource, reading by ranges when the server allows it.

        Parameters
        ----------
        mode : `str`
            Open mode. Modes ``"r"`` and ``"rb"`` use a
            `HttpReadResourceHandle` when the server supports range
            requests; anything else is delegated to the parent class.
        encoding : `str`, optional
            Text encoding, used for text modes.

        Yields
        ------
        handle : `ResourceHandleProtocol`
            Handle on the opened resource.
        """
        url = self.geturl()
        response = self.session.head(url, timeout=TIMEOUT, allow_redirects=True)

        # The mere presence of the header is not enough: a server may send
        # "Accept-Ranges: none" precisely to state that it does NOT support
        # range requests.
        accepts_range = response.headers.get("Accept-Ranges", "none").strip().lower() != "none"

        handle: ResourceHandleProtocol
        if mode in ("rb", "r") and accepts_range:
            handle = HttpReadResourceHandle(mode, log, url=url, session=self.session, timeout=TIMEOUT)
            if mode == "r":
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
            else:
                yield handle
        else:
            with super()._openImpl(mode, encoding=encoding) as http_handle:
                yield http_handle

603 

604 

def _is_protected(filepath: str) -> bool:
    """Tell whether a file is accessible by its owner and nobody else.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.

    Returns
    -------
    protected : `bool`
        `True` if ``filepath`` is an existing file whose permission bits
        grant at least one permission to the owner and none to the group
        or to others. `False` otherwise, including when ``filepath`` is
        not a file.
    """
    if not os.path.isfile(filepath):
        return False

    permissions = stat.S_IMODE(os.stat(filepath).st_mode)
    owner_bits = permissions & stat.S_IRWXU
    non_owner_bits = permissions & (stat.S_IRWXG | stat.S_IRWXO)
    return owner_bits != 0 and non_owner_bits == 0