Coverage for python/lsst/daf/butler/core/_butlerUri/http.py: 18%

228 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-01 19:55 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import functools 

25import logging 

26import os 

27import os.path 

28import random 

29import stat 

30import tempfile 

31 

32import requests 

33 

34__all__ = ("ButlerHttpURI",) 

35 

36from typing import TYPE_CHECKING, BinaryIO, Optional, Tuple, Union 

37 

38from requests.adapters import HTTPAdapter 

39from requests.auth import AuthBase 

40from urllib3.util.retry import Retry 

41 

42from ..utils import time_this 

43from ._butlerUri import ButlerURI 

44from .utils import NoTransaction 

45 

46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true

47 from ..datastore import DatastoreTransaction 

48 

log = logging.getLogger(__name__)


# Default timeouts for all HTTP requests, in seconds.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300


def _timeout_from_env(variable: str, default: int) -> int:
    """Return the timeout configured in an environment variable.

    Parameters
    ----------
    variable : `str`
        Name of the environment variable to inspect.
    default : `int`
        Value, in seconds, returned when the variable is not set.

    Returns
    -------
    timeout : `int`
        Timeout in seconds.

    Raises
    ------
    ValueError
        Raised if the variable is set to something that is not an integer.
        The message names the variable so that the misconfiguration can be
        located, which a bare ``int()`` failure does not allow.
    """
    value = os.environ.get(variable)
    if value is None:
        return default
    try:
        return int(value)
    except ValueError:
        raise ValueError(
            f"Environment variable {variable} must be an integer number of seconds, got {value!r}"
        ) from None


# Allow for network timeouts to be set in the environment. The tuple is
# (connect timeout, read timeout) and is passed as the ``timeout`` argument
# of the HTTP requests issued by this module.
TIMEOUT = (
    _timeout_from_env("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT),
    _timeout_from_env("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ),
)

# Should we send a "Expect: 100-continue" header on PUT requests?
# The "Expect: 100-continue" header is used by some servers (e.g. dCache)
# as an indication that the client knows how to handle redirects to
# the specific server that will actually receive the data for PUT
# requests.
# Only the presence of the variable matters: its value is not inspected.
_SEND_EXPECT_HEADER_ON_PUT = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ

68 

69 

class BearerTokenAuth(AuthBase):
    """Authentication hook adding a bearer token to outgoing requests.

    Parameters
    ----------
    token : `str`
        Either the token value itself or the path of a local file which
        contains the token. When it is a file, its permissions must only
        allow access by its owner. An empty value disables the hook: no
        header is added to the requests.
    """

    def __init__(self, token: str):
        self._mtime: float = -1.0
        self._path = None
        self._token = None
        if token:
            # Until proven to be a file path, the value is the token itself.
            self._token = token
            if os.path.isfile(token):
                self._path = os.path.abspath(token)
                if not _is_protected(self._path):
                    raise PermissionError(
                        f"Bearer token file at {self._path} must be protected for access only by its owner"
                    )
                self._refresh()

    def _refresh(self) -> None:
        """Reload the token from its file when the file has been modified
        after the last time it was read. Does nothing if the token was not
        given as a file.
        """
        if not self._path:
            return

        modified = os.stat(self._path).st_mtime
        if modified <= self._mtime:
            # The file has not changed since we last read it.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = modified
        with open(self._path) as token_file:
            self._token = token_file.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to the request, if a token is
        available, and return the request.
        """
        if not self._token:
            return req

        # Pick up a renewed token, if any, before using it.
        self._refresh()
        req.headers["Authorization"] = f"Bearer {self._token}"
        return req

113 

114 

class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # Maps the string form of an endpoint's root URI to the session
        # used for all the resources under that root.
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ButlerHttpURI, persist: bool = True) -> requests.Session:
        """Return the session to use for the remote resource at rpath.

        Parameters
        ----------
        rpath : `ButlerHttpURI`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            If `True`, the adapter mounted for the root URI of the endpoint
            (its front end server) is allowed to keep one connection in its
            pool. The adapter mounted for any other host of the same scheme
            is always configured with a pool size of zero, with the intent
            of not keeping connections to back-end servers.
            Only used when the session is created, i.e. the first time a
            session is requested for the endpoint.

        Returns
        -------
        session : `requests.Session`
            The cached session for the endpoint, created on first use.

        Notes
        -----
        The cache is keyed by the root URI of the resource, so a single
        session is shared by all the paths under the same endpoint, e.g.
        "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        A new session is configured from these environment variables:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        endpoint = str(rpath.root_uri())
        session = self._sessions.get(endpoint)
        if session is None:
            # First request for this endpoint: create and remember a session.
            session = self._make_session(rpath, persist)
            self._sessions[endpoint] = session
        return session

    def _make_session(self, rpath: ButlerHttpURI, persist: bool) -> requests.Session:
        """Create a session configured from the environment.

        Raises
        ------
        PermissionError
            Raised if the client private key file (or the bearer token file)
            is not protected for access only by its owner.
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is set.
        """
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug(
            "Creating new HTTP session for endpoint %s (persist connection=%s)...",
            root_uri,
            persist,
        )

        retries = Retry(
            total=3,
            connect=3,
            read=3,
            backoff_factor=5.0 + random.random(),
            status=3,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        # Two adapters are mounted, in this order:
        # - one for the front end server, which keeps a single connection
        #   in its pool if persistence was requested;
        # - one for every other host of the same scheme. Back-end servers
        #   may vary from request to request and systematically persisting
        #   connections to them may exhaust their capabilities when there
        #   are thousands of simultaneous clients, so its pool size is zero.
        pool_sizes = (
            (root_uri, 1 if persist else 0),
            (f"{rpath.scheme}://", 0),
        )
        for prefix, pool_size in pool_sizes:
            adapter = HTTPAdapter(
                pool_connections=1,
                pool_maxsize=pool_size,
                pool_block=False,
                max_retries=retries,
            )
            session.mount(prefix, adapter)

        # Trust a specific CA certificate bundle for authenticating the
        # server, when one is configured.
        ca_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        if ca_bundle:
            session.verify = ca_bundle
        else:
            session.verify = True
            log.debug(
                "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
                "if you would need to verify the remote server's certificate "
                "issued by specific certificate authorities please consider "
                "initializing this variable."
            )

        # A bearer token, when configured, takes precedence over any other
        # client authentication mechanism.
        token = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if token:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Otherwise use a client certificate and its private key, which must
        # be configured together.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")

        if client_cert and not client_key:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key and not client_cert:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

270 

271 

@functools.lru_cache
def _is_webdav_endpoint(path: Union[ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    Parameters
    ----------
    path : `ButlerURI` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.

    Notes
    -----
    The endpoint is probed with an OPTIONS request and is considered to
    implement Webdav if the response includes a "DAV" header. Results are
    memoized per ``path`` value for the life of the process.

    The probe is sent outside of any cached session, so it only honours
    LSST_HTTP_CACERT_BUNDLE and does not carry client authentication.
    """
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    if ca_cert_bundle is None:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
            "some HTTPS requests may fail if remote server presents a "
            "certificate issued by an unknown certificate authority."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True

    # Always bound the probe with the module-wide timeouts: without them an
    # unresponsive server would block the caller indefinitely.
    resp = requests.options(str(path), verify=verify, timeout=TIMEOUT)
    return "DAV" in resp.headers

299 

300 

# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system
_TMPDIR: Optional[Tuple[str, int]] = None


def _get_temp_dir() -> Tuple[str, int]:
    """Return the temporary directory path and block size.

    This function caches its results in _TMPDIR.

    Returns
    -------
    path : `str`
        Directory where temporary files should be created: the first of
        the environment variables LSST_RESOURCES_TMPDIR and TMPDIR which
        names an existing directory, or the current working directory if
        there is none.
    block_size : `int`
        Size in bytes of the buffer to use for reading from or writing to
        files in that directory.
    """
    global _TMPDIR
    if _TMPDIR is not None:
        return _TMPDIR

    # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
    # 'TMPDIR', if defined. Otherwise use current working directory
    tmpdir = os.getcwd()
    for variable in ("LSST_RESOURCES_TMPDIR", "TMPDIR"):
        candidate = os.getenv(variable)
        if candidate and os.path.isdir(candidate):
            tmpdir = candidate
            break

    # Compute the block size as 256 blocks of typical size
    # (i.e. 4096 bytes) or 10 times the file system block size,
    # whichever is higher. This is a reasonable compromise between
    # using memory for buffering and the number of system calls
    # issued to read from or write to temporary files
    typical_block_size = 4096

    # os.statvfs is only available on Unix: assume the typical block size
    # on platforms which do not provide it.
    statvfs = getattr(os, "statvfs", None)
    fs_block_size = statvfs(tmpdir).f_bsize if statvfs is not None else typical_block_size

    _TMPDIR = (tmpdir, max(10 * fs_block_size, 256 * typical_block_size))
    return _TMPDIR

329 

330 

class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    Creating directories and transferring directly between two remote
    resources require an endpoint which implements WebDAV; the other
    operations only use plain HTTP methods.
    """

    # Result of the WebDAV detection for this URI. `None` means that the
    # endpoint has not been probed yet.
    _is_webdav: Optional[bool] = None

    # Caches of client sessions shared by all the instances: one for PUT
    # requests and one for all the other HTTP methods.
    _sessions_store = SessionStore()
    _put_sessions_store = SessionStore()

    @property
    def session(self) -> requests.Session:
        """Client session to address remote resource for all HTTP methods but
        PUT.
        """
        if hasattr(self, "_session"):
            return self._session

        self._session: requests.Session = self._sessions_store.get(self)
        return self._session

    @property
    def put_session(self) -> requests.Session:
        """Client session for uploading data to the remote resource."""
        if hasattr(self, "_put_session"):
            return self._put_session

        self._put_session: requests.Session = self._put_sessions_store.get(self)
        return self._put_session

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = _is_webdav_endpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request for the resource is answered with
            status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        return resp.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the "Content-Length" header of the response to a HEAD
            request. Always 0 for directory-like URIs, which are not
            queried.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request is not answered with status 200.
        """
        if self.dirLike:
            return 0

        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        if resp.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(resp.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Missing parent directories are created first.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is not directory-like or if the server
            refuses to create the directory.
        """
        # Only available on WebDAV backends
        if not self.is_webdav_endpoint:
            raise NotImplementedError(
                "Endpoint does not implement WebDAV functionality"
            )

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # We need to test the absence of the parent directory,
            # but also if parent URL is different from self URL,
            # otherwise we could be stuck in a recursive loop
            # where self == parent.
            # The URLs are compared first since this is a local operation
            # whereas checking for existence requires a network request.
            parent = self.parent()
            if parent.geturl() != self.geturl() and not parent.exists():
                parent.mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
            if r.status_code != 201:
                if r.status_code == 405:
                    log.debug(
                        "Can not create directory: %s may already exist: skipping.",
                        self.geturl(),
                    )
                else:
                    raise ValueError(
                        f"Can not create directory {self}, status code: {r.status_code}"
                    )

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request is not answered with status 200,
            202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(
                f"Unable to delete resource {self}; status code: {r.status_code}"
            )

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status 200.
        """
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        local_path: Optional[str] = None
        try:
            if r.status_code != 200:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status code: {r.status_code}"
                )
            tmpdir, buffering = _get_temp_dir()
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                local_path = tmpFile.name
                with time_this(
                    log,
                    msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
                    args=(self, r.headers.get("Content-Length"), tmpFile.name, buffering),
                ):
                    for chunk in r.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
            return tmpFile.name, True
        except Exception:
            # The temporary file is created with delete=False: do not leave
            # a partial download behind since nobody will clean it up.
            if local_path is not None:
                try:
                    os.remove(local_path)
                except OSError:
                    # Best effort: the download failure is what matters.
                    pass
            raise
        finally:
            # The response is streamed, so it must be closed explicitly to
            # give the connection back.
            r.close()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The contents of the resource. Empty if the resource is empty.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Stream the response when a positive size was requested so that the
        # whole body does not need to be downloaded.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(
                    f"Unable to read resource {self}; status code: {r.status_code}"
                )
            if not stream:
                return r.content
            # An empty body yields no chunk at all: return empty bytes
            # instead of letting StopIteration escape.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # Needed for streamed responses, whose body may not have been
            # fully consumed; harmless otherwise.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the server does not accept the data.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite:
            if self.exists():
                raise FileExistsError(
                    f"Remote resource {self} exists and overwrite has been disabled"
                )
        with time_this(
            log, msg="Write to remote %s (%d bytes)", args=(self, len(data))
        ):
            self._do_put(data=data)

    def transfer_from(
        self,
        src: ButlerURI,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None,
    ) -> None:
        """Transfer to this URI from another.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes listed in ``transferModes``; "auto" is replaced by
            ``transferDefault``.
        overwrite : `bool`, optional
            If `True` the destination is replaced if it already exists.
            Otherwise the transfer fails if the destination exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the server
            reports a failure.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if the source is also an HTTP resource and the endpoint
            does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(
                f"Transfer mode {transfer} not supported by URI scheme {self.scheme}"
            )

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # An existing destination is only an error if the caller did not
        # ask for it to be replaced.
        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Both ends are HTTP resources: ask the server to do the work.
            # Only available on WebDAV backends
            if not self.is_webdav_endpoint:
                raise NotImplementedError(
                    "Endpoint does not implement WebDAV functionality"
                )

            with time_this(
                log, msg="Transfer from %s to %s directly", args=(src, self)
            ):
                method = "MOVE" if transfer == "move" else "COPY"
                log.debug("%s from %s to %s", method, src.geturl(), self.geturl())
                resp = self.session.request(
                    method,
                    src.geturl(),
                    headers={"Destination": self.geturl()},
                    timeout=TIMEOUT,
                )
            if resp.status_code not in [201, 202, 204]:
                raise ValueError(
                    f"Can not transfer file {self}, status code: {resp.status_code}"
                )
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    with time_this(
                        log,
                        msg="Transfer from %s to %s via local file",
                        args=(src, self),
                    ):
                        self._do_put(data=f)

            # This was an explicit move requested from a remote resource
            # try to remove that resource
            if transfer == "move":
                # Transactions do not work here
                src.remove()

    def _do_put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request taking into account redirection.

        Parameters
        ----------
        data : `bytes` or `BinaryIO`
            The data to upload, or a file opened for reading in binary mode.

        Raises
        ------
        ValueError
            Raised if the server does not answer the upload with status
            200, 201, 202 or 204.
        """
        final_url = self.geturl()
        if _SEND_EXPECT_HEADER_ON_PUT:
            # Do a PUT request with an empty body and retrieve the final
            # destination URL returned by the server.
            headers = {"Content-Length": "0", "Expect": "100-continue"}
            resp = self.put_session.put(
                final_url,
                data=None,
                headers=headers,
                allow_redirects=False,
                timeout=TIMEOUT,
            )
            if resp.is_redirect or resp.is_permanent_redirect:
                final_url = resp.headers["Location"]
                log.debug(
                    "PUT request to %s redirected to %s", self.geturl(), final_url
                )

        # Send data to its final destination.
        resp = self.put_session.put(final_url, data=data, timeout=TIMEOUT)

        # 201 is the answer for a newly created resource; a server which
        # replaced an existing one answers with 200 or 204.
        if resp.status_code not in [200, 201, 202, 204]:
            raise ValueError(
                f"Can not write file {self}, status code: {resp.status_code}"
            )

608 

609 

def _is_protected(filepath: str) -> bool:
    """Return true if the permissions of file at filepath only allow for access
    by its owner.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.

    Returns
    -------
    protected : `bool`
        `True` if ``filepath`` is a regular file for which the owner has at
        least one permission and neither the group nor the others have any.
        `False` otherwise, including when the file does not exist.
    """
    # Inspect the file only once: checking for its existence and then
    # retrieving its mode separately would fail if the file is removed in
    # between.
    try:
        file_stat = os.stat(filepath)
    except (OSError, ValueError):
        return False

    if not stat.S_ISREG(file_stat.st_mode):
        return False

    mode = stat.S_IMODE(file_stat.st_mode)
    if mode & (stat.S_IRWXG | stat.S_IRWXO):
        # Group or others have some kind of access to the file.
        return False
    return bool(mode & stat.S_IRWXU)