Coverage for python/lsst/resources/http.py: 23%

570 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2023-08-31 09:33 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import math 

21import os 

22import os.path 

23import random 

24import re 

25import stat 

26import tempfile 

27from collections.abc import Iterator 

28from typing import TYPE_CHECKING, BinaryIO, cast 

29 

30try: 

31 # Prefer 'defusedxml' (not part of standard library) if available, since 

32 # 'xml' is vulnerable to XML bombs. 

33 import defusedxml.ElementTree as eTree 

34except ImportError: 

35 import xml.etree.ElementTree as eTree 

36 

37import requests 

38from astropy import units as u 

39from lsst.utils.timer import time_this 

40from requests.adapters import HTTPAdapter 

41from requests.auth import AuthBase 

42from urllib3.util.retry import Retry 

43 

44from ._resourceHandles import ResourceHandleProtocol 

45from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle 

46from ._resourcePath import ResourcePath 

47 

48if TYPE_CHECKING: 

49 from .utils import TransactionProtocol 

50 

51log = logging.getLogger(__name__) 

52 

53 

54def _timeout_from_environment(env_var: str, default_value: float) -> float: 

55 """Convert and return a timeout from the value of an environment variable 

56 or a default value if the environment variable is not initialized. The 

57 value of `env_var` must be a valid `float` otherwise this function raises. 

58 

59 Parameters 

60 ---------- 

61 env_var : `str` 

62 Environment variable to look for. 

63 default_value: `float`` 

64 Value to return if `env_var` is not defined in the environment. 

65 

66 Returns 

67 ------- 

68 _timeout_from_environment : `float` 

69 Converted value. 

70 """ 

71 try: 

72 timeout = float(os.environ.get(env_var, default_value)) 

73 except ValueError: 

74 raise ValueError( 

75 f"Expecting valid timeout value in environment variable {env_var} but found " 

76 f"{os.environ.get(env_var)}" 

77 ) 

78 

79 if math.isnan(timeout): 

80 raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}") 

81 

82 return timeout 

83 

84 

85class HttpResourcePathConfig: 

86 """Configuration class to encapsulate the configurable items used by class 

87 HttpResourcePath. 

88 """ 

89 

90 # Default timeouts for all HTTP requests (seconds). 

91 DEFAULT_TIMEOUT_CONNECT = 30.0 

92 DEFAULT_TIMEOUT_READ = 1_500.0 

93 

94 # Default lower and upper bounds for the backoff interval (seconds). 

95 # A value in this interval is randomly selected as the backoff factor when 

96 # requests need to be retried. 

97 DEFAULT_BACKOFF_MIN = 1.0 

98 DEFAULT_BACKOFF_MAX = 3.0 

99 

100 # Default number of connections to persist with both the front end and 

101 # back end servers. 

102 DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2 

103 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1 

104 

105 # Accepted digest algorithms 

106 ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512") 

107 

108 _front_end_connections: int | None = None 

109 _back_end_connections: int | None = None 

110 _digest_algorithm: str | None = None 

111 _send_expect_on_put: bool | None = None 

112 _timeout: tuple[float, float] | None = None 

113 _collect_memory_usage: bool | None = None 

114 _backoff_min: float | None = None 

115 _backoff_max: float | None = None 

116 

117 @property 

118 def front_end_connections(self) -> int: 

119 """Number of persistent connections to the front end server.""" 

120 if self._front_end_connections is not None: 120 ↛ 121line 120 didn't jump to line 121, because the condition on line 120 was never true

121 return self._front_end_connections 

122 

123 try: 

124 self._front_end_connections = int( 

125 os.environ.get( 

126 "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS 

127 ) 

128 ) 

129 except ValueError: 

130 self._front_end_connections = self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS 

131 

132 return self._front_end_connections 

133 

134 @property 

135 def back_end_connections(self) -> int: 

136 """Number of persistent connections to the back end servers.""" 

137 if self._back_end_connections is not None: 137 ↛ 138line 137 didn't jump to line 138, because the condition on line 137 was never true

138 return self._back_end_connections 

139 

140 try: 

141 self._back_end_connections = int( 

142 os.environ.get( 

143 "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS 

144 ) 

145 ) 

146 except ValueError: 

147 self._back_end_connections = self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS 

148 

149 return self._back_end_connections 

150 

151 @property 

152 def digest_algorithm(self) -> str: 

153 """Algorithm to ask the server to use for computing and recording 

154 digests of each file contents in PUT requests. 

155 

156 Returns 

157 ------- 

158 digest_algorithm: `str` 

159 The name of a digest algorithm or the empty string if no algotihm 

160 is configured. 

161 """ 

162 if self._digest_algorithm is not None: 

163 return self._digest_algorithm 

164 

165 digest = os.environ.get("LSST_HTTP_DIGEST", "").lower() 

166 if digest not in self.ACCEPTED_DIGESTS: 

167 digest = "" 

168 

169 self._digest_algorithm = digest 

170 return self._digest_algorithm 

171 

172 @property 

173 def send_expect_on_put(self) -> bool: 

174 """Return True if a "Expect: 100-continue" header is to be sent to 

175 the server on each PUT request. 

176 

177 Some servers (e.g. dCache) uses this information as an indication that 

178 the client knows how to handle redirects to the specific server that 

179 will actually receive the data for PUT requests. 

180 """ 

181 if self._send_expect_on_put is not None: 

182 return self._send_expect_on_put 

183 

184 self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ 

185 return self._send_expect_on_put 

186 

187 @property 

188 def timeout(self) -> tuple[float, float]: 

189 """Return a tuple with the values of timeouts for connecting to the 

190 server and reading its response, respectively. Both values are in 

191 seconds. 

192 """ 

193 if self._timeout is not None: 

194 return self._timeout 

195 

196 self._timeout = ( 

197 _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT), 

198 _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ), 

199 ) 

200 return self._timeout 

201 

202 @property 

203 def collect_memory_usage(self) -> bool: 

204 """Return true if we want to collect memory usage when timing 

205 operations against the remote server via the `lsst.utils.time_this` 

206 context manager. 

207 """ 

208 if self._collect_memory_usage is not None: 

209 return self._collect_memory_usage 

210 

211 self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ 

212 return self._collect_memory_usage 

213 

214 @property 

215 def backoff_min(self) -> float: 

216 """Lower bound of the interval from which a backoff factor is randomly 

217 selected when retrying requests (seconds). 

218 """ 

219 if self._backoff_min is not None: 

220 return self._backoff_min 

221 

222 self._backoff_min = self.DEFAULT_BACKOFF_MIN 

223 try: 

224 backoff_min = float(os.environ.get("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN)) 

225 if not math.isnan(backoff_min): 225 ↛ 230line 225 didn't jump to line 230, because the condition on line 225 was never false

226 self._backoff_min = backoff_min 

227 except ValueError: 

228 pass 

229 

230 return self._backoff_min 

231 

232 @property 

233 def backoff_max(self) -> float: 

234 """Upper bound of the interval from which a backoff factor is randomly 

235 selected when retrying requests (seconds). 

236 """ 

237 if self._backoff_max is not None: 

238 return self._backoff_max 

239 

240 self._backoff_max = self.DEFAULT_BACKOFF_MAX 

241 try: 

242 backoff_max = float(os.environ.get("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX)) 

243 if not math.isnan(backoff_max): 243 ↛ 248line 243 didn't jump to line 248, because the condition on line 243 was never false

244 self._backoff_max = backoff_max 

245 except ValueError: 

246 pass 

247 

248 return self._backoff_max 

249 

250 

@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        If the server certificate cannot be verified. A hint about
        LSST_HTTP_CACERT_BUNDLE is logged before re-raising.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Create a session for configuring retries
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(
            [
                "OPTIONS",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )
        if resp.status_code not in (requests.codes.ok, requests.codes.created):
            return False

        # Check that "1" is part of the value of the "DAV" header. We don't
        # use locks, so a server complying to class 1 is enough for our
        # purposes. All webDAV servers must advertise at least compliance
        # class "1".
        #
        # Compliance classes are documented in
        # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #    DAV: 1, 2
        #    DAV: 1, <http://apache.org/dav/propset/fs/1>
        if "DAV" not in resp.headers:
            return False
        else:
            # Convert to str to keep mypy happy
            compliance_class = str(resp.headers.get("DAV"))
            return "1" in compliance_class.replace(" ", "").split(",")

    except requests.exceptions.SSLError:
        # Fixed typo in original message ("tha path" -> "the path").
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        # Bare raise preserves the original exception and traceback.
        raise

349 

350 

351# Tuple (path, block_size) pointing to the location of a local directory 

352# to save temporary files and the block size of the underlying file system. 

353_TMPDIR: tuple[str, int] | None = None 

354 

355 

356def _get_temp_dir() -> tuple[str, int]: 

357 """Return the temporary directory path and block size. 

358 

359 This function caches its results in _TMPDIR. 

360 """ 

361 global _TMPDIR 

362 if _TMPDIR: 

363 return _TMPDIR 

364 

365 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or 

366 # 'TMPDIR', if defined. Otherwise use current working directory. 

367 tmpdir = os.getcwd() 

368 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")): 

369 if dir and os.path.isdir(dir): 

370 tmpdir = dir 

371 break 

372 

373 # Compute the block size as 256 blocks of typical size 

374 # (i.e. 4096 bytes) or 10 times the file system block size, 

375 # whichever is higher. This is a reasonable compromise between 

376 # using memory for buffering and the number of system calls 

377 # issued to read from or write to temporary files. 

378 fsstats = os.statvfs(tmpdir) 

379 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))) 

380 

381 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if not os.path.isfile(token):
            # The token value was provided inline; nothing else to do.
            return

        # The token lives in a local file: remember its absolute path and
        # require that only its owner can access it.
        self._path = os.path.abspath(token)
        if not _is_protected(self._path):
            raise PermissionError(
                f"Bearer token file at {self._path} must be protected for access only by its owner"
            )
        self._refresh()

    def _refresh(self) -> None:
        """Re-read the token file (if any) when its modification time is more
        recent than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # File unchanged since the last read.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Only add a bearer token to a request when using secure HTTP.
        url = req.url.lower() if req.url else ""
        if self._token and url.startswith("https://"):
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

426 

427 

class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Sessions are keyed by the root URI of the endpoint (scheme, host and
    port), so all paths under the same server share one session.
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # Number of connection pools to keep: there is one pool per remote
        # host. See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of connections per remote host to persist in each
        # connection pool. See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the
        # exponential backoff factor when retrying requests (seconds).
        # The maximum is forced strictly above the minimum so the random
        # jitter computed in _make_session() spans a non-empty interval.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        # When a token is configured it takes precedence over client
        # certificates, so we return immediately.
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

640 

641 

642class HttpResourcePath(ResourcePath): 

643 """General HTTP(S) resource. 

644 

645 Notes 

646 ----- 

647 In order to configure the behavior of instances of this class, the 

648 environment variables below are inspected: 

649 

650 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a 

651 "Expect: 100-Continue" header will be added to all HTTP PUT requests. 

652 This header is required by some servers to detect if the client 

653 knows how to handle redirections. In case of redirection, the body 

654 of the PUT request is sent to the redirected location and not to 

655 the front end server. 

656 

657 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a 

658 numeric value, they are interpreted as the number of seconds to wait 

659 for establishing a connection with the server and for reading its 

660 response, respectively. 

661 

662 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and 

663 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number 

664 of connections to attempt to persist with both the front end servers 

665 and the back end servers. 

666 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and 

667 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS. 

668 

669 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to 

670 ask the server to compute for every file's content sent to the server 

671 via a PUT request. No digest is requested if this variable is not set 

672 or is set to an invalid value. 

673 Valid values are those in ACCEPTED_DIGESTS. 

674 """ 

675 

    # Cached result of the WebDAV-capability check for this URI; None means
    # the check has not been performed yet (see is_webdav_endpoint).
    _is_webdav: bool | None = None

    # Configuration items for this class instances. Shared by every instance
    # since it is a class attribute.
    _config = HttpResourcePathConfig()

    # The session for metadata requests is used for interacting with
    # the front end servers for requests such as PROPFIND, HEAD, etc. Those
    # interactions are typically served by the front end servers. We want to
    # keep the connection to the front end servers open, to reduce the cost
    # associated to TCP and TLS handshaking for each new request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # The data session is used for interaction with the front end servers which
    # typically redirect to the back end servers for serving our PUT and GET
    # requests. We attempt to keep a single connection open with the front end
    # server, if possible. This depends on how the server behaves and the
    # kind of request. Some servers close the connection when redirecting
    # the client to a back end server, for instance when serving a PUT
    # request.
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. We need to store this
    # to replace sessions created by a parent process and inherited by a
    # child process after a fork, to avoid confusing the SSL layer.
    _pid: int = -1

711 

    @property
    def metadata_session(self) -> requests.Session:
        """Client session to send requests which do not require upload or
        download of data, i.e. mostly metadata requests.

        Returns
        -------
        metadata_session : `requests.Session`
            Session retrieved from the class-level metadata session store,
            cached on this instance as ``_metadata_session``.
        """
        if hasattr(self, "_metadata_session"):
            if HttpResourcePath._pid == os.getpid():
                return self._metadata_session
            else:
                # The metadata session we have in cache was likely created by
                # a parent process (the recorded PID differs from ours after a
                # fork). Discard all the sessions in that store, since reusing
                # inherited sessions would confuse the SSL layer.
                self._metadata_session_store.clear()

        # Retrieve a new metadata session and record the current process ID
        # so a later fork can be detected.
        HttpResourcePath._pid = os.getpid()
        self._metadata_session: requests.Session = self._metadata_session_store.get(self)
        return self._metadata_session

729 

    @property
    def data_session(self) -> requests.Session:
        """Client session for uploading and downloading data.

        Returns
        -------
        data_session : `requests.Session`
            Session retrieved from the class-level data session store,
            cached on this instance as ``_data_session``.
        """
        if hasattr(self, "_data_session"):
            if HttpResourcePath._pid == os.getpid():
                return self._data_session
            else:
                # The data session we have in cache was likely created by
                # a parent process (the recorded PID differs from ours after a
                # fork). Discard all the sessions in that store, since reusing
                # inherited sessions would confuse the SSL layer.
                self._data_session_store.clear()

        # Retrieve a new data session and record the current process ID so a
        # later fork can be detected.
        HttpResourcePath._pid = os.getpid()
        self._data_session: requests.Session = self._data_session_store.get(self)
        return self._data_session

745 

746 def _clear_sessions(self) -> None: 

747 """Close the socket connections that are still open. 

748 

749 Used only in test suites to avoid warnings. 

750 """ 

751 self._metadata_session_store.clear() 

752 self._data_session_store.clear() 

753 

754 if hasattr(self, "_metadata_session"): 

755 delattr(self, "_metadata_session") 

756 

757 if hasattr(self, "_data_session"): 

758 delattr(self, "_data_session") 

759 

760 @property 

761 def is_webdav_endpoint(self) -> bool: 

762 """Check if the current endpoint implements WebDAV features. 

763 

764 This is stored per URI but cached by root so there is 

765 only one check per hostname. 

766 """ 

767 if self._is_webdav is not None: 

768 return self._is_webdav 

769 

770 self._is_webdav = _is_webdav_endpoint(self.root_uri()) 

771 return self._is_webdav 

772 

773 def exists(self) -> bool: 

774 """Check that a remote HTTP resource exists.""" 

775 log.debug("Checking if resource exists: %s", self.geturl()) 

776 if not self.is_webdav_endpoint: 

777 # The remote is a plain HTTP server. Let's attempt a HEAD 

778 # request, even if the behavior for such a request against a 

779 # directory is not specified, so it depends on the server 

780 # implementation. 

781 resp = self.metadata_session.head( 

782 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

783 ) 

784 return resp.status_code == requests.codes.ok # 200 

785 

786 # The remote endpoint is a webDAV server: send a PROPFIND request 

787 # to determine if it exists. 

788 resp = self._propfind() 

789 if resp.status_code == requests.codes.multi_status: # 207 

790 prop = _parse_propfind_response_body(resp.text)[0] 

791 return prop.exists 

792 else: # 404 Not Found 

793 return False 

794 

795 def size(self) -> int: 

796 """Return the size of the remote resource in bytes.""" 

797 if self.dirLike: 

798 return 0 

799 

800 if not self.is_webdav_endpoint: 

801 # The remote is a plain HTTP server. Send a HEAD request to 

802 # retrieve the size of the resource. 

803 resp = self.metadata_session.head( 

804 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

805 ) 

806 if resp.status_code == requests.codes.ok: # 200 

807 if "Content-Length" in resp.headers: 

808 return int(resp.headers["Content-Length"]) 

809 else: 

810 raise ValueError( 

811 f"Response to HEAD request to {self} does not contain 'Content-Length' header" 

812 ) 

813 elif resp.status_code == requests.codes.not_found: 

814 raise FileNotFoundError( 

815 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

816 ) 

817 else: 

818 raise ValueError( 

819 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} " 

820 f"{resp.reason}" 

821 ) 

822 

823 # The remote is a webDAV server: send a PROPFIND request to retrieve 

824 # the size of the resource. Sizes are only meaningful for files. 

825 resp = self._propfind() 

826 if resp.status_code == requests.codes.multi_status: # 207 

827 prop = _parse_propfind_response_body(resp.text)[0] 

828 if prop.is_file: 

829 return prop.size 

830 elif prop.is_directory: 

831 raise IsADirectoryError( 

832 f"Resource {self} is reported by server as a directory but has a file path" 

833 ) 

834 else: 

835 raise FileNotFoundError(f"Resource {self} does not exist") 

836 else: # 404 Not Found 

837 raise FileNotFoundError( 

838 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

839 ) 

840 

def mkdir(self) -> None:
    """Create this directory resource, and any missing ancestors, unless
    it already exists.

    Raises
    ------
    NotImplementedError
        If the remote is a plain HTTP server (creating directories
        requires webDAV support).
    NotADirectoryError
        If this URI is file-like, or a file already occupies this path.
    """
    # Only webDAV servers support creating collections (directories).
    if not self.is_webdav_endpoint:
        raise NotImplementedError(
            f"Creation of directory {self} is not implemented by plain HTTP servers"
        )

    if not self.dirLike:
        raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

    # Inspect the current state of the target path.
    response = self._propfind()
    if response.status_code == requests.codes.multi_status:  # 207
        existing = _parse_propfind_response_body(response.text)[0]
        if existing.exists:
            if existing.is_directory:
                # Nothing to do: the directory is already there.
                return
            # A file occupies this path: we must not shadow it.
            raise NotADirectoryError(
                f"Can not create a directory for {self} because a file already exists at that path"
            )

    # The target does not exist yet: create missing ancestors first.
    # Compare self against parent to avoid recursing forever at the root,
    # where self == parent.
    parent = self.parent()
    if self.geturl() != parent.geturl():
        parent.mkdir()

    log.debug("Creating new directory: %s", self.geturl())
    self._mkcol()

874 

def remove(self) -> None:
    """Delete this resource on the remote server."""
    # Deletion semantics (including directory handling) live in _delete().
    self._delete()

878 

def read(self, size: int = -1) -> bytes:
    """Open the resource and return the contents in bytes.

    Parameters
    ----------
    size : `int`, optional
        The number of bytes to read. Negative or omitted indicates
        that all data should be read.

    Returns
    -------
    contents : `bytes`
        The downloaded bytes. May be shorter than ``size`` if the
        resource is smaller, and empty for a zero-length resource.

    Raises
    ------
    FileNotFoundError
        If the server does not answer with a "200 OK" status.
    """
    # Use the data session as a context manager to ensure that the
    # network connections to both the front end and back end servers are
    # closed after downloading the data.
    log.debug("Reading from remote resource: %s", self.geturl())
    # Only stream when a bounded read was requested; otherwise download
    # the whole body at once.
    stream = size > 0
    with self.data_session as session:
        with time_this(log, msg="GET %s", args=(self,)):
            resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout)

        if resp.status_code != requests.codes.ok:  # 200
            raise FileNotFoundError(
                f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
            )
        if not stream:
            return resp.content
        else:
            # Bug fix: a bare next() raises StopIteration when the
            # resource is empty (no chunks to iterate); return b"" in
            # that case instead.
            return next(resp.iter_content(chunk_size=size), b"")

905 

def write(self, data: bytes, overwrite: bool = True) -> None:
    """Write the supplied bytes to the new resource.

    Parameters
    ----------
    data : `bytes`
        The bytes to write to the resource. The entire contents of the
        resource will be replaced.
    overwrite : `bool`, optional
        If `True` the resource will be overwritten if it exists. Otherwise
        the write will fail.

    Raises
    ------
    FileExistsError
        If the resource exists and ``overwrite`` is `False`.
    """
    log.debug("Writing to remote resource: %s", self.geturl())
    if not overwrite and self.exists():
        raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")

    # Make sure the parent directory hierarchy is in place before
    # uploading the data.
    self.parent().mkdir()

    log.debug("Writing data to remote resource: %s", self.geturl())
    self._put(data=data)

928 

def transfer_from(
    self,
    src: ResourcePath,
    transfer: str = "copy",
    overwrite: bool = False,
    transaction: TransactionProtocol | None = None,
) -> None:
    """Transfer the current resource to a Webdav repository.

    Parameters
    ----------
    src : `ResourcePath`
        Source URI.
    transfer : `str`
        Mode to use for transferring the resource. Supports the following
        options: copy.
    overwrite : `bool`, optional
        Whether an existing destination may be replaced.
    transaction : `~lsst.resources.utils.TransactionProtocol`, optional
        Currently unused.
    """
    # Fail early to prevent delays if remote resources are requested.
    if transfer not in self.transferModes:
        raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

    # Existence checks cost time, so only perform them when debug
    # logging is actually enabled.
    if log.isEnabledFor(logging.DEBUG):
        log.debug(
            "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
            src,
            src.exists(),
            self,
            self.exists(),
            transfer,
        )

    # Nothing to do when source and destination are the same resource.
    if self == src:
        log.debug(
            "Target and destination URIs are identical: %s, returning immediately."
            " No further action required.",
            self,
        )
        return

    if not overwrite and self.exists():
        raise FileExistsError(f"Destination path {self} already exists.")

    if transfer == "auto":
        transfer = self.transferDefault

    # When both resources live on the same webDAV server, the server can
    # copy or move them without the data transiting through this client.
    same_endpoint = isinstance(src, type(self)) and self.root_uri() == src.root_uri()
    if same_endpoint and self.is_webdav_endpoint:
        log.debug("Transfer from %s to %s directly", src, self)
        return self._move(src) if transfer == "move" else self._copy(src)

    # For resources of different classes, or for plain HTTP resources,
    # stage the data through a local temporary file.
    self._copy_via_local(src)

    # This was an explicit move, try to remove the source.
    if transfer == "move":
        src.remove()

993 

def walk(
    self, file_filter: str | re.Pattern | None = None
) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
    """Walk the directory tree returning matching files and directories.

    Parameters
    ----------
    file_filter : `str` or `re.Pattern`, optional
        Regex to filter out files from the list before it is returned.

    Yields
    ------
    dirpath : `ResourcePath`
        Current directory being examined.
    dirnames : `list` of `str`
        Names of subdirectories within dirpath.
    filenames : `list` of `str`
        Names of all the files within dirpath.
    """
    if not self.dirLike:
        raise ValueError("Can not walk a non-directory URI")

    # Walking directories is only available on WebDAV back ends.
    if not self.is_webdav_endpoint:
        raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

    pattern = re.compile(file_filter) if isinstance(file_filter, str) else file_filter

    # A depth-1 PROPFIND lists this directory and its direct children.
    resp = self._propfind(depth="1")
    if resp.status_code == requests.codes.multi_status:  # 207
        files: list[str] = []
        dirs: list[str] = []

        for prop in _parse_propfind_response_body(resp.text):
            if prop.is_file:
                files.append(prop.name)
            elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
                # The listing includes the directory being walked itself;
                # keep only the names of its sub-directories.
                dirs.append(prop.name)

        if pattern is not None:
            files = [name for name in files if pattern.search(name)]

        if not dirs and not files:
            return

        yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

        # Recurse into each sub-directory, propagating the compiled filter.
        for subdir in dirs:
            yield from self.join(subdir, forceDirectory=True).walk(pattern)

1047 

def _as_local(self) -> tuple[str, bool]:
    """Download object over HTTP and place in temporary directory.

    Returns
    -------
    path : `str`
        Path to local temporary file.
    temporary : `bool`
        Always returns `True`. This is always a temporary file.
    """
    # The session context manager guarantees connections to both the
    # front end and back end servers are closed once the download is
    # finished.
    with self.data_session as session:
        resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
        if resp.status_code != requests.codes.ok:
            raise FileNotFoundError(
                f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
            )

        tmpdir, buffering = _get_temp_dir()
        with tempfile.NamedTemporaryFile(
            suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
        ) as tmpFile:
            expected_length = int(resp.headers.get("Content-Length", "-1"))
            with time_this(
                log,
                msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                args=(self, expected_length, tmpFile.name, buffering),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                downloaded = 0
                for chunk in resp.iter_content(chunk_size=buffering):
                    tmpFile.write(chunk)
                    downloaded += len(chunk)

        # Unless the server encoded the contents, verify that we received
        # exactly the number of bytes advertised in 'Content-Length'.
        if "Content-Encoding" not in resp.headers and 0 <= expected_length != downloaded:
            raise ValueError(
                f"Size of downloaded file does not match value in Content-Length header for {self}: "
                f"expecting {expected_length} and got {downloaded} bytes"
            )

        return tmpFile.name, True

1099 

def _send_webdav_request(
    self,
    method: str,
    url: str | None = None,
    headers: dict[str, str] | None = None,
    body: str | None = None,
    session: requests.Session | None = None,
    timeout: tuple[float, float] | None = None,
) -> requests.Response:
    """Send a webDAV request, handling redirections ourselves.

    Parameters
    ----------
    method : `str`
        The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
    url : `str`, optional
        Target URL of the request. Defaults to this resource's URL.
    headers : `dict`, optional
        A dictionary of key-value pairs (both strings) to include as
        headers in the request.
    body : `str`, optional
        The body of the request.
    session : `requests.Session`, optional
        Session used to send the request. Defaults to the metadata
        session.
    timeout : `tuple`, optional
        Connect and read timeouts. Defaults to the configured timeouts.

    Notes
    -----
    This way of sending webDAV requests is necessary for handling
    redirection ourselves, since the 'requests' package changes the method
    of the redirected request when the server responds with status 302 and
    the method of the original request is not HEAD (which is the case for
    webDAV requests).

    That means that when the webDAV server we interact with responds with
    a redirection to a PROPFIND or MKCOL request, the request gets
    converted to a GET request when sent to the redirected location.

    See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
    https://github.com/psf/requests/blob/main/requests/sessions.py

    This behavior of the 'requests' package is meant to be compatible with
    what is specified in RFC 9110:

    https://www.rfc-editor.org/rfc/rfc9110#name-302-found

    For our purposes, we do need to follow the redirection and send a new
    request using the same HTTP verb.
    """
    url = self.geturl() if url is None else url
    headers = {} if headers is None else headers
    session = self.metadata_session if session is None else session
    timeout = self._config.timeout if timeout is None else timeout

    max_redirects = 5
    with time_this(
        log,
        msg="%s %s",
        args=(method, url),
        mem_usage=self._config.collect_memory_usage,
        mem_unit=u.mebibyte,
    ):
        for _ in range(max_redirects):
            response = session.request(
                method,
                url,
                data=body,
                headers=headers,
                stream=False,
                timeout=timeout,
                allow_redirects=False,
            )
            if not response.is_redirect:
                return response
            # Resend the very same request against the new location.
            url = response.headers["Location"]

        # We reached the maximum allowed number of redirects.
        # Stop trying.
        raise ValueError(
            f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
        )

1186 

def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response:
    """Send a PROPFIND webDAV request and return the response.

    Parameters
    ----------
    body : `str`, optional
        The body of the PROPFIND request to send to the server. If
        provided, it is expected to be a XML document.
    depth : `str`, optional
        The value of the 'Depth' header to include in the request.

    Returns
    -------
    response : `requests.Response`
        Response to the PROPFIND request.

    Notes
    -----
    It raises `ValueError` if the status code of the PROPFIND request
    is different from "207 Multistatus" or "404 Not Found".
    """
    if body is None:
        # Request only the DAV live properties we are explicitly interested
        # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
        # and 'displayname'.
        body = (
            """<?xml version="1.0" encoding="utf-8" ?>"""
            """<D:propfind xmlns:D="DAV:"><D:prop>"""
            """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
            """</D:prop></D:propfind>"""
        )
    headers = {
        "Depth": depth,
        "Content-Type": 'application/xml; charset="utf-8"',
        # Fix: Content-Length must be the number of encoded bytes, not
        # characters; the two differ when a caller-provided body contains
        # non-ASCII text.
        "Content-Length": str(len(body.encode("utf-8"))),
    }
    resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
    if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
        return resp
    else:
        raise ValueError(
            f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
            f"{resp.reason}"
        )

1231 

def _options(self) -> requests.Response:
    """Send a OPTIONS webDAV request for this resource."""
    resp = self._send_webdav_request("OPTIONS")
    # Accept both "200 OK" and "201 Created".
    if resp.status_code not in (requests.codes.ok, requests.codes.created):
        raise ValueError(
            f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}"
        )
    return resp

1241 

def _head(self) -> requests.Response:
    """Send a HEAD webDAV request for this resource and return the raw
    response; status handling is left to the caller.
    """
    return self._send_webdav_request("HEAD")

1245 

def _mkcol(self) -> None:
    """Send a MKCOL webDAV request to create a collection. The collection
    may already exist.
    """
    resp = self._send_webdav_request("MKCOL")
    if resp.status_code == requests.codes.created:  # 201
        return

    if resp.status_code != requests.codes.method_not_allowed:  # 405
        raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")

    # A "405 Method Not Allowed" means the remote collection exists.
    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())

1259 

def _delete(self) -> None:
    """Send a DELETE webDAV request for this resource."""
    log.debug("Deleting %s ...", self.geturl())

    # Plain HTTP servers don't support DELETE requests on non-file
    # paths, so directory deletion requires a webDAV server.
    if self.dirLike and not self.is_webdav_endpoint:
        raise NotImplementedError(
            f"Deletion of directory {self} is not implemented by plain HTTP servers"
        )

    # Deleting a non-empty directory may take a while, so give the
    # server much more time to answer in that case.
    connect_timeout, read_timeout = self._config.timeout
    if self.dirLike:
        read_timeout *= 100

    resp = self._send_webdav_request("DELETE", timeout=(connect_timeout, read_timeout))
    # A "404 Not Found" is deliberately accepted as success: the resource
    # may never have existed, or an earlier retry of this DELETE request
    # may have already removed it. Either way, the state the caller asked
    # for has been reached.
    if resp.status_code not in (
        requests.codes.ok,
        requests.codes.accepted,
        requests.codes.no_content,
        requests.codes.not_found,
    ):
        # TODO: the response to a DELETE request against a webDAV server
        # may be multistatus. If so, we need to parse the response body to
        # determine more precisely the reason of the failure (e.g. a lock)
        # and provide a more helpful error message.
        raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")

1296 

def _copy_via_local(self, src: ResourcePath) -> None:
    """Replace the contents of this resource with the contents of a remote
    resource, staged through a local temporary file.

    Parameters
    ----------
    src : `ResourcePath`
        The source of the contents to copy to `self`.
    """
    with src.as_local() as local_uri:
        log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
        with open(local_uri.ospath, "rb") as stream:
            self._put(data=stream)

1310 

def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
    """Send a COPY or MOVE webDAV request to copy or replace the contents
    of this resource with the contents of another resource located in the
    same server.

    Parameters
    ----------
    method : `str`
        The method to perform. Valid values are "COPY" or "MOVE" (in
        uppercase).
    src : `HttpResourcePath`
        The source of the contents to move to `self`.

    Raises
    ------
    ValueError
        If the server answers with an error or multistatus response.
    """
    headers = {"Destination": self.geturl()}
    resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
    if resp.status_code in (requests.codes.created, requests.codes.no_content):
        return

    if resp.status_code == requests.codes.multi_status:
        # Extract the per-resource status and error from the multistatus
        # body to build a more helpful message.
        tree = eTree.fromstring(resp.content)
        status_element = tree.find("./{DAV:}response/{DAV:}status")
        status = status_element.text if status_element is not None else "unknown"
        error = tree.find("./{DAV:}response/{DAV:}error")
        # Fix: error message previously misspelled "response" as "reponse".
        raise ValueError(f"{method} returned multistatus response with status {status} and error {error}")
    else:
        raise ValueError(
            f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
        )

1340 

def _copy(self, src: HttpResourcePath) -> None:
    """Send a COPY webDAV request to replace the contents of this resource
    (if any) with the contents of another resource located in the same
    server.

    Parameters
    ----------
    src : `HttpResourcePath`
        The source of the contents to copy to `self`.
    """
    # Neither dCache nor XrootD currently implement the COPY
    # webDAV method as documented in
    # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
    # (See issues DM-37603 and DM-37651 for details)
    #
    # For the time being, we use a temporary local file to
    # perform the copy client side.
    # TODO: when those 2 issues above are solved, perform the copy
    # server side instead via:
    #     return self._copy_or_move("COPY", src)
    return self._copy_via_local(src)

1364 

def _move(self, src: HttpResourcePath) -> None:
    """Send a MOVE webDAV request to replace the contents of this resource
    with the contents of another resource located in the same server.

    Parameters
    ----------
    src : `HttpResourcePath`
        The source of the contents to move to `self`.
    """
    # Unlike COPY, the MOVE verb is supported server side (see _copy).
    return self._copy_or_move("MOVE", src)

1375 

def _put(self, data: BinaryIO | bytes) -> None:
    """Perform an HTTP PUT request and handle redirection.

    Parameters
    ----------
    data : `BinaryIO` or `bytes`
        The data to be included in the body of the PUT request.
    """
    # First send a zero-length PUT to discover the final upload URL,
    # following at most a single server redirection.
    headers = {"Content-Length": "0"}
    if self._config.send_expect_on_put:
        headers["Expect"] = "100-continue"

    url = self.geturl()

    # The session context manager guarantees the underlying connections
    # are closed once the upload finishes.
    with self.data_session as session:
        log.debug("Sending empty PUT request to %s", url)
        with time_this(
            log,
            msg="PUT (no data) %s",
            args=(url,),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            response = session.request(
                "PUT",
                url,
                data=None,
                headers=headers,
                stream=False,
                timeout=self._config.timeout,
                allow_redirects=False,
            )
            if response.is_redirect:
                url = response.headers["Location"]

        # Upload the data to the final destination.
        log.debug("Uploading data to %s", url)

        # Ask the server to compute and record a checksum of the uploaded
        # file contents, for later integrity checks. Since we don't compute
        # the digest ourselves while uploading the data, we cannot control
        # after the request is complete that the data we uploaded is
        # identical to the data recorded by the server, but at least the
        # server has recorded a digest of the data it stored.
        #
        # See RFC-3230 for details and
        # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
        # for the list of supported digest algorithms.
        # In addition, note that not all servers implement this RFC so
        # the checksum may not be computed by the server.
        put_headers: dict[str, str] | None = None
        if digest := self._config.digest_algorithm:
            put_headers = {"Want-Digest": digest}

        with time_this(
            log,
            msg="PUT %s",
            args=(url,),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            response = session.request(
                "PUT",
                url,
                data=data,
                headers=put_headers,
                stream=False,
                timeout=self._config.timeout,
                allow_redirects=False,
            )
            if response.status_code not in (
                requests.codes.ok,
                requests.codes.created,
                requests.codes.no_content,
            ):
                raise ValueError(
                    f"Can not write file {self}, status: {response.status_code} {response.reason}"
                )

1461 

@contextlib.contextmanager
def _openImpl(
    self,
    mode: str = "r",
    *,
    encoding: str | None = None,
) -> Iterator[ResourceHandleProtocol]:
    # Range reads are only usable when the server advertises support for
    # byte ranges via the 'Accept-Ranges' response header.
    head = self._head()
    can_use_ranges = head.status_code == requests.codes.ok and head.headers.get("Accept-Ranges") == "bytes"
    handle: ResourceHandleProtocol
    if mode not in ("rb", "r") or not can_use_ranges:
        # Fall back to the base implementation (full local download).
        with super()._openImpl(mode, encoding=encoding) as fallback_handle:
            yield fallback_handle
        return

    handle = HttpReadResourceHandle(
        mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
    )
    if mode == "r":
        # cast because the protocol is compatible, but does not have
        # BytesIO in the inheritance tree
        yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
    else:
        yield handle

1485 

1486 

def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    req = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", req.method)
    log.debug(" URL=%s", req.url)
    log.debug(" headers=%s", req.headers)
    if req.method == "PUT":
        # PUT bodies can be huge; don't log them.
        log.debug(" body=<data>")
    elif req.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", req.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        # Truncate non-text bodies.
        log.debug(" body=%r", resp.content[:80])

1520 

1521 

1522def _is_protected(filepath: str) -> bool: 

1523 """Return true if the permissions of file at filepath only allow for access 

1524 by its owner. 

1525 

1526 Parameters 

1527 ---------- 

1528 filepath : `str` 

1529 Path of a local file. 

1530 """ 

1531 if not os.path.isfile(filepath): 

1532 return False 

1533 mode = stat.S_IMODE(os.stat(filepath).st_mode) 

1534 owner_accessible = bool(mode & stat.S_IRWXU) 

1535 group_accessible = bool(mode & stat.S_IRWXG) 

1536 other_accessible = bool(mode & stat.S_IRWXO) 

1537 return owner_accessible and not group_accessible and not other_accessible 

1538 

1539 

def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV
    PROPFIND request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request.

    Returns
    -------
    responses : `list` [`DavProperty`]
        One entry per 'response' element found in `body`.

    Notes
    -----
    It is expected that there is at least one response in `body`,
    otherwise this function raises `ValueError`.
    """
    # The body is a '<D:multistatus>' document containing one
    # '<D:response>' element per resource, of the form (indented for
    # readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #   <D:response>
    #     <D:href>path/to/resource</D:href>
    #     <D:propstat>
    #       <D:prop>
    #         <D:resourcetype>
    #           <D:collection xmlns:D="DAV:"/>
    #         </D:resourcetype>
    #         <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
    #         <D:getcontentlength>12345</D:getcontentlength>
    #       </D:prop>
    #       <D:status>HTTP/1.1 200 OK</D:status>
    #     </D:propstat>
    #   </D:response>
    #   <D:response>
    #     ...
    #   </D:response>
    # </D:multistatus>

    # Scan all the 'response' elements and extract the relevant properties.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]

    if not responses:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return responses

1601 

1602 

class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.
    """

    # Matches the 'status' element of a PROPFIND response's 'propstat'
    # element when the status is "200 OK".
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: eTree.Element | None):
        # Defaults describe a resource that does not exist.
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    def _parse(self, response: eTree.Element) -> None:
        # Extract 'href', which must always be present.
        href_element = response.find("./{DAV:}href")
        if href_element is None:
            raise ValueError(
                "Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )
        # "str(...)" instead of the bare attribute keeps mypy happy, since
        # the element's text may be None.
        self._href = str(href_element.text).strip()

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties of interest whose status is OK.
            status = propstat.find("./{DAV:}status")
            if status is None or not self._status_ok_rex.match(str(status.text)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # "collection": present only for directories.
                if prop.find("./{DAV:}resourcetype/{DAV:}collection") is not None:
                    self._collection = True

                # "getlastmodified".
                if (modified := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = str(modified.text)

                # "getcontentlength".
                if (length := prop.find("./{DAV:}getcontentlength")) is not None:
                    self._getcontentlength = int(str(length.text))

                # "displayname".
                if (display := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = str(display.text)

        # Some webDAV servers don't include the 'displayname' property in
        # the response, so fall back to the last component of 'href'.
        # Depending on the server the href value may end with '/'.
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Force a size of 0 for collections.
        if self._collection:
            self._getcontentlength = 0

    @property
    def exists(self) -> bool:
        # It is either a directory or a file with length of at least zero.
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        return self._collection

    @property
    def is_file(self) -> bool:
        return not self._collection

    @property
    def size(self) -> int:
        return self._getcontentlength

    @property
    def name(self) -> str:
        return self._displayname

    @property
    def href(self) -> str:
        return self._href