Coverage for python/lsst/resources/http.py: 23%

571 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-17 10:49 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import math 

21import os 

22import os.path 

23import random 

24import re 

25import stat 

26import tempfile 

27from collections.abc import Iterator 

28from typing import TYPE_CHECKING, BinaryIO, cast 

29 

30try: 

31 # Prefer 'defusedxml' (not part of standard library) if available, since 

32 # 'xml' is vulnerable to XML bombs. 

33 import defusedxml.ElementTree as eTree 

34except ImportError: 

35 import xml.etree.ElementTree as eTree 

36 

37import requests 

38from astropy import units as u 

39from lsst.utils.timer import time_this 

40from requests.adapters import HTTPAdapter 

41from requests.auth import AuthBase 

42from urllib3.util.retry import Retry 

43 

44from ._resourceHandles import ResourceHandleProtocol 

45from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle 

46from ._resourcePath import ResourcePath 

47 

48if TYPE_CHECKING: 

49 from .utils import TransactionProtocol 

50 

51log = logging.getLogger(__name__) 

52 

53 

54def _timeout_from_environment(env_var: str, default_value: float) -> float: 

55 """Convert and return a timeout from the value of an environment variable 

56 or a default value if the environment variable is not initialized. The 

57 value of `env_var` must be a valid `float` otherwise this function raises. 

58 

59 Parameters 

60 ---------- 

61 env_var : `str` 

62 Environment variable to look for. 

63 default_value : `float`` 

64 Value to return if `env_var` is not defined in the environment. 

65 

66 Returns 

67 ------- 

68 _timeout_from_environment : `float` 

69 Converted value. 

70 """ 

71 try: 

72 timeout = float(os.environ.get(env_var, default_value)) 

73 except ValueError: 

74 raise ValueError( 

75 f"Expecting valid timeout value in environment variable {env_var} but found " 

76 f"{os.environ.get(env_var)}" 

77 ) from None 

78 

79 if math.isnan(timeout): 

80 raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}") 

81 

82 return timeout 

83 

84 

class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    Each configuration item is read lazily from the process environment the
    first time its property is accessed, then cached on the instance.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Per-instance caches, populated on first access of each property.
    _front_end_connections: int | None = None
    _back_end_connections: int | None = None
    _digest_algorithm: str | None = None
    _send_expect_on_put: bool | None = None
    _timeout: tuple[float, float] | None = None
    _collect_memory_usage: bool | None = None
    _backoff_min: float | None = None
    _backoff_max: float | None = None

    @staticmethod
    def _int_from_environment(env_var: str, default_value: int) -> int:
        """Return the integer value of environment variable `env_var`, or
        `default_value` if the variable is unset or not a valid integer.
        """
        try:
            return int(os.environ.get(env_var, default_value))
        except ValueError:
            return default_value

    @staticmethod
    def _backoff_from_environment(env_var: str, default_value: float) -> float:
        """Return the float value of environment variable `env_var`, or
        `default_value` if the variable is unset, invalid or NaN.
        """
        try:
            value = float(os.environ.get(env_var, default_value))
        except ValueError:
            return default_value
        # NaN must not be used as a backoff bound.
        return default_value if math.isnan(value) else value

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is not None:
            return self._front_end_connections

        self._front_end_connections = self._int_from_environment(
            "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
        )
        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is not None:
            return self._back_end_connections

        self._back_end_connections = self._int_from_environment(
            "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
        )
        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm: `str`
            The name of a digest algorithm or the empty string if no algorithm
            is configured.
        """
        if self._digest_algorithm is not None:
            return self._digest_algorithm

        # Only algorithms in ACCEPTED_DIGESTS are honored; anything else
        # (including an unset variable) disables digest computation.
        digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
        if digest not in self.ACCEPTED_DIGESTS:
            digest = ""

        self._digest_algorithm = digest
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) uses this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is not None:
            return self._send_expect_on_put

        # Presence of the variable is what matters, not its value.
        self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.
        """
        if self._timeout is not None:
            return self._timeout

        self._timeout = (
            _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT),
            _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ),
        )
        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is not None:
            return self._collect_memory_usage

        # Presence of the variable is what matters, not its value.
        self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is not None:
            return self._backoff_min

        self._backoff_min = self._backoff_from_environment(
            "LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN
        )
        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is not None:
            return self._backoff_max

        self._backoff_max = self._backoff_from_environment(
            "LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX
        )
        return self._backoff_max

249 

250 

@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        If the server certificate cannot be verified against the trusted
        certificate authorities.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Create a session for configuring retries
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(
            [
                "OPTIONS",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )
            if resp.status_code not in (requests.codes.ok, requests.codes.created):
                return False

            # Check that "1" is part of the value of the "DAV" header. We don't
            # use locks, so a server complying to class 1 is enough for our
            # purposes. All webDAV servers must advertise at least compliance
            # class "1".
            #
            # Compliance classes are documented in
            # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
            #
            # Examples of values for header DAV are:
            #    DAV: 1, 2
            #    DAV: 1, <http://apache.org/dav/propset/fs/1>
            if "DAV" not in resp.headers:
                return False
            else:
                # Convert to str to keep mypy happy
                compliance_class = str(resp.headers.get("DAV"))
                return "1" in compliance_class.replace(" ", "").split(",")

    except requests.exceptions.SSLError:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        # Re-raise with the original traceback intact.
        raise

349 

350 

351# Tuple (path, block_size) pointing to the location of a local directory 

352# to save temporary files and the block size of the underlying file system. 

353_TMPDIR: tuple[str, int] | None = None 

354 

355 

356def _get_temp_dir() -> tuple[str, int]: 

357 """Return the temporary directory path and block size. 

358 

359 This function caches its results in _TMPDIR. 

360 """ 

361 global _TMPDIR 

362 if _TMPDIR: 

363 return _TMPDIR 

364 

365 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or 

366 # 'TMPDIR', if defined. Otherwise use current working directory. 

367 tmpdir = os.getcwd() 

368 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")): 

369 if dir and os.path.isdir(dir): 

370 tmpdir = dir 

371 break 

372 

373 # Compute the block size as 256 blocks of typical size 

374 # (i.e. 4096 bytes) or 10 times the file system block size, 

375 # whichever is higher. This is a reasonable compromise between 

376 # using memory for buffering and the number of system calls 

377 # issued to read from or write to temporary files. 

378 fsstats = os.statvfs(tmpdir) 

379 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))) 

380 

381 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if not os.path.isfile(token):
            # The token value was provided inline, nothing to watch on disk.
            return

        # The token lives in a file: remember its absolute path and make
        # sure only its owner can read it before loading its contents.
        self._path = os.path.abspath(token)
        if not _is_protected(self._path):
            raise PermissionError(
                f"Bearer token file at {self._path} must be protected for access only by its owner"
            )
        self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # File unchanged since the last read: keep the cached token.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Only add a bearer token to a request when using secure HTTP.
        url = req.url
        if url and self._token and url.lower().startswith("https://"):
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

426 

427 

class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep: there is one pool per remote
        host.
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Minimum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    backoff_max : `float`, optional
        Maximum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Maps the root URI of an endpoint to the session serving every
        # path under that root.
        self._sessions: dict[str, requests.Session] = {}

        # See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the
        # exponential backoff factor when retrying requests (seconds).
        # Guarantee the upper bound is strictly greater than the lower one.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max
        if self._backoff_max <= self._backoff_min:
            self._backoff_max = self._backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Closing a session should close its idle network connections as a
        # consequence. The API exposed by Requests provides no way to force
        # closing the underlying open sockets.
        while self._sessions:
            _, session = self._sessions.popitem()
            session.close()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        session = self._sessions.get(root_uri)
        if session is None:
            # First request for this endpoint: build and cache a session.
            session = self._make_session(rpath)
            self._sessions[root_uri] = session
        return session

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)

        # Compute a random jitter within the configured backoff interval to
        # prevent all the clients from overwhelming the server by sending
        # retried requests at the same time.
        jittered_backoff = (
            self._backoff_min + (self._backoff_max - self._backoff_min) * random.random()
        )

        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds).
            backoff_factor=jittered_backoff,
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                ["COPY", "DELETE", "GET", "HEAD", "MKCOL", "OPTIONS", "PROPFIND", "PUT"]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        session = requests.Session()

        # Persist the specified number of connections to the front end server.
        front_end_adapter = HTTPAdapter(
            pool_connections=self._num_pools,
            pool_maxsize=self._max_persistent_connections,
            pool_block=False,
            max_retries=retries,
        )
        session.mount(root_uri, front_end_adapter)

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        back_end_adapter = HTTPAdapter(
            pool_connections=self._num_pools,
            pool_maxsize=0,
            pool_block=False,
            max_retries=retries,
        )
        session.mount(f"{rpath.scheme}://", back_end_adapter)

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        ca_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        session.verify = ca_bundle if ca_bundle else True

        # Should we use bearer tokens for client authentication?
        token = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if token:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

654 

655 

656class HttpResourcePath(ResourcePath): 

657 """General HTTP(S) resource. 

658 

659 Notes 

660 ----- 

661 In order to configure the behavior of instances of this class, the 

662 environment variables below are inspected: 

663 

664 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a 

665 "Expect: 100-Continue" header will be added to all HTTP PUT requests. 

666 This header is required by some servers to detect if the client 

667 knows how to handle redirections. In case of redirection, the body 

668 of the PUT request is sent to the redirected location and not to 

669 the front end server. 

670 

671 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a 

672 numeric value, they are interpreted as the number of seconds to wait 

673 for establishing a connection with the server and for reading its 

674 response, respectively. 

675 

676 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and 

677 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number 

678 of connections to attempt to persist with both the front end servers 

679 and the back end servers. 

680 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and 

681 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS. 

682 

683 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to 

684 ask the server to compute for every file's content sent to the server 

685 via a PUT request. No digest is requested if this variable is not set 

686 or is set to an invalid value. 

687 Valid values are those in ACCEPTED_DIGESTS. 

688 """ 

689 

    # Cached result of the WebDAV-capability probe for this URI; stays None
    # until the is_webdav_endpoint property is first accessed.
    _is_webdav: bool | None = None

    # Configuration items for this class instances.
    _config = HttpResourcePathConfig()

    # The session for metadata requests is used for interacting with
    # the front end servers for requests such as PROPFIND, HEAD, etc. Those
    # interactions are typically served by the front end servers. We want to
    # keep the connection to the front end servers open, to reduce the cost
    # associated to TCP and TLS handshaking for each new request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # The data session is used for interaction with the front end servers which
    # typically redirect to the back end servers for serving our PUT and GET
    # requests. We attempt to keep a single connection open with the front end
    # server, if possible. This depends on how the server behaves and the
    # kind of request. Some servers close the connection when redirecting
    # the client to a back end server, for instance when serving a PUT
    # request.
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. We need to store this
    # to replace sessions created by a parent process and inherited by a
    # child process after a fork, to avoid confusing the SSL layer.
    _pid: int = -1

725 

726 @property 

727 def metadata_session(self) -> requests.Session: 

728 """Client session to send requests which do not require upload or 

729 download of data, i.e. mostly metadata requests. 

730 """ 

731 if hasattr(self, "_metadata_session"): 

732 if HttpResourcePath._pid == os.getpid(): 

733 return self._metadata_session 

734 else: 

735 # The metadata session we have in cache was likely created by 

736 # a parent process. Discard all the sessions in that store. 

737 self._metadata_session_store.clear() 

738 

739 # Retrieve a new metadata session. 

740 HttpResourcePath._pid = os.getpid() 

741 self._metadata_session: requests.Session = self._metadata_session_store.get(self) 

742 return self._metadata_session 

743 

744 @property 

745 def data_session(self) -> requests.Session: 

746 """Client session for uploading and downloading data.""" 

747 if hasattr(self, "_data_session"): 

748 if HttpResourcePath._pid == os.getpid(): 

749 return self._data_session 

750 else: 

751 # The data session we have in cache was likely created by 

752 # a parent process. Discard all the sessions in that store. 

753 self._data_session_store.clear() 

754 

755 # Retrieve a new data session. 

756 HttpResourcePath._pid = os.getpid() 

757 self._data_session: requests.Session = self._data_session_store.get(self) 

758 return self._data_session 

759 

760 def _clear_sessions(self) -> None: 

761 """Close the socket connections that are still open. 

762 

763 Used only in test suites to avoid warnings. 

764 """ 

765 self._metadata_session_store.clear() 

766 self._data_session_store.clear() 

767 

768 if hasattr(self, "_metadata_session"): 

769 delattr(self, "_metadata_session") 

770 

771 if hasattr(self, "_data_session"): 

772 delattr(self, "_data_session") 

773 

774 @property 

775 def is_webdav_endpoint(self) -> bool: 

776 """Check if the current endpoint implements WebDAV features. 

777 

778 This is stored per URI but cached by root so there is 

779 only one check per hostname. 

780 """ 

781 if self._is_webdav is not None: 

782 return self._is_webdav 

783 

784 self._is_webdav = _is_webdav_endpoint(self.root_uri()) 

785 return self._is_webdav 

786 

787 def exists(self) -> bool: 

788 """Check that a remote HTTP resource exists.""" 

789 log.debug("Checking if resource exists: %s", self.geturl()) 

790 if not self.is_webdav_endpoint: 

791 # The remote is a plain HTTP server. Let's attempt a HEAD 

792 # request, even if the behavior for such a request against a 

793 # directory is not specified, so it depends on the server 

794 # implementation. 

795 resp = self.metadata_session.head( 

796 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

797 ) 

798 return resp.status_code == requests.codes.ok # 200 

799 

800 # The remote endpoint is a webDAV server: send a PROPFIND request 

801 # to determine if it exists. 

802 resp = self._propfind() 

803 if resp.status_code == requests.codes.multi_status: # 207 

804 prop = _parse_propfind_response_body(resp.text)[0] 

805 return prop.exists 

806 else: # 404 Not Found 

807 return False 

808 

809 def size(self) -> int: 

810 """Return the size of the remote resource in bytes.""" 

811 if self.dirLike: 

812 return 0 

813 

814 if not self.is_webdav_endpoint: 

815 # The remote is a plain HTTP server. Send a HEAD request to 

816 # retrieve the size of the resource. 

817 resp = self.metadata_session.head( 

818 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

819 ) 

820 if resp.status_code == requests.codes.ok: # 200 

821 if "Content-Length" in resp.headers: 

822 return int(resp.headers["Content-Length"]) 

823 else: 

824 raise ValueError( 

825 f"Response to HEAD request to {self} does not contain 'Content-Length' header" 

826 ) 

827 elif resp.status_code == requests.codes.not_found: 

828 raise FileNotFoundError( 

829 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

830 ) 

831 else: 

832 raise ValueError( 

833 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} " 

834 f"{resp.reason}" 

835 ) 

836 

837 # The remote is a webDAV server: send a PROPFIND request to retrieve 

838 # the size of the resource. Sizes are only meaningful for files. 

839 resp = self._propfind() 

840 if resp.status_code == requests.codes.multi_status: # 207 

841 prop = _parse_propfind_response_body(resp.text)[0] 

842 if prop.is_file: 

843 return prop.size 

844 elif prop.is_directory: 

845 raise IsADirectoryError( 

846 f"Resource {self} is reported by server as a directory but has a file path" 

847 ) 

848 else: 

849 raise FileNotFoundError(f"Resource {self} does not exist") 

850 else: # 404 Not Found 

851 raise FileNotFoundError( 

852 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

853 ) 

854 

855 def mkdir(self) -> None: 

856 """Create the directory resource if it does not already exist.""" 

857 # Creating directories is only available on WebDAV back ends. 

858 if not self.is_webdav_endpoint: 

859 raise NotImplementedError( 

860 f"Creation of directory {self} is not implemented by plain HTTP servers" 

861 ) 

862 

863 if not self.dirLike: 

864 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}") 

865 

866 # Check if the target directory already exists. 

867 resp = self._propfind() 

868 if resp.status_code == requests.codes.multi_status: # 207 

869 prop = _parse_propfind_response_body(resp.text)[0] 

870 if prop.exists: 

871 if prop.is_directory: 

872 return 

873 else: 

874 # A file exists at this path 

875 raise NotADirectoryError( 

876 f"Can not create a directory for {self} because a file already exists at that path" 

877 ) 

878 

879 # Target directory does not exist. Create it and its ancestors as 

880 # needed. We need to test if parent URL is different from self URL, 

881 # otherwise we could be stuck in a recursive loop 

882 # where self == parent. 

883 if self.geturl() != self.parent().geturl(): 

884 self.parent().mkdir() 

885 

886 log.debug("Creating new directory: %s", self.geturl()) 

887 self._mkcol() 

888 

889 def remove(self) -> None: 

890 """Remove the resource.""" 

891 self._delete() 

892 

893 def read(self, size: int = -1) -> bytes: 

894 """Open the resource and return the contents in bytes. 

895 

896 Parameters 

897 ---------- 

898 size : `int`, optional 

899 The number of bytes to read. Negative or omitted indicates 

900 that all data should be read. 

901 """ 

902 # Use the data session as a context manager to ensure that the 

903 # network connections to both the front end and back end servers are 

904 # closed after downloading the data. 

905 log.debug("Reading from remote resource: %s", self.geturl()) 

906 stream = size > 0 

907 with self.data_session as session: 

908 with time_this(log, msg="GET %s", args=(self,)): 

909 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout) 

910 

911 if resp.status_code != requests.codes.ok: # 200 

912 raise FileNotFoundError( 

913 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}" 

914 ) 

915 if not stream: 

916 return resp.content 

917 else: 

918 return next(resp.iter_content(chunk_size=size)) 

919 

920 def write(self, data: bytes, overwrite: bool = True) -> None: 

921 """Write the supplied bytes to the new resource. 

922 

923 Parameters 

924 ---------- 

925 data : `bytes` 

926 The bytes to write to the resource. The entire contents of the 

927 resource will be replaced. 

928 overwrite : `bool`, optional 

929 If `True` the resource will be overwritten if it exists. Otherwise 

930 the write will fail. 

931 """ 

932 log.debug("Writing to remote resource: %s", self.geturl()) 

933 if not overwrite and self.exists(): 

934 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled") 

935 

936 # Ensure the parent directory exists. 

937 # This is only meaningful and appropriate for WebDAV, not the general 

938 # HTTP case. e.g. for S3 HTTP URLs, the underlying service has no 

939 # concept of 'directories' at all. 

940 if self.is_webdav_endpoint: 

941 self.parent().mkdir() 

942 

943 # Upload the data. 

944 log.debug("Writing data to remote resource: %s", self.geturl()) 

945 self._put(data=data) 

946 

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Whether overwriting the remote resource is allowed or not.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if `transfer` is not one of the supported transfer modes.
        FileExistsError
            Raised if the destination exists and `overwrite` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        # Resolve the default transfer mode only after the cheap checks above.
        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()

1013 

1014 def walk( 

1015 self, file_filter: str | re.Pattern | None = None 

1016 ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]: 

1017 """Walk the directory tree returning matching files and directories. 

1018 

1019 Parameters 

1020 ---------- 

1021 file_filter : `str` or `re.Pattern`, optional 

1022 Regex to filter out files from the list before it is returned. 

1023 

1024 Yields 

1025 ------ 

1026 dirpath : `ResourcePath` 

1027 Current directory being examined. 

1028 dirnames : `list` of `str` 

1029 Names of subdirectories within dirpath. 

1030 filenames : `list` of `str` 

1031 Names of all the files within dirpath. 

1032 """ 

1033 if not self.dirLike: 

1034 raise ValueError("Can not walk a non-directory URI") 

1035 

1036 # Walking directories is only available on WebDAV back ends. 

1037 if not self.is_webdav_endpoint: 

1038 raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers") 

1039 

1040 if isinstance(file_filter, str): 

1041 file_filter = re.compile(file_filter) 

1042 

1043 resp = self._propfind(depth="1") 

1044 if resp.status_code == requests.codes.multi_status: # 207 

1045 files: list[str] = [] 

1046 dirs: list[str] = [] 

1047 

1048 for prop in _parse_propfind_response_body(resp.text): 

1049 if prop.is_file: 

1050 files.append(prop.name) 

1051 elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")): 

1052 # Only include the names of sub-directories not the name of 

1053 # the directory being walked. 

1054 dirs.append(prop.name) 

1055 

1056 if file_filter is not None: 

1057 files = [f for f in files if file_filter.search(f)] 

1058 

1059 if not dirs and not files: 

1060 return 

1061 else: 

1062 yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files 

1063 

1064 for dir in dirs: 

1065 new_uri = self.join(dir, forceDirectory=True) 

1066 yield from new_uri.walk(file_filter) 

1067 

    def _as_local(self) -> tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the server does not respond with '200 OK'.
        ValueError
            Raised if the size of the downloaded file does not match the
            'Content-Length' header sent by the server.
        """
        # Use the session as a context manager to ensure that connections
        # to both the front end and back end servers are closed after the
        # download operation is finished.
        with self.data_session as session:
            resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
            if resp.status_code != requests.codes.ok:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
                )

            # 'buffering' also serves as the chunk size used to stream the
            # response body to disk.
            tmpdir, buffering = _get_temp_dir()
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                # -1 signals that the server did not advertise a length.
                expected_length = int(resp.headers.get("Content-Length", "-1"))
                with time_this(
                    log,
                    msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                    args=(self, expected_length, tmpFile.name, buffering),
                    mem_usage=self._config.collect_memory_usage,
                    mem_unit=u.mebibyte,
                ):
                    content_length = 0
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
                        content_length += len(chunk)

                # Check that the expected and actual content lengths match. Perform
                # this check only when the contents of the file was not encoded by
                # the server.
                if (
                    "Content-Encoding" not in resp.headers
                    and expected_length >= 0
                    and expected_length != content_length
                ):
                    raise ValueError(
                        f"Size of downloaded file does not match value in Content-Length header for {self}: "
                        f"expecting {expected_length} and got {content_length} bytes"
                    )

                return tmpFile.name, True

1119 

    def _send_webdav_request(
        self,
        method: str,
        url: str | None = None,
        headers: dict[str, str] | None = None,
        body: str | None = None,
        session: requests.Session | None = None,
        timeout: tuple[float, float] | None = None,
    ) -> requests.Response:
        """Send a webDAV request and correctly handle redirects.

        Parameters
        ----------
        method : `str`
            The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
        url : `str`, optional
            Target URL of the request. Defaults to this resource's URL.
        headers : `dict`, optional
            A dictionary of key-value pairs (both strings) to include as
            headers in the request.
        body : `str`, optional
            The body of the request.
        session : `requests.Session`, optional
            Session used to send the request. Defaults to the metadata
            session.
        timeout : `tuple[float, float]`, optional
            Connect and read timeouts. Defaults to the configured timeouts.

        Returns
        -------
        response : `requests.Response`
            The first non-redirect response received.

        Raises
        ------
        ValueError
            Raised if no non-redirect response was received after following
            the maximum allowed number of redirections.

        Notes
        -----
        This way of sending webDAV requests is necessary for handling
        redirection ourselves, since the 'requests' package changes the method
        of the redirected request when the server responds with status 302 and
        the method of the original request is not HEAD (which is the case for
        webDAV requests).

        That means that when the webDAV server we interact with responds with
        a redirection to a PROPFIND or MKCOL request, the request gets
        converted to a GET request when sent to the redirected location.

        See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

        This behavior of the 'requests' package is meant to be compatible with
        what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

        For our purposes, we do need to follow the redirection and send a new
        request using the same HTTP verb.
        """
        if url is None:
            url = self.geturl()

        if headers is None:
            headers = {}

        if session is None:
            session = self.metadata_session

        if timeout is None:
            timeout = self._config.timeout

        with time_this(
            log,
            msg="%s %s",
            args=(
                method,
                url,
            ),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            # Follow redirections manually, re-sending the SAME method to the
            # 'Location' target each time, up to 'max_redirects' hops.
            for _ in range(max_redirects := 5):
                resp = session.request(
                    method,
                    url,
                    data=body,
                    headers=headers,
                    stream=False,
                    timeout=timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]
                else:
                    return resp

            # We reached the maximum allowed number of redirects.
            # Stop trying.
            raise ValueError(
                f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
            )

1206 

1207 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response: 

1208 """Send a PROPFIND webDAV request and return the response. 

1209 

1210 Parameters 

1211 ---------- 

1212 body : `str`, optional 

1213 The body of the PROPFIND request to send to the server. If 

1214 provided, it is expected to be a XML document. 

1215 depth : `str`, optional 

1216 The value of the 'Depth' header to include in the request. 

1217 

1218 Returns 

1219 ------- 

1220 response : `requests.Response` 

1221 Response to the PROPFIND request. 

1222 

1223 Notes 

1224 ----- 

1225 It raises `ValueError` if the status code of the PROPFIND request 

1226 is different from "207 Multistatus" or "404 Not Found". 

1227 """ 

1228 if body is None: 

1229 # Request only the DAV live properties we are explicitly interested 

1230 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified' 

1231 # and 'displayname'. 

1232 body = ( 

1233 """<?xml version="1.0" encoding="utf-8" ?>""" 

1234 """<D:propfind xmlns:D="DAV:"><D:prop>""" 

1235 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>""" 

1236 """</D:prop></D:propfind>""" 

1237 ) 

1238 headers = { 

1239 "Depth": depth, 

1240 "Content-Type": 'application/xml; charset="utf-8"', 

1241 "Content-Length": str(len(body)), 

1242 } 

1243 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body) 

1244 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found): 

1245 return resp 

1246 else: 

1247 raise ValueError( 

1248 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} " 

1249 f"{resp.reason}" 

1250 ) 

1251 

1252 def _options(self) -> requests.Response: 

1253 """Send a OPTIONS webDAV request for this resource.""" 

1254 resp = self._send_webdav_request("OPTIONS") 

1255 if resp.status_code in (requests.codes.ok, requests.codes.created): 

1256 return resp 

1257 

1258 raise ValueError( 

1259 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}" 

1260 ) 

1261 

1262 def _head(self) -> requests.Response: 

1263 """Send a HEAD webDAV request for this resource.""" 

1264 return self._send_webdav_request("HEAD") 

1265 

1266 def _mkcol(self) -> None: 

1267 """Send a MKCOL webDAV request to create a collection. The collection 

1268 may already exist. 

1269 """ 

1270 resp = self._send_webdav_request("MKCOL") 

1271 if resp.status_code == requests.codes.created: # 201 

1272 return 

1273 

1274 if resp.status_code == requests.codes.method_not_allowed: # 405 

1275 # The remote directory already exists 

1276 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl()) 

1277 else: 

1278 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}") 

1279 

1280 def _delete(self) -> None: 

1281 """Send a DELETE webDAV request for this resource.""" 

1282 log.debug("Deleting %s ...", self.geturl()) 

1283 

1284 # If this is a directory, ensure the remote is a webDAV server because 

1285 # plain HTTP servers don't support DELETE requests on non-file 

1286 # paths. 

1287 if self.dirLike and not self.is_webdav_endpoint: 

1288 raise NotImplementedError( 

1289 f"Deletion of directory {self} is not implemented by plain HTTP servers" 

1290 ) 

1291 

1292 # Deleting non-empty directories may take some time, so increase 

1293 # the timeout for getting a response from the server. 

1294 timeout = self._config.timeout 

1295 if self.dirLike: 

1296 timeout = (timeout[0], timeout[1] * 100) 

1297 resp = self._send_webdav_request("DELETE", timeout=timeout) 

1298 if resp.status_code in ( 

1299 requests.codes.ok, 

1300 requests.codes.accepted, 

1301 requests.codes.no_content, 

1302 requests.codes.not_found, 

1303 ): 

1304 # We can get a "404 Not Found" error when the file or directory 

1305 # does not exist or when the DELETE request was retried several 

1306 # times and a previous attempt actually deleted the resource. 

1307 # Therefore we consider that a "Not Found" response is not an 

1308 # error since we reached the state desired by the user. 

1309 return 

1310 else: 

1311 # TODO: the response to a DELETE request against a webDAV server 

1312 # may be multistatus. If so, we need to parse the reponse body to 

1313 # determine more precisely the reason of the failure (e.g. a lock) 

1314 # and provide a more helpful error message. 

1315 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}") 

1316 

1317 def _copy_via_local(self, src: ResourcePath) -> None: 

1318 """Replace the contents of this resource with the contents of a remote 

1319 resource by using a local temporary file. 

1320 

1321 Parameters 

1322 ---------- 

1323 src : `HttpResourcePath` 

1324 The source of the contents to copy to `self`. 

1325 """ 

1326 with src.as_local() as local_uri: 

1327 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri) 

1328 with open(local_uri.ospath, "rb") as f: 

1329 self._put(data=f) 

1330 

1331 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None: 

1332 """Send a COPY or MOVE webDAV request to copy or replace the contents 

1333 of this resource with the contents of another resource located in the 

1334 same server. 

1335 

1336 Parameters 

1337 ---------- 

1338 method : `str` 

1339 The method to perform. Valid values are "COPY" or "MOVE" (in 

1340 uppercase). 

1341 src : `HttpResourcePath` 

1342 The source of the contents to move to `self`. 

1343 """ 

1344 headers = {"Destination": self.geturl()} 

1345 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session) 

1346 if resp.status_code in (requests.codes.created, requests.codes.no_content): 

1347 return 

1348 

1349 if resp.status_code == requests.codes.multi_status: 

1350 tree = eTree.fromstring(resp.content) 

1351 status_element = tree.find("./{DAV:}response/{DAV:}status") 

1352 status = status_element.text if status_element is not None else "unknown" 

1353 error = tree.find("./{DAV:}response/{DAV:}error") 

1354 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}") 

1355 else: 

1356 raise ValueError( 

1357 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}" 

1358 ) 

1359 

1360 def _copy(self, src: HttpResourcePath) -> None: 

1361 """Send a COPY webDAV request to replace the contents of this resource 

1362 (if any) with the contents of another resource located in the same 

1363 server. 

1364 

1365 Parameters 

1366 ---------- 

1367 src : `HttpResourcePath` 

1368 The source of the contents to copy to `self`. 

1369 """ 

1370 # Neither dCache nor XrootD currently implement the COPY 

1371 # webDAV method as documented in 

1372 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY 

1373 # (See issues DM-37603 and DM-37651 for details) 

1374 # 

1375 # For the time being, we use a temporary local file to 

1376 # perform the copy client side. 

1377 # TODO: when those 2 issues above are solved remove the 3 lines below. 

1378 must_use_local = True 

1379 if must_use_local: 

1380 return self._copy_via_local(src) 

1381 

1382 return self._copy_or_move("COPY", src) 

1383 

1384 def _move(self, src: HttpResourcePath) -> None: 

1385 """Send a MOVE webDAV request to replace the contents of this resource 

1386 with the contents of another resource located in the same server. 

1387 

1388 Parameters 

1389 ---------- 

1390 src : `HttpResourcePath` 

1391 The source of the contents to move to `self`. 

1392 """ 

1393 return self._copy_or_move("MOVE", src) 

1394 

    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the final PUT request does not succeed with status
            "200 OK", "201 Created" or "204 No Content".
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                # If the server redirected us, upload directly to the
                # redirected location; otherwise reuse the original URL.
                if resp.is_redirect:
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithhms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")

1480 

1481 @contextlib.contextmanager 

1482 def _openImpl( 

1483 self, 

1484 mode: str = "r", 

1485 *, 

1486 encoding: str | None = None, 

1487 ) -> Iterator[ResourceHandleProtocol]: 

1488 resp = self._head() 

1489 accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes" 

1490 handle: ResourceHandleProtocol 

1491 if mode in ("rb", "r") and accepts_range: 

1492 handle = HttpReadResourceHandle( 

1493 mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout 

1494 ) 

1495 if mode == "r": 

1496 # cast because the protocol is compatible, but does not have 

1497 # BytesIO in the inheritance tree 

1498 yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding) 

1499 else: 

1500 yield handle 

1501 else: 

1502 with super()._openImpl(mode, encoding=encoding) as http_handle: 

1503 yield http_handle 

1504 

1505 

def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    if request.method == "PUT":
        # Do not log potentially huge upload payloads.
        log.debug(" body=<data>")
    elif request.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", request.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        # Truncate binary or structured bodies to keep logs readable.
        log.debug(" body=%r", resp.content[:80])

1539 

1540 

def _is_protected(filepath: str) -> bool:
    """Return true if the permissions of file at filepath only allow for access
    by its owner.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.
    """
    if not os.path.isfile(filepath):
        return False
    permissions = stat.S_IMODE(os.stat(filepath).st_mode)
    # The owner must have at least one access bit set...
    if not permissions & stat.S_IRWXU:
        return False
    # ...while group and others must have none.
    return not permissions & (stat.S_IRWXG | stat.S_IRWXO)

1557 

1558 

def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request.

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'response' element found in `body`.

    Raises
    ------
    ValueError
        Raised when no 'response' element could be extracted from `body`.

    Notes
    -----
    It is expected that there is at least one response in `body`, otherwise
    this function raises.
    """
    # A response body to a PROPFIND request is of the form (indented for
    # readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #   <D:response>
    #     <D:href>path/to/resource</D:href>
    #     <D:propstat>
    #       <D:prop>
    #         <D:resourcetype>
    #           <D:collection xmlns:D="DAV:"/>
    #         </D:resourcetype>
    #         <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
    #         <D:getcontentlength>12345</D:getcontentlength>
    #       </D:prop>
    #       <D:status>HTTP/1.1 200 OK</D:status>
    #     </D:propstat>
    #   </D:response>
    #   <D:response>
    #     ...
    #   </D:response>
    # </D:multistatus>

    # Scan all the 'response' elements and extract the relevant properties.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(response) for response in multistatus.findall("./{DAV:}response")]

    if not responses:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return responses

1620 

1621 

class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.

    Parameters
    ----------
    response : `eTree.Element` or `None`
        The XML response defining the DAV property.
    """

    # Regular expression to compare against the 'status' element of a
    # PROPFIND response's 'propstat' element.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: eTree.Element | None):
        # Defaults describe a resource the server reported nothing about:
        # not a collection and with an unknown (-1) content length.
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    def _parse(self, response: eTree.Element) -> None:
        # 'href' is mandatory: without it we cannot identify the resource.
        href_element = response.find("./{DAV:}href")
        if href_element is None:
            raise ValueError(
                "Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )
        # str() around the element text keeps mypy happy about Optional.
        self._href = str(href_element.text).strip()

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties of interest with status OK.
            status = propstat.find("./{DAV:}status")
            if status is None or not self._status_ok_rex.match(str(status.text)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # Presence of a 'collection' resource type marks a directory.
                if prop.find("./{DAV:}resourcetype/{DAV:}collection") is not None:
                    self._collection = True

                if (modified := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = str(modified.text)

                if (length := prop.find("./{DAV:}getcontentlength")) is not None:
                    self._getcontentlength = int(str(length.text))

                if (display := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = str(display.text)

        # Some webDAV servers don't include the 'displayname' property in the
        # response so try to infer it from the value of the 'href' property.
        # Depending on the server the href value may end with '/'.
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Force a size of 0 for collections.
        if self._collection:
            self._getcontentlength = 0

    @property
    def exists(self) -> bool:
        """Whether the server reported this resource: either a collection or
        a file with a non-negative length."""
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        """Whether the resource is a collection."""
        return self._collection

    @property
    def is_file(self) -> bool:
        """Whether the resource is a file, i.e. not a collection."""
        return not self._collection

    @property
    def size(self) -> int:
        """Size in bytes: 0 for collections, -1 when the server did not
        report a content length."""
        return self._getcontentlength

    @property
    def name(self) -> str:
        """Display name of the resource, possibly inferred from 'href'."""
        return self._displayname

    @property
    def href(self) -> str:
        """The 'href' value identifying the resource on the server."""
        return self._href