Coverage for python/lsst/resources/http.py: 23%

590 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-19 11:17 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import math 

21import os 

22import os.path 

23import random 

24import re 

25import stat 

26import tempfile 

27from collections.abc import Iterator 

28from typing import TYPE_CHECKING, BinaryIO, cast 

29 

30try: 

31 # Prefer 'defusedxml' (not part of standard library) if available, since 

32 # 'xml' is vulnerable to XML bombs. 

33 import defusedxml.ElementTree as eTree 

34except ImportError: 

35 import xml.etree.ElementTree as eTree 

36 

37from urllib.parse import parse_qs 

38 

39import requests 

40from astropy import units as u 

41from lsst.utils.timer import time_this 

42from requests.adapters import HTTPAdapter 

43from requests.auth import AuthBase 

44from urllib3.util.retry import Retry 

45 

46from ._resourceHandles import ResourceHandleProtocol 

47from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle, parse_content_range_header 

48from ._resourcePath import ResourcePath 

49 

50if TYPE_CHECKING: 

51 from .utils import TransactionProtocol 

52 

53log = logging.getLogger(__name__) 

54 

55 

def _timeout_from_environment(env_var: str, default_value: float) -> float:
    """Convert and return a timeout from the value of an environment variable
    or a default value if the environment variable is not initialized. The
    value of `env_var` must be a valid `float` otherwise this function raises.

    Parameters
    ----------
    env_var : `str`
        Environment variable to look for.
    default_value : `float`
        Value to return if `env_var` is not defined in the environment.

    Returns
    -------
    _timeout_from_environment : `float`
        Converted value.

    Raises
    ------
    ValueError
        If the value of `env_var` cannot be converted to `float` or
        converts to NaN.
    """
    try:
        timeout = float(os.environ.get(env_var, default_value))
    except ValueError:
        raise ValueError(
            f"Expecting valid timeout value in environment variable {env_var} but found "
            f"{os.environ.get(env_var)}"
        ) from None

    # A NaN timeout would silently disable timeouts downstream, so reject it
    # explicitly.
    if math.isnan(timeout):
        raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}")

    return timeout

85 

86 

class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    All values are computed lazily from environment variables on first access
    and cached on the instance afterwards.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Cached values for the properties below. None means "not yet computed
    # from the environment".
    _front_end_connections: int | None = None
    _back_end_connections: int | None = None
    _digest_algorithm: str | None = None
    _send_expect_on_put: bool | None = None
    _timeout: tuple[float, float] | None = None
    _collect_memory_usage: bool | None = None
    _backoff_min: float | None = None
    _backoff_max: float | None = None

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is not None:
            return self._front_end_connections

        try:
            self._front_end_connections = int(
                os.environ.get(
                    "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
                )
            )
        except ValueError:
            # An unparsable value silently falls back to the default.
            self._front_end_connections = self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS

        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is not None:
            return self._back_end_connections

        try:
            self._back_end_connections = int(
                os.environ.get(
                    "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
                )
            )
        except ValueError:
            # An unparsable value silently falls back to the default.
            self._back_end_connections = self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS

        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm: `str`
            The name of a digest algorithm or the empty string if no algorithm
            is configured.
        """
        if self._digest_algorithm is not None:
            return self._digest_algorithm

        # Only the algorithms in ACCEPTED_DIGESTS are honored; anything else
        # (including an unset variable) disables digest computation.
        digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
        if digest not in self.ACCEPTED_DIGESTS:
            digest = ""

        self._digest_algorithm = digest
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) use this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is not None:
            return self._send_expect_on_put

        # Only the presence of the variable matters, not its value.
        self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.
        """
        if self._timeout is not None:
            return self._timeout

        self._timeout = (
            _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT),
            _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ),
        )
        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is not None:
            return self._collect_memory_usage

        # Only the presence of the variable matters, not its value.
        self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is not None:
            return self._backoff_min

        # Unparsable or NaN values silently fall back to the default.
        self._backoff_min = self.DEFAULT_BACKOFF_MIN
        try:
            backoff_min = float(os.environ.get("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN))
            if not math.isnan(backoff_min):
                self._backoff_min = backoff_min
        except ValueError:
            pass

        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is not None:
            return self._backoff_max

        # Unparsable or NaN values silently fall back to the default.
        self._backoff_max = self.DEFAULT_BACKOFF_MAX
        try:
            backoff_max = float(os.environ.get("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX))
            if not math.isnan(backoff_max):
                self._backoff_max = backoff_max
        except ValueError:
            pass

        return self._backoff_max

251 

252 

@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        If the server certificate could not be verified.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Create a session for configuring retries
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(
            [
                "OPTIONS",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )
            if resp.status_code not in (requests.codes.ok, requests.codes.created):
                return False

            # Check that "1" is part of the value of the "DAV" header. We don't
            # use locks, so a server complying to class 1 is enough for our
            # purposes. All webDAV servers must advertise at least compliance
            # class "1".
            #
            # Compliance classes are documented in
            # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
            #
            # Examples of values for header DAV are:
            #    DAV: 1, 2
            #    DAV: 1, <http://apache.org/dav/propset/fs/1>
            if "DAV" not in resp.headers:
                return False
            else:
                # Convert to str to keep mypy happy
                compliance_class = str(resp.headers.get("DAV"))
                return "1" in compliance_class.replace(" ", "").split(",")

    except requests.exceptions.SSLError as e:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        raise e

351 

352 

# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system.
_TMPDIR: tuple[str, int] | None = None


def _get_temp_dir() -> tuple[str, int]:
    """Return the temporary directory path and block size.

    This function caches its results in _TMPDIR.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # Pick the first environment override that points to an existing
    # directory ('LSST_RESOURCES_TMPDIR' takes precedence over 'TMPDIR'),
    # falling back to the current working directory.
    selected = os.getcwd()
    for candidate in (os.getenv(name) for name in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
        if candidate and os.path.isdir(candidate):
            selected = candidate
            break

    # Compute the block size as 256 blocks of typical size
    # (i.e. 4096 bytes) or 10 times the file system block size,
    # whichever is higher. This is a reasonable compromise between
    # using memory for buffering and the number of system calls
    # issued to read from or write to temporary files.
    stats = os.statvfs(selected)
    block_size = max(10 * stats.f_bsize, 256 * 4096)
    _TMPDIR = (selected, block_size)
    return _TMPDIR

382 

383 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if not os.path.isfile(token):
            # The argument is the token value itself; nothing else to do.
            return

        # The token lives in a local file: record its absolute path, ensure
        # only its owner can access it, then load its contents.
        self._path = os.path.abspath(token)
        if not _is_protected(self._path):
            raise PermissionError(
                f"Bearer token file at {self._path} must be protected for access only by its owner"
            )
        self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # File unchanged since our last read: keep the cached token.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Only add a bearer token to a request when using secure HTTP.
        url = req.url
        if self._token and url and url.lower().startswith("https://"):
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

428 

429 

class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep: there is one pool per remote
        host.
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Minimum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    backoff_max : `float`, optional
        Maximum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the exponential
        # backoff factor when retrying requests (seconds). If the caller
        # passed a maximum not strictly greater than the minimum, widen the
        # interval by one second so a random jitter can still be drawn.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Returns
        -------
        get : `requests.Session`
            Cached or newly-created session for the endpoint of `rpath`.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        # The session cache is keyed by the root URI (scheme + host + port),
        # so all paths under the same endpoint share one session.
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        # Note: the bearer token takes precedence over a client certificate,
        # so we return early if one is configured.
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

656 

657 

658class HttpResourcePath(ResourcePath): 

659 """General HTTP(S) resource. 

660 

661 Notes 

662 ----- 

663 In order to configure the behavior of instances of this class, the 

664 environment variables below are inspected: 

665 

666 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a 

667 "Expect: 100-Continue" header will be added to all HTTP PUT requests. 

668 This header is required by some servers to detect if the client 

669 knows how to handle redirections. In case of redirection, the body 

670 of the PUT request is sent to the redirected location and not to 

671 the front end server. 

672 

673 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a 

674 numeric value, they are interpreted as the number of seconds to wait 

675 for establishing a connection with the server and for reading its 

676 response, respectively. 

677 

678 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and 

679 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number 

680 of connections to attempt to persist with both the front end servers 

681 and the back end servers. 

682 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and 

683 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS. 

684 

685 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to 

686 ask the server to compute for every file's content sent to the server 

687 via a PUT request. No digest is requested if this variable is not set 

688 or is set to an invalid value. 

689 Valid values are those in ACCEPTED_DIGESTS. 

690 """ 

691 

    # Cached result of the WebDAV capability check for this URI's endpoint;
    # None means the check has not been performed yet.
    _is_webdav: bool | None = None

    # Configuration items for this class instances.
    _config = HttpResourcePathConfig()

    # The session for metadata requests is used for interacting with
    # the front end servers for requests such as PROPFIND, HEAD, etc. Those
    # interactions are typically served by the front end servers. We want to
    # keep the connection to the front end servers open, to reduce the cost
    # associated to TCP and TLS handshaking for each new request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # The data session is used for interaction with the front end servers which
    # typically redirect to the back end servers for serving our PUT and GET
    # requests. We attempt to keep a single connection open with the front end
    # server, if possible. This depends on how the server behaves and the
    # kind of request. Some servers close the connection when redirecting
    # the client to a back end server, for instance when serving a PUT
    # request.
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. We need to store this
    # to replace sessions created by a parent process and inherited by a
    # child process after a fork, to avoid confusing the SSL layer.
    _pid: int = -1

727 

728 @property 

729 def metadata_session(self) -> requests.Session: 

730 """Client session to send requests which do not require upload or 

731 download of data, i.e. mostly metadata requests. 

732 """ 

733 if hasattr(self, "_metadata_session"): 

734 if HttpResourcePath._pid == os.getpid(): 

735 return self._metadata_session 

736 else: 

737 # The metadata session we have in cache was likely created by 

738 # a parent process. Discard all the sessions in that store. 

739 self._metadata_session_store.clear() 

740 

741 # Retrieve a new metadata session. 

742 HttpResourcePath._pid = os.getpid() 

743 self._metadata_session: requests.Session = self._metadata_session_store.get(self) 

744 return self._metadata_session 

745 

746 @property 

747 def data_session(self) -> requests.Session: 

748 """Client session for uploading and downloading data.""" 

749 if hasattr(self, "_data_session"): 

750 if HttpResourcePath._pid == os.getpid(): 

751 return self._data_session 

752 else: 

753 # The data session we have in cache was likely created by 

754 # a parent process. Discard all the sessions in that store. 

755 self._data_session_store.clear() 

756 

757 # Retrieve a new data session. 

758 HttpResourcePath._pid = os.getpid() 

759 self._data_session: requests.Session = self._data_session_store.get(self) 

760 return self._data_session 

761 

762 def _clear_sessions(self) -> None: 

763 """Close the socket connections that are still open. 

764 

765 Used only in test suites to avoid warnings. 

766 """ 

767 self._metadata_session_store.clear() 

768 self._data_session_store.clear() 

769 

770 if hasattr(self, "_metadata_session"): 

771 delattr(self, "_metadata_session") 

772 

773 if hasattr(self, "_data_session"): 

774 delattr(self, "_data_session") 

775 

776 @property 

777 def is_webdav_endpoint(self) -> bool: 

778 """Check if the current endpoint implements WebDAV features. 

779 

780 This is stored per URI but cached by root so there is 

781 only one check per hostname. 

782 """ 

783 if self._is_webdav is not None: 

784 return self._is_webdav 

785 

786 self._is_webdav = _is_webdav_endpoint(self.root_uri()) 

787 return self._is_webdav 

788 

789 def exists(self) -> bool: 

790 """Check that a remote HTTP resource exists.""" 

791 log.debug("Checking if resource exists: %s", self.geturl()) 

792 if not self.is_webdav_endpoint: 

793 # The remote is a plain HTTP server. Let's attempt a HEAD 

794 # request, even if the behavior for such a request against a 

795 # directory is not specified, so it depends on the server 

796 # implementation. 

797 resp = self._head_non_webdav_url() 

798 return self._is_successful_non_webdav_head_request(resp) 

799 

800 # The remote endpoint is a webDAV server: send a PROPFIND request 

801 # to determine if it exists. 

802 resp = self._propfind() 

803 if resp.status_code == requests.codes.multi_status: # 207 

804 prop = _parse_propfind_response_body(resp.text)[0] 

805 return prop.exists 

806 else: # 404 Not Found 

807 return False 

808 

    def size(self) -> int:
        """Return the size of the remote resource in bytes."""
        # Directory-like paths are reported as size 0 without contacting
        # the server.
        if self.dirLike:
            return 0

        if not self.is_webdav_endpoint:
            # The remote is a plain HTTP server. Send a HEAD request to
            # retrieve the size of the resource.
            resp = self._head_non_webdav_url()
            if resp.status_code == requests.codes.ok:  # 200
                if "Content-Length" in resp.headers:
                    return int(resp.headers["Content-Length"])
                else:
                    raise ValueError(
                        f"Response to HEAD request to {self} does not contain 'Content-Length' header"
                    )
            elif resp.status_code == requests.codes.partial_content:
                # 206, returned from a GET request with a Range header (used to
                # emulate HEAD for presigned S3 URLs). In this case
                # Content-Length is the length of the Range and not the full
                # length of the file, so we have to parse Content-Range
                # instead.
                content_range_header = resp.headers.get("Content-Range")
                if content_range_header is None:
                    raise ValueError(
                        f"Response to GET request to {self} did not contain 'Content-Range' header"
                    )
                content_range = parse_content_range_header(content_range_header)
                size = content_range.total
                if size is None:
                    raise ValueError(f"Content-Range header for {self} did not include a total file size")
                return size

            elif resp.status_code == requests.codes.not_found:
                raise FileNotFoundError(
                    f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
                )
            else:
                raise ValueError(
                    f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
                    f"{resp.reason}"
                )

        # The remote is a webDAV server: send a PROPFIND request to retrieve
        # the size of the resource. Sizes are only meaningful for files.
        resp = self._propfind()
        if resp.status_code == requests.codes.multi_status:  # 207
            prop = _parse_propfind_response_body(resp.text)[0]
            if prop.is_file:
                return prop.size
            elif prop.is_directory:
                # The path looks like a file but the server reports a
                # directory: surface the inconsistency to the caller.
                raise IsADirectoryError(
                    f"Resource {self} is reported by server as a directory but has a file path"
                )
            else:
                raise FileNotFoundError(f"Resource {self} does not exist")
        else:  # 404 Not Found
            raise FileNotFoundError(
                f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
            )

869 

870 def _head_non_webdav_url(self) -> requests.Response: 

871 """Return a response from a HTTP HEAD request for a non-WebDAV HTTP 

872 URL. 

873 

874 Emulates HEAD using a 0-byte GET for presigned S3 URLs. 

875 """ 

876 if self._looks_like_presigned_s3_url(): 

877 # Presigned S3 URLs are signed for a single method only, so you 

878 # can't call HEAD on a URL signed for GET. However, S3 does 

879 # support Range requests, so you can ask for a 0-byte range with 

880 # GET for a similar effect to HEAD. 

881 # 

882 # Note that some headers differ between a true HEAD request and the 

883 # response returned by this GET, e.g. Content-Length will always be 

884 # 0, and the status code is 206 instead of 200. 

885 return self.metadata_session.get( 

886 self.geturl(), 

887 timeout=self._config.timeout, 

888 allow_redirects=True, 

889 stream=False, 

890 headers={"Range": "bytes=0-0"}, 

891 ) 

892 else: 

893 return self.metadata_session.head( 

894 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

895 ) 

896 

897 def _is_successful_non_webdav_head_request(self, resp: requests.Response) -> bool: 

898 """Return `True` if the status code in the response indicates a 

899 successful HEAD or GET request. 

900 """ 

901 return resp.status_code in ( 

902 requests.codes.ok, # 200, from a normal HEAD or GET request 

903 requests.codes.partial_content, # 206, returned from a GET request with a Range header. 

904 ) 

905 

906 def _looks_like_presigned_s3_url(self) -> bool: 

907 """Return `True` if this ResourcePath's URL is likely to be a presigned 

908 S3 URL. 

909 """ 

910 query_params = parse_qs(self._uri.query) 

911 return "Signature" in query_params and "Expires" in query_params 

912 

913 def mkdir(self) -> None: 

914 """Create the directory resource if it does not already exist.""" 

915 # Creating directories is only available on WebDAV back ends. 

916 if not self.is_webdav_endpoint: 

917 raise NotImplementedError( 

918 f"Creation of directory {self} is not implemented by plain HTTP servers" 

919 ) 

920 

921 if not self.dirLike: 

922 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}") 

923 

924 # Check if the target directory already exists. 

925 resp = self._propfind() 

926 if resp.status_code == requests.codes.multi_status: # 207 

927 prop = _parse_propfind_response_body(resp.text)[0] 

928 if prop.exists: 

929 if prop.is_directory: 

930 return 

931 else: 

932 # A file exists at this path 

933 raise NotADirectoryError( 

934 f"Can not create a directory for {self} because a file already exists at that path" 

935 ) 

936 

937 # Target directory does not exist. Create it and its ancestors as 

938 # needed. We need to test if parent URL is different from self URL, 

939 # otherwise we could be stuck in a recursive loop 

940 # where self == parent. 

941 if self.geturl() != self.parent().geturl(): 

942 self.parent().mkdir() 

943 

944 log.debug("Creating new directory: %s", self.geturl()) 

945 self._mkcol() 

946 

    def remove(self) -> None:
        """Remove the resource.

        Delegates to a webDAV DELETE request; see `_delete` for status-code
        handling and the errors it can raise.
        """
        self._delete()

950 

951 def read(self, size: int = -1) -> bytes: 

952 """Open the resource and return the contents in bytes. 

953 

954 Parameters 

955 ---------- 

956 size : `int`, optional 

957 The number of bytes to read. Negative or omitted indicates 

958 that all data should be read. 

959 """ 

960 # Use the data session as a context manager to ensure that the 

961 # network connections to both the front end and back end servers are 

962 # closed after downloading the data. 

963 log.debug("Reading from remote resource: %s", self.geturl()) 

964 stream = size > 0 

965 with self.data_session as session: 

966 with time_this(log, msg="GET %s", args=(self,)): 

967 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout) 

968 

969 if resp.status_code != requests.codes.ok: # 200 

970 raise FileNotFoundError( 

971 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}" 

972 ) 

973 if not stream: 

974 return resp.content 

975 else: 

976 return next(resp.iter_content(chunk_size=size)) 

977 

978 def write(self, data: bytes, overwrite: bool = True) -> None: 

979 """Write the supplied bytes to the new resource. 

980 

981 Parameters 

982 ---------- 

983 data : `bytes` 

984 The bytes to write to the resource. The entire contents of the 

985 resource will be replaced. 

986 overwrite : `bool`, optional 

987 If `True` the resource will be overwritten if it exists. Otherwise 

988 the write will fail. 

989 """ 

990 log.debug("Writing to remote resource: %s", self.geturl()) 

991 if not overwrite and self.exists(): 

992 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled") 

993 

994 # Ensure the parent directory exists. 

995 # This is only meaningful and appropriate for WebDAV, not the general 

996 # HTTP case. e.g. for S3 HTTP URLs, the underlying service has no 

997 # concept of 'directories' at all. 

998 if self.is_webdav_endpoint: 

999 self.parent().mkdir() 

1000 

1001 # Upload the data. 

1002 log.debug("Writing data to remote resource: %s", self.geturl()) 

1003 self._put(data=data) 

1004 

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Whether overwriting the remote resource is allowed or not.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the requested transfer mode is not supported.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()

1071 

    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not directory-like.
        NotImplementedError
            Raised if the remote server is not a webDAV endpoint.
        """
        if not self.dirLike:
            raise ValueError("Can not walk a non-directory URI")

        # Walking directories is only available on WebDAV back ends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        # Depth "1" lists the directory itself plus its immediate children.
        resp = self._propfind(depth="1")
        if resp.status_code == requests.codes.multi_status:  # 207
            files: list[str] = []
            dirs: list[str] = []

            for prop in _parse_propfind_response_body(resp.text):
                if prop.is_file:
                    files.append(prop.name)
                elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
                    # Only include the names of sub-directories not the name of
                    # the directory being walked.
                    dirs.append(prop.name)

            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]

            if not dirs and not files:
                return
            else:
                yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

            # Recurse into each sub-directory, depth-first.
            for dir in dirs:
                new_uri = self.join(dir, forceDirectory=True)
                yield from new_uri.walk(file_filter)

1125 

    def _as_local(self) -> tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised when the resource could not be downloaded (any non-200
            status).
        ValueError
            Raised when the number of bytes downloaded does not match the
            Content-Length header announced by the server.
        """
        # Use the session as a context manager to ensure that connections
        # to both the front end and back end servers are closed after the
        # download operation is finished.
        with self.data_session as session:
            resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
            if resp.status_code != requests.codes.ok:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
                )

            tmpdir, buffering = _get_temp_dir()
            # delete=False: the caller owns (and later removes) the file.
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                # -1 signals that the server did not announce a length.
                expected_length = int(resp.headers.get("Content-Length", "-1"))
                with time_this(
                    log,
                    msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                    args=(self, expected_length, tmpFile.name, buffering),
                    mem_usage=self._config.collect_memory_usage,
                    mem_unit=u.mebibyte,
                ):
                    content_length = 0
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
                        content_length += len(chunk)

            # Check that the expected and actual content lengths match. Perform
            # this check only when the contents of the file was not encoded by
            # the server.
            if (
                "Content-Encoding" not in resp.headers
                and expected_length >= 0
                and expected_length != content_length
            ):
                raise ValueError(
                    f"Size of downloaded file does not match value in Content-Length header for {self}: "
                    f"expecting {expected_length} and got {content_length} bytes"
                )

            return tmpFile.name, True

1177 

    def _send_webdav_request(
        self,
        method: str,
        url: str | None = None,
        headers: dict[str, str] | None = None,
        body: str | None = None,
        session: requests.Session | None = None,
        timeout: tuple[float, float] | None = None,
    ) -> requests.Response:
        """Send a webDAV request and correctly handle redirects.

        Parameters
        ----------
        method : `str`
            The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
        url : `str`, optional
            Target URL of the request. Defaults to this resource's URL.
        headers : `dict`, optional
            A dictionary of key-value pairs (both strings) to include as
            headers in the request.
        body : `str`, optional
            The body of the request.
        session : `requests.Session`, optional
            Session used to send the request. Defaults to the metadata
            session.
        timeout : `tuple` of `float`, optional
            (connect, read) timeouts for the request. Defaults to the
            configured timeouts.

        Returns
        -------
        response : `requests.Response`
            The first non-redirect response received.

        Raises
        ------
        ValueError
            Raised when the server keeps redirecting beyond the maximum
            allowed number of redirections (5).

        Notes
        -----
        This way of sending webDAV requests is necessary for handling
        redirection ourselves, since the 'requests' package changes the method
        of the redirected request when the server responds with status 302 and
        the method of the original request is not HEAD (which is the case for
        webDAV requests).

        That means that when the webDAV server we interact with responds with
        a redirection to a PROPFIND or MKCOL request, the request gets
        converted to a GET request when sent to the redirected location.

        See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

        This behavior of the 'requests' package is meant to be compatible with
        what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

        For our purposes, we do need to follow the redirection and send a new
        request using the same HTTP verb.
        """
        if url is None:
            url = self.geturl()

        if headers is None:
            headers = {}

        if session is None:
            session = self.metadata_session

        if timeout is None:
            timeout = self._config.timeout

        with time_this(
            log,
            msg="%s %s",
            args=(
                method,
                url,
            ),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            # Redirects are disabled on each request and followed manually so
            # that the method is preserved across hops (see Notes above).
            for _ in range(max_redirects := 5):
                resp = session.request(
                    method,
                    url,
                    data=body,
                    headers=headers,
                    stream=False,
                    timeout=timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]
                else:
                    return resp

            # We reached the maximum allowed number of redirects.
            # Stop trying.
            raise ValueError(
                f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
            )

1264 

1265 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response: 

1266 """Send a PROPFIND webDAV request and return the response. 

1267 

1268 Parameters 

1269 ---------- 

1270 body : `str`, optional 

1271 The body of the PROPFIND request to send to the server. If 

1272 provided, it is expected to be a XML document. 

1273 depth : `str`, optional 

1274 The value of the 'Depth' header to include in the request. 

1275 

1276 Returns 

1277 ------- 

1278 response : `requests.Response` 

1279 Response to the PROPFIND request. 

1280 

1281 Notes 

1282 ----- 

1283 It raises `ValueError` if the status code of the PROPFIND request 

1284 is different from "207 Multistatus" or "404 Not Found". 

1285 """ 

1286 if body is None: 

1287 # Request only the DAV live properties we are explicitly interested 

1288 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified' 

1289 # and 'displayname'. 

1290 body = ( 

1291 """<?xml version="1.0" encoding="utf-8" ?>""" 

1292 """<D:propfind xmlns:D="DAV:"><D:prop>""" 

1293 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>""" 

1294 """</D:prop></D:propfind>""" 

1295 ) 

1296 headers = { 

1297 "Depth": depth, 

1298 "Content-Type": 'application/xml; charset="utf-8"', 

1299 "Content-Length": str(len(body)), 

1300 } 

1301 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body) 

1302 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found): 

1303 return resp 

1304 else: 

1305 raise ValueError( 

1306 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} " 

1307 f"{resp.reason}" 

1308 ) 

1309 

1310 def _options(self) -> requests.Response: 

1311 """Send a OPTIONS webDAV request for this resource.""" 

1312 resp = self._send_webdav_request("OPTIONS") 

1313 if resp.status_code in (requests.codes.ok, requests.codes.created): 

1314 return resp 

1315 

1316 raise ValueError( 

1317 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}" 

1318 ) 

1319 

    def _head(self) -> requests.Response:
        """Send a HEAD webDAV request for this resource.

        Returns the raw response; no status-code check is performed here.
        """
        return self._send_webdav_request("HEAD")

1323 

1324 def _mkcol(self) -> None: 

1325 """Send a MKCOL webDAV request to create a collection. The collection 

1326 may already exist. 

1327 """ 

1328 resp = self._send_webdav_request("MKCOL") 

1329 if resp.status_code == requests.codes.created: # 201 

1330 return 

1331 

1332 if resp.status_code == requests.codes.method_not_allowed: # 405 

1333 # The remote directory already exists 

1334 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl()) 

1335 else: 

1336 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}") 

1337 

    def _delete(self) -> None:
        """Send a DELETE webDAV request for this resource.

        Raises
        ------
        NotImplementedError
            Raised for directory-like URIs when the remote is not a webDAV
            server.
        ValueError
            Raised when the server answers with a status other than 200,
            202, 204 or 404.
        """
        log.debug("Deleting %s ...", self.geturl())

        # If this is a directory, ensure the remote is a webDAV server because
        # plain HTTP servers don't support DELETE requests on non-file
        # paths.
        if self.dirLike and not self.is_webdav_endpoint:
            raise NotImplementedError(
                f"Deletion of directory {self} is not implemented by plain HTTP servers"
            )

        # Deleting non-empty directories may take some time, so increase
        # the timeout for getting a response from the server.
        timeout = self._config.timeout
        if self.dirLike:
            timeout = (timeout[0], timeout[1] * 100)
        resp = self._send_webdav_request("DELETE", timeout=timeout)
        if resp.status_code in (
            requests.codes.ok,
            requests.codes.accepted,
            requests.codes.no_content,
            requests.codes.not_found,
        ):
            # We can get a "404 Not Found" error when the file or directory
            # does not exist or when the DELETE request was retried several
            # times and a previous attempt actually deleted the resource.
            # Therefore we consider that a "Not Found" response is not an
            # error since we reached the state desired by the user.
            return
        else:
            # TODO: the response to a DELETE request against a webDAV server
            # may be multistatus. If so, we need to parse the reponse body to
            # determine more precisely the reason of the failure (e.g. a lock)
            # and provide a more helpful error message.
            raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")

1374 

1375 def _copy_via_local(self, src: ResourcePath) -> None: 

1376 """Replace the contents of this resource with the contents of a remote 

1377 resource by using a local temporary file. 

1378 

1379 Parameters 

1380 ---------- 

1381 src : `HttpResourcePath` 

1382 The source of the contents to copy to `self`. 

1383 """ 

1384 with src.as_local() as local_uri: 

1385 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri) 

1386 with open(local_uri.ospath, "rb") as f: 

1387 self._put(data=f) 

1388 

    def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
        """Send a COPY or MOVE webDAV request to copy or replace the contents
        of this resource with the contents of another resource located in the
        same server.

        Parameters
        ----------
        method : `str`
            The method to perform. Valid values are "COPY" or "MOVE" (in
            uppercase).
        src : `HttpResourcePath`
            The source of the contents to move to `self`.

        Raises
        ------
        ValueError
            Raised when the server does not answer with a success status
            (201 or 204), including when it answers with a 207 multistatus
            body describing a partial failure.
        """
        # The source is the request target; the destination is conveyed in
        # the 'Destination' header, per RFC 4918.
        headers = {"Destination": self.geturl()}
        resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
        if resp.status_code in (requests.codes.created, requests.codes.no_content):
            return

        if resp.status_code == requests.codes.multi_status:
            # A multistatus response carries per-resource failure details in
            # its XML body; surface the first status and error found.
            tree = eTree.fromstring(resp.content)
            status_element = tree.find("./{DAV:}response/{DAV:}status")
            status = status_element.text if status_element is not None else "unknown"
            error = tree.find("./{DAV:}response/{DAV:}error")
            raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
        else:
            raise ValueError(
                f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
            )

1417 

1418 def _copy(self, src: HttpResourcePath) -> None: 

1419 """Send a COPY webDAV request to replace the contents of this resource 

1420 (if any) with the contents of another resource located in the same 

1421 server. 

1422 

1423 Parameters 

1424 ---------- 

1425 src : `HttpResourcePath` 

1426 The source of the contents to copy to `self`. 

1427 """ 

1428 # Neither dCache nor XrootD currently implement the COPY 

1429 # webDAV method as documented in 

1430 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY 

1431 # (See issues DM-37603 and DM-37651 for details) 

1432 # 

1433 # For the time being, we use a temporary local file to 

1434 # perform the copy client side. 

1435 # TODO: when those 2 issues above are solved remove the 3 lines below. 

1436 must_use_local = True 

1437 if must_use_local: 

1438 return self._copy_via_local(src) 

1439 

1440 return self._copy_or_move("COPY", src) 

1441 

    def _move(self, src: HttpResourcePath) -> None:
        """Send a MOVE webDAV request to replace the contents of this resource
        with the contents of another resource located in the same server.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to move to `self`.
        """
        return self._copy_or_move("MOVE", src)

1452 

    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `BinaryIO` or `bytes`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised when the final PUT request is not answered with a success
            status (200, 201 or 204).

        Notes
        -----
        An initial empty PUT is sent first so that a server which redirects
        uploads (e.g. to a back-end node) can do so before any payload is
        transmitted; a single redirection is followed.
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")

1538 

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        """Yield a handle for accessing this resource.

        When opening for read and the server supports byte ranges
        (``Accept-Ranges: bytes`` on a successful HEAD), a streaming
        `HttpReadResourceHandle` is used so the resource need not be
        downloaded in full; otherwise the base-class implementation is used.
        """
        resp = self._head()
        # Range reads are only usable when HEAD succeeded and the server
        # explicitly advertises byte-range support.
        accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
        handle: ResourceHandleProtocol
        if mode in ("rb", "r") and accepts_range:
            handle = HttpReadResourceHandle(
                mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
            )
            if mode == "r":
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
            else:
                yield handle
        else:
            with super()._openImpl(mode, encoding=encoding) as http_handle:
                yield http_handle

1562 

1563 

def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    if request.method == "PUT":
        log.debug(" body=<data>")
    elif request.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", request.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        log.debug(" body=%r", resp.content[:80])

1597 

1598 

1599def _is_protected(filepath: str) -> bool: 

1600 """Return true if the permissions of file at filepath only allow for access 

1601 by its owner. 

1602 

1603 Parameters 

1604 ---------- 

1605 filepath : `str` 

1606 Path of a local file. 

1607 """ 

1608 if not os.path.isfile(filepath): 

1609 return False 

1610 mode = stat.S_IMODE(os.stat(filepath).st_mode) 

1611 owner_accessible = bool(mode & stat.S_IRWXU) 

1612 group_accessible = bool(mode & stat.S_IRWXG) 

1613 other_accessible = bool(mode & stat.S_IRWXO) 

1614 return owner_accessible and not group_accessible and not other_accessible 

1615 

1616 

def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'response' element found in ``body``.

    Raises
    ------
    ValueError
        Raised when ``body`` contains no 'response' element at all.

    Notes
    -----
    A PROPFIND response body is a 'multistatus' XML document containing one
    'response' element per resource, e.g. (indented for readability)::

        <?xml version="1.0" encoding="UTF-8"?>
        <D:multistatus xmlns:D="DAV:">
          <D:response>
            <D:href>path/to/resource</D:href>
            <D:propstat>
              <D:prop>
                <D:resourcetype>
                  <D:collection xmlns:D="DAV:"/>
                </D:resourcetype>
                <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
                <D:getcontentlength>12345</D:getcontentlength>
              </D:prop>
              <D:status>HTTP/1.1 200 OK</D:status>
            </D:propstat>
          </D:response>
          <D:response>
            ...
          </D:response>
        </D:multistatus>
    """
    # Scan all the 'response' elements and extract the relevant properties.
    multistatus = eTree.fromstring(body.strip())
    properties = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]

    if not properties:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return properties

1678 

1679 

1680class DavProperty: 

1681 """Helper class to encapsulate select live DAV properties of a single 

1682 resource, as retrieved via a PROPFIND request. 

1683 

1684 Parameters 

1685 ---------- 

1686 response : `eTree.Element` or `None` 

1687 The XML response defining the DAV property. 

1688 """ 

1689 

1690 # Regular expression to compare against the 'status' element of a 

1691 # PROPFIND response's 'propstat' element. 

1692 _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE) 

1693 

    def __init__(self, response: eTree.Element | None):
        # Defaults describe a non-existent resource; they are overwritten by
        # _parse() when a response element is provided.
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        # -1 means "no content length seen"; exists() treats this as a
        # non-existent file.
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

1703 

    def _parse(self, response: eTree.Element) -> None:
        """Populate this object's fields from a single PROPFIND '{DAV:}response'
        XML element.

        Raises `ValueError` when the mandatory 'href' element is missing.
        """
        # Extract 'href'.
        if (element := response.find("./{DAV:}href")) is not None:
            # We need to use "str(element.text)" instead of "element.text" to
            # keep mypy happy.
            self._href = str(element.text).strip()
        else:
            raise ValueError(
                "Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties of interest with status OK.
            status = propstat.find("./{DAV:}status")
            if status is None or not self._status_ok_rex.match(str(status.text)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # Parse "collection": presence of the element alone marks a
                # directory.
                if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None:
                    self._collection = True

                # Parse "getlastmodified".
                if (element := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = str(element.text)

                # Parse "getcontentlength".
                if (element := prop.find("./{DAV:}getcontentlength")) is not None:
                    self._getcontentlength = int(str(element.text))

                # Parse "displayname".
                if (element := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = str(element.text)

        # Some webDAV servers don't include the 'displayname' property in the
        # response so try to infer it from the value of the 'href' property.
        # Depending on the server the href value may end with '/'.
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Force a size of 0 for collections.
        if self._collection:
            self._getcontentlength = 0

1748 

1749 @property 

1750 def exists(self) -> bool: 

1751 # It is either a directory or a file with length of at least zero 

1752 return self._collection or self._getcontentlength >= 0 

1753 

1754 @property 

1755 def is_directory(self) -> bool: 

1756 return self._collection 

1757 

1758 @property 

1759 def is_file(self) -> bool: 

1760 return not self._collection 

1761 

1762 @property 

1763 def size(self) -> int: 

1764 return self._getcontentlength 

1765 

1766 @property 

1767 def name(self) -> str: 

1768 return self._displayname 

1769 

1770 @property 

1771 def href(self) -> str: 

1772 return self._href