Coverage for python/lsst/resources/http.py: 23%

592 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-01 11:14 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import math 

21import os 

22import os.path 

23import random 

24import re 

25import stat 

26import tempfile 

27from collections.abc import Iterator 

28from typing import TYPE_CHECKING, BinaryIO, cast 

29 

30try: 

31 # Prefer 'defusedxml' (not part of standard library) if available, since 

32 # 'xml' is vulnerable to XML bombs. 

33 import defusedxml.ElementTree as eTree 

34except ImportError: 

35 import xml.etree.ElementTree as eTree 

36 

37from urllib.parse import parse_qs 

38 

39import requests 

40from astropy import units as u 

41from lsst.utils.timer import time_this 

42from requests.adapters import HTTPAdapter 

43from requests.auth import AuthBase 

44from urllib3.util.retry import Retry 

45 

46from ._resourceHandles import ResourceHandleProtocol 

47from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle, parse_content_range_header 

48from ._resourcePath import ResourcePath 

49 

50if TYPE_CHECKING: 

51 from .utils import TransactionProtocol 

52 

53log = logging.getLogger(__name__) 

54 

55 

56def _timeout_from_environment(env_var: str, default_value: float) -> float: 

57 """Convert and return a timeout from the value of an environment variable 

58 or a default value if the environment variable is not initialized. The 

59 value of `env_var` must be a valid `float` otherwise this function raises. 

60 

61 Parameters 

62 ---------- 

63 env_var : `str` 

64 Environment variable to look for. 

65 default_value : `float`` 

66 Value to return if `env_var` is not defined in the environment. 

67 

68 Returns 

69 ------- 

70 _timeout_from_environment : `float` 

71 Converted value. 

72 """ 

73 try: 

74 timeout = float(os.environ.get(env_var, default_value)) 

75 except ValueError: 

76 raise ValueError( 

77 f"Expecting valid timeout value in environment variable {env_var} but found " 

78 f"{os.environ.get(env_var)}" 

79 ) from None 

80 

81 if math.isnan(timeout): 

82 raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}") 

83 

84 return timeout 

85 

86 

87class HttpResourcePathConfig: 

88 """Configuration class to encapsulate the configurable items used by class 

89 HttpResourcePath. 

90 """ 

91 

92 # Default timeouts for all HTTP requests (seconds). 

93 DEFAULT_TIMEOUT_CONNECT = 30.0 

94 DEFAULT_TIMEOUT_READ = 1_500.0 

95 

96 # Default lower and upper bounds for the backoff interval (seconds). 

97 # A value in this interval is randomly selected as the backoff factor when 

98 # requests need to be retried. 

99 DEFAULT_BACKOFF_MIN = 1.0 

100 DEFAULT_BACKOFF_MAX = 3.0 

101 

102 # Default number of connections to persist with both the front end and 

103 # back end servers. 

104 DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2 

105 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1 

106 

107 # Accepted digest algorithms 

108 ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512") 

109 

110 _front_end_connections: int | None = None 

111 _back_end_connections: int | None = None 

112 _digest_algorithm: str | None = None 

113 _send_expect_on_put: bool | None = None 

114 _timeout: tuple[float, float] | None = None 

115 _collect_memory_usage: bool | None = None 

116 _backoff_min: float | None = None 

117 _backoff_max: float | None = None 

118 

119 @property 

120 def front_end_connections(self) -> int: 

121 """Number of persistent connections to the front end server.""" 

122 if self._front_end_connections is not None: 122 ↛ 123line 122 didn't jump to line 123, because the condition on line 122 was never true

123 return self._front_end_connections 

124 

125 try: 

126 self._front_end_connections = int( 

127 os.environ.get( 

128 "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS 

129 ) 

130 ) 

131 except ValueError: 

132 self._front_end_connections = self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS 

133 

134 return self._front_end_connections 

135 

136 @property 

137 def back_end_connections(self) -> int: 

138 """Number of persistent connections to the back end servers.""" 

139 if self._back_end_connections is not None: 139 ↛ 140line 139 didn't jump to line 140, because the condition on line 139 was never true

140 return self._back_end_connections 

141 

142 try: 

143 self._back_end_connections = int( 

144 os.environ.get( 

145 "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS 

146 ) 

147 ) 

148 except ValueError: 

149 self._back_end_connections = self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS 

150 

151 return self._back_end_connections 

152 

153 @property 

154 def digest_algorithm(self) -> str: 

155 """Algorithm to ask the server to use for computing and recording 

156 digests of each file contents in PUT requests. 

157 

158 Returns 

159 ------- 

160 digest_algorithm: `str` 

161 The name of a digest algorithm or the empty string if no algotihm 

162 is configured. 

163 """ 

164 if self._digest_algorithm is not None: 

165 return self._digest_algorithm 

166 

167 digest = os.environ.get("LSST_HTTP_DIGEST", "").lower() 

168 if digest not in self.ACCEPTED_DIGESTS: 

169 digest = "" 

170 

171 self._digest_algorithm = digest 

172 return self._digest_algorithm 

173 

174 @property 

175 def send_expect_on_put(self) -> bool: 

176 """Return True if a "Expect: 100-continue" header is to be sent to 

177 the server on each PUT request. 

178 

179 Some servers (e.g. dCache) uses this information as an indication that 

180 the client knows how to handle redirects to the specific server that 

181 will actually receive the data for PUT requests. 

182 """ 

183 if self._send_expect_on_put is not None: 

184 return self._send_expect_on_put 

185 

186 self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ 

187 return self._send_expect_on_put 

188 

189 @property 

190 def timeout(self) -> tuple[float, float]: 

191 """Return a tuple with the values of timeouts for connecting to the 

192 server and reading its response, respectively. Both values are in 

193 seconds. 

194 """ 

195 if self._timeout is not None: 

196 return self._timeout 

197 

198 self._timeout = ( 

199 _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT), 

200 _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ), 

201 ) 

202 return self._timeout 

203 

204 @property 

205 def collect_memory_usage(self) -> bool: 

206 """Return true if we want to collect memory usage when timing 

207 operations against the remote server via the `lsst.utils.time_this` 

208 context manager. 

209 """ 

210 if self._collect_memory_usage is not None: 

211 return self._collect_memory_usage 

212 

213 self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ 

214 return self._collect_memory_usage 

215 

216 @property 

217 def backoff_min(self) -> float: 

218 """Lower bound of the interval from which a backoff factor is randomly 

219 selected when retrying requests (seconds). 

220 """ 

221 if self._backoff_min is not None: 

222 return self._backoff_min 

223 

224 self._backoff_min = self.DEFAULT_BACKOFF_MIN 

225 try: 

226 backoff_min = float(os.environ.get("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN)) 

227 if not math.isnan(backoff_min): 227 ↛ 232line 227 didn't jump to line 232, because the condition on line 227 was never false

228 self._backoff_min = backoff_min 

229 except ValueError: 

230 pass 

231 

232 return self._backoff_min 

233 

234 @property 

235 def backoff_max(self) -> float: 

236 """Upper bound of the interval from which a backoff factor is randomly 

237 selected when retrying requests (seconds). 

238 """ 

239 if self._backoff_max is not None: 

240 return self._backoff_max 

241 

242 self._backoff_max = self.DEFAULT_BACKOFF_MAX 

243 try: 

244 backoff_max = float(os.environ.get("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX)) 

245 if not math.isnan(backoff_max): 245 ↛ 250line 245 didn't jump to line 250, because the condition on line 245 was never false

246 self._backoff_max = backoff_max 

247 except ValueError: 

248 pass 

249 

250 return self._backoff_max 

251 

252 

@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        If the server certificate cannot be verified. A hint about
        LSST_HTTP_CACERT_BUNDLE is logged before re-raising.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Create a session for configuring retries
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(
            [
                "OPTIONS",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )
            if resp.status_code not in (requests.codes.ok, requests.codes.created):
                return False

            # Check that "1" is part of the value of the "DAV" header. We
            # don't use locks, so a server complying to class 1 is enough for
            # our purposes. All webDAV servers must advertise at least
            # compliance class "1".
            #
            # Compliance classes are documented in
            # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
            #
            # Examples of values for header DAV are:
            #    DAV: 1, 2
            #    DAV: 1, <http://apache.org/dav/propset/fs/1>
            if "DAV" not in resp.headers:
                return False

            # Convert to str to keep mypy happy.
            compliance_class = str(resp.headers.get("DAV"))
            return "1" in compliance_class.replace(" ", "").split(",")

    except requests.exceptions.SSLError:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        # Bare raise preserves the original traceback.
        raise

351 

352 

353# Tuple (path, block_size) pointing to the location of a local directory 

354# to save temporary files and the block size of the underlying file system. 

355_TMPDIR: tuple[str, int] | None = None 

356 

357 

358def _get_temp_dir() -> tuple[str, int]: 

359 """Return the temporary directory path and block size. 

360 

361 This function caches its results in _TMPDIR. 

362 """ 

363 global _TMPDIR 

364 if _TMPDIR: 

365 return _TMPDIR 

366 

367 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or 

368 # 'TMPDIR', if defined. Otherwise use current working directory. 

369 tmpdir = os.getcwd() 

370 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")): 

371 if dir and os.path.isdir(dir): 

372 tmpdir = dir 

373 break 

374 

375 # Compute the block size as 256 blocks of typical size 

376 # (i.e. 4096 bytes) or 10 times the file system block size, 

377 # whichever is higher. This is a reasonable compromise between 

378 # using memory for buffering and the number of system calls 

379 # issued to read from or write to temporary files. 

380 fsstats = os.statvfs(tmpdir) 

381 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))) 

382 

383 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        # When the argument names an existing local file, treat it as a
        # token file: enforce owner-only permissions and load its contents.
        if os.path.isfile(token):
            token_file = os.path.abspath(token)
            self._path = token_file
            if not _is_protected(token_file):
                raise PermissionError(
                    f"Bearer token file at {token_file} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Re-read the token file (if any) when its modification time is more
        recent than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # File unchanged since last read: keep the cached token.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # A bearer token must never travel over insecure plain HTTP.
        url = req.url
        if self._token and url and url.lower().startswith("https://"):
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

428 

429 

class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep: there is one pool per remote
        host.
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Minimum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    backoff_max : `float`, optional
        Maximum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the exponential
        # backoff factor when retrying requests (seconds).
        # If the caller passed an inverted or empty interval, force the upper
        # bound one second above the lower bound so the interval is valid.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Returns
        -------
        get : `requests.Session`
            The session cached for the root URI of `rpath`; it is created on
            first use and reused for subsequent calls with the same root URI.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment.

        Parameters
        ----------
        rpath : `ResourcePath`
            Resource the session is intended for. Its root URI and scheme
            select which adapters are mounted on the session, and the scheme
            decides whether authentication is configured at all.

        Returns
        -------
        _make_session : `requests.Session`
            Newly created and configured session.
        """
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        # Note: tokens take precedence over client certificates.
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

656 

657 

658class HttpResourcePath(ResourcePath): 

659 """General HTTP(S) resource. 

660 

661 Notes 

662 ----- 

663 In order to configure the behavior of instances of this class, the 

664 environment variables below are inspected: 

665 

666 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a 

667 "Expect: 100-Continue" header will be added to all HTTP PUT requests. 

668 This header is required by some servers to detect if the client 

669 knows how to handle redirections. In case of redirection, the body 

670 of the PUT request is sent to the redirected location and not to 

671 the front end server. 

672 

673 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a 

674 numeric value, they are interpreted as the number of seconds to wait 

675 for establishing a connection with the server and for reading its 

676 response, respectively. 

677 

678 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and 

679 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number 

680 of connections to attempt to persist with both the front end servers 

681 and the back end servers. 

682 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and 

683 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS. 

684 

685 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to 

686 ask the server to compute for every file's content sent to the server 

687 via a PUT request. No digest is requested if this variable is not set 

688 or is set to an invalid value. 

689 Valid values are those in ACCEPTED_DIGESTS. 

690 """ 

691 

    # Cached result of the WebDAV-capability probe for this path's endpoint
    # (see the is_webdav_endpoint property). None means "not determined yet".
    _is_webdav: bool | None = None

    # Configuration items for this class instances.
    _config = HttpResourcePathConfig()

    # The session for metadata requests is used for interacting with
    # the front end servers for requests such as PROPFIND, HEAD, etc. Those
    # interactions are typically served by the front end servers. We want to
    # keep the connection to the front end servers open, to reduce the cost
    # associated to TCP and TLS handshaking for each new request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # The data session is used for interaction with the front end servers which
    # typically redirect to the back end servers for serving our PUT and GET
    # requests. We attempt to keep a single connection open with the front end
    # server, if possible. This depends on how the server behaves and the
    # kind of request. Some servers close the connection when redirecting
    # the client to a back end server, for instance when serving a PUT
    # request.
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. We need to store this
    # to replace sessions created by a parent process and inherited by a
    # child process after a fork, to avoid confusing the SSL layer.
    _pid: int = -1

727 

728 @property 

729 def metadata_session(self) -> requests.Session: 

730 """Client session to send requests which do not require upload or 

731 download of data, i.e. mostly metadata requests. 

732 """ 

733 if hasattr(self, "_metadata_session"): 

734 if HttpResourcePath._pid == os.getpid(): 

735 return self._metadata_session 

736 else: 

737 # The metadata session we have in cache was likely created by 

738 # a parent process. Discard all the sessions in that store. 

739 self._metadata_session_store.clear() 

740 

741 # Retrieve a new metadata session. 

742 HttpResourcePath._pid = os.getpid() 

743 self._metadata_session: requests.Session = self._metadata_session_store.get(self) 

744 return self._metadata_session 

745 

746 @property 

747 def data_session(self) -> requests.Session: 

748 """Client session for uploading and downloading data.""" 

749 if hasattr(self, "_data_session"): 

750 if HttpResourcePath._pid == os.getpid(): 

751 return self._data_session 

752 else: 

753 # The data session we have in cache was likely created by 

754 # a parent process. Discard all the sessions in that store. 

755 self._data_session_store.clear() 

756 

757 # Retrieve a new data session. 

758 HttpResourcePath._pid = os.getpid() 

759 self._data_session: requests.Session = self._data_session_store.get(self) 

760 return self._data_session 

761 

762 def _clear_sessions(self) -> None: 

763 """Close the socket connections that are still open. 

764 

765 Used only in test suites to avoid warnings. 

766 """ 

767 self._metadata_session_store.clear() 

768 self._data_session_store.clear() 

769 

770 if hasattr(self, "_metadata_session"): 

771 delattr(self, "_metadata_session") 

772 

773 if hasattr(self, "_data_session"): 

774 delattr(self, "_data_session") 

775 

776 @property 

777 def is_webdav_endpoint(self) -> bool: 

778 """Check if the current endpoint implements WebDAV features. 

779 

780 This is stored per URI but cached by root so there is 

781 only one check per hostname. 

782 """ 

783 if self._is_webdav is not None: 

784 return self._is_webdav 

785 

786 self._is_webdav = _is_webdav_endpoint(self.root_uri()) 

787 return self._is_webdav 

788 

789 def exists(self) -> bool: 

790 """Check that a remote HTTP resource exists.""" 

791 log.debug("Checking if resource exists: %s", self.geturl()) 

792 if not self.is_webdav_endpoint: 

793 # The remote is a plain HTTP server. Let's attempt a HEAD 

794 # request, even if the behavior for such a request against a 

795 # directory is not specified, so it depends on the server 

796 # implementation. 

797 resp = self._head_non_webdav_url() 

798 return self._is_successful_non_webdav_head_request(resp) 

799 

800 # The remote endpoint is a webDAV server: send a PROPFIND request 

801 # to determine if it exists. 

802 resp = self._propfind() 

803 if resp.status_code == requests.codes.multi_status: # 207 

804 prop = _parse_propfind_response_body(resp.text)[0] 

805 return prop.exists 

806 else: # 404 Not Found 

807 return False 

808 

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Size of the resource in bytes. Directory-like paths are reported
            with size 0.

        Raises
        ------
        FileNotFoundError
            If the remote resource does not exist.
        IsADirectoryError
            If the server reports the resource is a directory although this
            path is file-like.
        ValueError
            If the server response does not carry the headers required to
            determine the size, or has an unexpected status code.
        """
        if self.dirLike:
            return 0

        if not self.is_webdav_endpoint:
            # The remote is a plain HTTP server. Send a HEAD request to
            # retrieve the size of the resource.
            resp = self._head_non_webdav_url()
            if resp.status_code == requests.codes.ok:  # 200
                if "Content-Length" in resp.headers:
                    return int(resp.headers["Content-Length"])
                else:
                    raise ValueError(
                        f"Response to HEAD request to {self} does not contain 'Content-Length' header"
                    )
            elif resp.status_code == requests.codes.partial_content:
                # 206 Partial Content, returned from a GET request with a Range
                # header (used to emulate HEAD for presigned S3 URLs).
                # In this case Content-Length is the length of the Range and
                # not the full length of the file, so we have to parse
                # Content-Range instead.
                content_range_header = resp.headers.get("Content-Range")
                if content_range_header is None:
                    raise ValueError(
                        f"Response to GET request to {self} did not contain 'Content-Range' header"
                    )
                content_range = parse_content_range_header(content_range_header)
                size = content_range.total
                if size is None:
                    raise ValueError(f"Content-Range header for {self} did not include a total file size")
                return size
            elif resp.status_code == requests.codes.range_not_satisfiable:
                # 416 Range Not Satisfiable, which can occur on a GET for a 0
                # byte file since we asked for 1 byte Range which is longer
                # than the file.
                #
                # Servers are supposed to include a Content-Range header in
                # this case, but Google's S3 implementation doesn't. Any
                # non-zero file size should have been handled by the 206 and
                # 200 cases above, so assume we have a zero here.
                return 0
            elif resp.status_code == requests.codes.not_found:
                raise FileNotFoundError(
                    f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
                )
            else:
                raise ValueError(
                    f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
                    f"{resp.reason}"
                )

        # The remote is a webDAV server: send a PROPFIND request to retrieve
        # the size of the resource. Sizes are only meaningful for files.
        resp = self._propfind()
        if resp.status_code == requests.codes.multi_status:  # 207
            prop = _parse_propfind_response_body(resp.text)[0]
            if prop.is_file:
                return prop.size
            elif prop.is_directory:
                raise IsADirectoryError(
                    f"Resource {self} is reported by server as a directory but has a file path"
                )
            else:
                raise FileNotFoundError(f"Resource {self} does not exist")
        else:  # 404 Not Found
            raise FileNotFoundError(
                f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
            )

878 

879 def _head_non_webdav_url(self) -> requests.Response: 

880 """Return a response from a HTTP HEAD request for a non-WebDAV HTTP 

881 URL. 

882 

883 Emulates HEAD using a 1-byte GET for presigned S3 URLs. 

884 """ 

885 if self._looks_like_presigned_s3_url(): 

886 # Presigned S3 URLs are signed for a single method only, so you 

887 # can't call HEAD on a URL signed for GET. However, S3 does 

888 # support Range requests, so you can ask for a 1-byte range with 

889 # GET for a similar effect to HEAD. 

890 # 

891 # Note that some headers differ between a true HEAD request and the 

892 # response returned by this GET, e.g. Content-Length will always be 

893 # 1, and the status code is 206 instead of 200. 

894 return self.metadata_session.get( 

895 self.geturl(), 

896 timeout=self._config.timeout, 

897 allow_redirects=True, 

898 stream=False, 

899 headers={"Range": "bytes=0-0"}, 

900 ) 

901 else: 

902 return self.metadata_session.head( 

903 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

904 ) 

905 

906 def _is_successful_non_webdav_head_request(self, resp: requests.Response) -> bool: 

907 """Return `True` if the status code in the response indicates a 

908 successful response to ``_head_non_webdav_url``. 

909 """ 

910 return resp.status_code in ( 

911 requests.codes.ok, # 200, from a normal HEAD or GET request 

912 requests.codes.partial_content, # 206, returned from a GET request with a Range header. 

913 # 416, returned from a GET request with a 1-byte Range header that 

914 # is longer than the 0-byte file. 

915 requests.codes.range_not_satisfiable, 

916 ) 

917 

918 def _looks_like_presigned_s3_url(self) -> bool: 

919 """Return `True` if this ResourcePath's URL is likely to be a presigned 

920 S3 URL. 

921 """ 

922 query_params = parse_qs(self._uri.query) 

923 return "Signature" in query_params and "Expires" in query_params 

924 

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Missing ancestor directories are created as well, analogous to
        ``mkdir -p``.

        Raises
        ------
        NotImplementedError
            Raised if the remote endpoint is not a webDAV server.
        NotADirectoryError
            Raised if this URI is not directory-like, or if a file already
            exists at this path.
        """
        # Creating directories is only available on WebDAV back ends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError(
                f"Creation of directory {self} is not implemented by plain HTTP servers"
            )

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Check if the target directory already exists.
        resp = self._propfind()
        if resp.status_code == requests.codes.multi_status:  # 207
            prop = _parse_propfind_response_body(resp.text)[0]
            if prop.exists:
                if prop.is_directory:
                    return
                else:
                    # A file exists at this path
                    raise NotADirectoryError(
                        f"Can not create a directory for {self} because a file already exists at that path"
                    )

        # Target directory does not exist. Create it and its ancestors as
        # needed. We need to test if parent URL is different from self URL,
        # otherwise we could be stuck in a recursive loop
        # where self == parent.
        if self.geturl() != self.parent().geturl():
            self.parent().mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        self._mkcol()

958 

    def remove(self) -> None:
        """Remove the resource.

        Delegates to a webDAV DELETE request; removing a resource that does
        not exist is not treated as an error (see ``_delete``).
        """
        self._delete()

962 

963 def read(self, size: int = -1) -> bytes: 

964 """Open the resource and return the contents in bytes. 

965 

966 Parameters 

967 ---------- 

968 size : `int`, optional 

969 The number of bytes to read. Negative or omitted indicates 

970 that all data should be read. 

971 """ 

972 # Use the data session as a context manager to ensure that the 

973 # network connections to both the front end and back end servers are 

974 # closed after downloading the data. 

975 log.debug("Reading from remote resource: %s", self.geturl()) 

976 stream = size > 0 

977 with self.data_session as session: 

978 with time_this(log, msg="GET %s", args=(self,)): 

979 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout) 

980 

981 if resp.status_code != requests.codes.ok: # 200 

982 raise FileNotFoundError( 

983 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}" 

984 ) 

985 if not stream: 

986 return resp.content 

987 else: 

988 return next(resp.iter_content(chunk_size=size)) 

989 

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if ``overwrite`` is `False` and the resource already
            exists.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")

        # Ensure the parent directory exists.
        # This is only meaningful and appropriate for WebDAV, not the general
        # HTTP case. e.g. for S3 HTTP URLs, the underlying service has no
        # concept of 'directories' at all.
        if self.is_webdav_endpoint:
            self.parent().mkdir()

        # Upload the data.
        log.debug("Writing data to remote resource: %s", self.geturl())
        self._put(data=data)

1016 

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Whether overwriting the remote resource is allowed or not.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the requested transfer mode is not supported.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()

1083 

    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not directory-like.
        NotImplementedError
            Raised if the remote endpoint is not a webDAV server.
        """
        if not self.dirLike:
            raise ValueError("Can not walk a non-directory URI")

        # Walking directories is only available on WebDAV back ends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        # Ask the server about this directory and its immediate children.
        resp = self._propfind(depth="1")
        if resp.status_code == requests.codes.multi_status:  # 207
            files: list[str] = []
            dirs: list[str] = []

            for prop in _parse_propfind_response_body(resp.text):
                if prop.is_file:
                    files.append(prop.name)
                elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
                    # Only include the names of sub-directories not the name of
                    # the directory being walked.
                    dirs.append(prop.name)

            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]

            if not dirs and not files:
                return
            else:
                yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

            # Recurse into each sub-directory, depth first.
            for dir in dirs:
                new_uri = self.join(dir, forceDirectory=True)
                yield from new_uri.walk(file_filter)

1137 

    def _as_local(self) -> tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the resource could not be downloaded.
        ValueError
            Raised if the downloaded size does not match the
            'Content-Length' header announced by the server.
        """
        # Use the session as a context manager to ensure that connections
        # to both the front end and back end servers are closed after the
        # download operation is finished.
        with self.data_session as session:
            resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
            if resp.status_code != requests.codes.ok:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
                )

            tmpdir, buffering = _get_temp_dir()
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                # A missing Content-Length header yields -1, which disables
                # the size check below.
                expected_length = int(resp.headers.get("Content-Length", "-1"))
                with time_this(
                    log,
                    msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                    args=(self, expected_length, tmpFile.name, buffering),
                    mem_usage=self._config.collect_memory_usage,
                    mem_unit=u.mebibyte,
                ):
                    content_length = 0
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
                        content_length += len(chunk)

            # Check that the expected and actual content lengths match. Perform
            # this check only when the contents of the file was not encoded by
            # the server.
            if (
                "Content-Encoding" not in resp.headers
                and expected_length >= 0
                and expected_length != content_length
            ):
                raise ValueError(
                    f"Size of downloaded file does not match value in Content-Length header for {self}: "
                    f"expecting {expected_length} and got {content_length} bytes"
                )

            return tmpFile.name, True

1189 

    def _send_webdav_request(
        self,
        method: str,
        url: str | None = None,
        headers: dict[str, str] | None = None,
        body: str | None = None,
        session: requests.Session | None = None,
        timeout: tuple[float, float] | None = None,
    ) -> requests.Response:
        """Send a webDAV request and correctly handle redirects.

        Parameters
        ----------
        method : `str`
            The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
        url : `str`, optional
            Target URL of the request. Defaults to this resource's URL.
        headers : `dict`, optional
            A dictionary of key-value pairs (both strings) to include as
            headers in the request.
        body : `str`, optional
            The body of the request.
        session : `requests.Session`, optional
            Session to send the request with. Defaults to the metadata
            session.
        timeout : `tuple` of `float`, optional
            (connect, read) timeout for the request. Defaults to the
            configured timeout.

        Returns
        -------
        response : `requests.Response`
            The first non-redirect response received.

        Raises
        ------
        ValueError
            Raised if the maximum number of redirections is exceeded.

        Notes
        -----
        This way of sending webDAV requests is necessary for handling
        redirection ourselves, since the 'requests' package changes the method
        of the redirected request when the server responds with status 302 and
        the method of the original request is not HEAD (which is the case for
        webDAV requests).

        That means that when the webDAV server we interact with responds with
        a redirection to a PROPFIND or MKCOL request, the request gets
        converted to a GET request when sent to the redirected location.

        See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

        This behavior of the 'requests' package is meant to be compatible with
        what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

        For our purposes, we do need to follow the redirection and send a new
        request using the same HTTP verb.
        """
        if url is None:
            url = self.geturl()

        if headers is None:
            headers = {}

        if session is None:
            session = self.metadata_session

        if timeout is None:
            timeout = self._config.timeout

        with time_this(
            log,
            msg="%s %s",
            args=(
                method,
                url,
            ),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            # Follow redirects manually, resending the same verb each time.
            for _ in range(max_redirects := 5):
                resp = session.request(
                    method,
                    url,
                    data=body,
                    headers=headers,
                    stream=False,
                    timeout=timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]
                else:
                    return resp

            # We reached the maximum allowed number of redirects.
            # Stop trying.
            raise ValueError(
                f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
            )

1276 

1277 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response: 

1278 """Send a PROPFIND webDAV request and return the response. 

1279 

1280 Parameters 

1281 ---------- 

1282 body : `str`, optional 

1283 The body of the PROPFIND request to send to the server. If 

1284 provided, it is expected to be a XML document. 

1285 depth : `str`, optional 

1286 The value of the 'Depth' header to include in the request. 

1287 

1288 Returns 

1289 ------- 

1290 response : `requests.Response` 

1291 Response to the PROPFIND request. 

1292 

1293 Notes 

1294 ----- 

1295 It raises `ValueError` if the status code of the PROPFIND request 

1296 is different from "207 Multistatus" or "404 Not Found". 

1297 """ 

1298 if body is None: 

1299 # Request only the DAV live properties we are explicitly interested 

1300 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified' 

1301 # and 'displayname'. 

1302 body = ( 

1303 """<?xml version="1.0" encoding="utf-8" ?>""" 

1304 """<D:propfind xmlns:D="DAV:"><D:prop>""" 

1305 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>""" 

1306 """</D:prop></D:propfind>""" 

1307 ) 

1308 headers = { 

1309 "Depth": depth, 

1310 "Content-Type": 'application/xml; charset="utf-8"', 

1311 "Content-Length": str(len(body)), 

1312 } 

1313 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body) 

1314 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found): 

1315 return resp 

1316 else: 

1317 raise ValueError( 

1318 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} " 

1319 f"{resp.reason}" 

1320 ) 

1321 

1322 def _options(self) -> requests.Response: 

1323 """Send a OPTIONS webDAV request for this resource.""" 

1324 resp = self._send_webdav_request("OPTIONS") 

1325 if resp.status_code in (requests.codes.ok, requests.codes.created): 

1326 return resp 

1327 

1328 raise ValueError( 

1329 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}" 

1330 ) 

1331 

    def _head(self) -> requests.Response:
        """Send a HEAD webDAV request for this resource.

        Returns the raw response; no status-code checking is performed here.
        """
        return self._send_webdav_request("HEAD")

1335 

1336 def _mkcol(self) -> None: 

1337 """Send a MKCOL webDAV request to create a collection. The collection 

1338 may already exist. 

1339 """ 

1340 resp = self._send_webdav_request("MKCOL") 

1341 if resp.status_code == requests.codes.created: # 201 

1342 return 

1343 

1344 if resp.status_code == requests.codes.method_not_allowed: # 405 

1345 # The remote directory already exists 

1346 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl()) 

1347 else: 

1348 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}") 

1349 

    def _delete(self) -> None:
        """Send a DELETE webDAV request for this resource.

        Deleting a resource that does not exist is not treated as an error.

        Raises
        ------
        NotImplementedError
            Raised when attempting to delete a directory on a plain HTTP
            server.
        ValueError
            Raised if the server response indicates the deletion failed.
        """
        log.debug("Deleting %s ...", self.geturl())

        # If this is a directory, ensure the remote is a webDAV server because
        # plain HTTP servers don't support DELETE requests on non-file
        # paths.
        if self.dirLike and not self.is_webdav_endpoint:
            raise NotImplementedError(
                f"Deletion of directory {self} is not implemented by plain HTTP servers"
            )

        # Deleting non-empty directories may take some time, so increase
        # the timeout for getting a response from the server.
        timeout = self._config.timeout
        if self.dirLike:
            # Scale only the read timeout; the connect timeout is unchanged.
            timeout = (timeout[0], timeout[1] * 100)
        resp = self._send_webdav_request("DELETE", timeout=timeout)
        if resp.status_code in (
            requests.codes.ok,
            requests.codes.accepted,
            requests.codes.no_content,
            requests.codes.not_found,
        ):
            # We can get a "404 Not Found" error when the file or directory
            # does not exist or when the DELETE request was retried several
            # times and a previous attempt actually deleted the resource.
            # Therefore we consider that a "Not Found" response is not an
            # error since we reached the state desired by the user.
            return
        else:
            # TODO: the response to a DELETE request against a webDAV server
            # may be multistatus. If so, we need to parse the reponse body to
            # determine more precisely the reason of the failure (e.g. a lock)
            # and provide a more helpful error message.
            raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")

1386 

    def _copy_via_local(self, src: ResourcePath) -> None:
        """Replace the contents of this resource with the contents of a remote
        resource by using a local temporary file.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to copy to `self`.
        """
        # Download the source to a local temporary file, then upload that
        # file's contents to this resource.
        with src.as_local() as local_uri:
            log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
            with open(local_uri.ospath, "rb") as f:
                self._put(data=f)

1400 

    def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
        """Send a COPY or MOVE webDAV request to copy or replace the contents
        of this resource with the contents of another resource located in the
        same server.

        Parameters
        ----------
        method : `str`
            The method to perform. Valid values are "COPY" or "MOVE" (in
            uppercase).
        src : `HttpResourcePath`
            The source of the contents to move to `self`.

        Raises
        ------
        ValueError
            Raised if the server reports a failure, including via a
            multistatus response.
        """
        # The request is sent to the *source* URL; this resource's URL is
        # given in the 'Destination' header, per the webDAV specification.
        headers = {"Destination": self.geturl()}
        resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
        if resp.status_code in (requests.codes.created, requests.codes.no_content):
            return

        if resp.status_code == requests.codes.multi_status:
            # A multistatus response embeds per-resource status details in an
            # XML body; surface the first response's status and error element.
            tree = eTree.fromstring(resp.content)
            status_element = tree.find("./{DAV:}response/{DAV:}status")
            status = status_element.text if status_element is not None else "unknown"
            error = tree.find("./{DAV:}response/{DAV:}error")
            raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
        else:
            raise ValueError(
                f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
            )

1429 

    def _copy(self, src: HttpResourcePath) -> None:
        """Send a COPY webDAV request to replace the contents of this resource
        (if any) with the contents of another resource located in the same
        server.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to copy to `self`.
        """
        # Neither dCache nor XrootD currently implement the COPY
        # webDAV method as documented in
        # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
        # (See issues DM-37603 and DM-37651 for details)
        #
        # For the time being, we use a temporary local file to
        # perform the copy client side.
        # TODO: when those 2 issues above are solved remove the 3 lines below.
        must_use_local = True
        if must_use_local:
            return self._copy_via_local(src)

        # Currently unreachable; kept so the server-side path is trivial to
        # re-enable once the back ends support COPY.
        return self._copy_or_move("COPY", src)

1453 

    def _move(self, src: HttpResourcePath) -> None:
        """Send a MOVE webDAV request to replace the contents of this resource
        with the contents of another resource located in the same server.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to move to `self`.
        """
        return self._copy_or_move("MOVE", src)

1464 

    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the final PUT request does not succeed.
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithhms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")

1550 

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        """Open the resource and yield a file-like handle for it.

        When the resource is opened read-only and the server advertises
        support for byte ranges ('Accept-Ranges: bytes'), a remote
        random-access handle is used; otherwise the base-class implementation
        is used.
        """
        resp = self._head()
        # Random access over HTTP requires the server to accept Range
        # requests.
        accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
        handle: ResourceHandleProtocol
        if mode in ("rb", "r") and accepts_range:
            handle = HttpReadResourceHandle(
                mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
            )
            if mode == "r":
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
            else:
                yield handle
        else:
            with super()._openImpl(mode, encoding=encoding) as http_handle:
                yield http_handle

1574 

1575 

def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    req = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", req.method)
    log.debug(" URL=%s", req.url)
    log.debug(" headers=%s", req.headers)
    if req.method == "PUT":
        # Request bodies for PUT can be large binary payloads; do not log
        # them.
        log.debug(" body=<data>")
    elif req.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", req.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    body = resp.content
    if not body:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", body)
    else:
        # Truncate non-text bodies.
        log.debug(" body=%r", body[:80])

1609 

1610 

1611def _is_protected(filepath: str) -> bool: 

1612 """Return true if the permissions of file at filepath only allow for access 

1613 by its owner. 

1614 

1615 Parameters 

1616 ---------- 

1617 filepath : `str` 

1618 Path of a local file. 

1619 """ 

1620 if not os.path.isfile(filepath): 

1621 return False 

1622 mode = stat.S_IMODE(os.stat(filepath).st_mode) 

1623 owner_accessible = bool(mode & stat.S_IRWXU) 

1624 group_accessible = bool(mode & stat.S_IRWXG) 

1625 other_accessible = bool(mode & stat.S_IRWXO) 

1626 return owner_accessible and not group_accessible and not other_accessible 

1627 

1628 

def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'DAV:response' element found in the body.

    Raises
    ------
    ValueError
        Raised if no 'DAV:response' element could be extracted from ``body``,
        since at least one response is expected.

    Notes
    -----
    A PROPFIND response body is a 'DAV:multistatus' XML document. It contains
    one 'DAV:response' element per resource, each of which holds a 'DAV:href'
    element and one or more 'DAV:propstat' elements pairing a 'DAV:prop'
    property set (e.g. 'resourcetype', 'getlastmodified', 'getcontentlength')
    with a 'DAV:status' such as "HTTP/1.1 200 OK". For instance (indented
    for readability):

        <?xml version="1.0" encoding="UTF-8"?>
        <D:multistatus xmlns:D="DAV:">
          <D:response>
            <D:href>path/to/resource</D:href>
            <D:propstat>
              <D:prop>
                <D:resourcetype><D:collection xmlns:D="DAV:"/></D:resourcetype>
                <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
                <D:getcontentlength>12345</D:getcontentlength>
              </D:prop>
              <D:status>HTTP/1.1 200 OK</D:status>
            </D:propstat>
          </D:response>
          ...
        </D:multistatus>
    """
    # Scan all the 'response' elements and extract the relevant properties.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]

    if not responses:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return responses

1690 

1691 

1692class DavProperty: 

1693 """Helper class to encapsulate select live DAV properties of a single 

1694 resource, as retrieved via a PROPFIND request. 

1695 

1696 Parameters 

1697 ---------- 

1698 response : `eTree.Element` or `None` 

1699 The XML response defining the DAV property. 

1700 """ 

1701 

1702 # Regular expression to compare against the 'status' element of a 

1703 # PROPFIND response's 'propstat' element. 

1704 _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE) 

1705 

    def __init__(self, response: eTree.Element | None):
        # Defaults used when a property is absent from the response.
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        # -1 signals "unknown size".
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

1715 

1716 def _parse(self, response: eTree.Element) -> None: 

1717 # Extract 'href'. 

1718 if (element := response.find("./{DAV:}href")) is not None: 

1719 # We need to use "str(element.text)"" instead of "element.text" to 

1720 # keep mypy happy. 

1721 self._href = str(element.text).strip() 

1722 else: 

1723 raise ValueError( 

1724 "Property 'href' expected but not found in PROPFIND response: " 

1725 f"{eTree.tostring(response, encoding='unicode')}" 

1726 ) 

1727 

1728 for propstat in response.findall("./{DAV:}propstat"): 

1729 # Only extract properties of interest with status OK. 

1730 status = propstat.find("./{DAV:}status") 

1731 if status is None or not self._status_ok_rex.match(str(status.text)): 

1732 continue 

1733 

1734 for prop in propstat.findall("./{DAV:}prop"): 

1735 # Parse "collection". 

1736 if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None: 

1737 self._collection = True 

1738 

1739 # Parse "getlastmodified". 

1740 if (element := prop.find("./{DAV:}getlastmodified")) is not None: 

1741 self._getlastmodified = str(element.text) 

1742 

1743 # Parse "getcontentlength". 

1744 if (element := prop.find("./{DAV:}getcontentlength")) is not None: 

1745 self._getcontentlength = int(str(element.text)) 

1746 

1747 # Parse "displayname". 

1748 if (element := prop.find("./{DAV:}displayname")) is not None: 

1749 self._displayname = str(element.text) 

1750 

1751 # Some webDAV servers don't include the 'displayname' property in the 

1752 # response so try to infer it from the value of the 'href' property. 

1753 # Depending on the server the href value may end with '/'. 

1754 if not self._displayname: 

1755 self._displayname = os.path.basename(self._href.rstrip("/")) 

1756 

1757 # Force a size of 0 for collections. 

1758 if self._collection: 

1759 self._getcontentlength = 0 

1760 

1761 @property 

1762 def exists(self) -> bool: 

1763 # It is either a directory or a file with length of at least zero 

1764 return self._collection or self._getcontentlength >= 0 

1765 

1766 @property 

1767 def is_directory(self) -> bool: 

1768 return self._collection 

1769 

1770 @property 

1771 def is_file(self) -> bool: 

1772 return not self._collection 

1773 

1774 @property 

1775 def size(self) -> int: 

1776 return self._getcontentlength 

1777 

1778 @property 

1779 def name(self) -> str: 

1780 return self._displayname 

1781 

1782 @property 

1783 def href(self) -> str: 

1784 return self._href