Coverage for python/lsst/resources/http.py: 20%

562 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-12 10:52 -0700

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import math 

21import os 

22import os.path 

23import random 

24import re 

25import stat 

26import tempfile 

27import xml.etree.ElementTree as eTree 

28from collections.abc import Iterator 

29from typing import TYPE_CHECKING, BinaryIO, cast 

30 

31import requests 

32from astropy import units as u 

33from lsst.utils.timer import time_this 

34from requests.adapters import HTTPAdapter 

35from requests.auth import AuthBase 

36from urllib3.util.retry import Retry 

37 

38from ._resourceHandles import ResourceHandleProtocol 

39from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle 

40from ._resourcePath import ResourcePath 

41 

42if TYPE_CHECKING: 

43 from .utils import TransactionProtocol 

44 

45log = logging.getLogger(__name__) 

46 

47 

class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    Values are read lazily from environment variables the first time each
    property is accessed and are then cached. Unparseable environment values
    silently fall back to the documented defaults.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Cached values, populated on first access of the matching property.
    _front_end_connections: int | None = None
    _back_end_connections: int | None = None
    _digest_algorithm: str | None = None
    _send_expect_on_put: bool | None = None
    _timeout: tuple[float, float] | None = None
    _collect_memory_usage: bool | None = None
    _backoff_min: float | None = None
    _backoff_max: float | None = None

    @staticmethod
    def _env_int(var: str, default: int) -> int:
        """Return the integer value of environment variable ``var``, or
        ``default`` if the variable is unset or cannot be parsed.
        """
        try:
            return int(os.environ.get(var, default))
        except ValueError:
            return default

    @staticmethod
    def _env_float(var: str, default: float) -> float:
        """Return the float value of environment variable ``var``, or
        ``default`` if the variable is unset, unparseable or NaN.
        """
        try:
            value = float(os.environ.get(var, default))
        except ValueError:
            return default
        return default if math.isnan(value) else value

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is None:
            self._front_end_connections = self._env_int(
                "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
            )
        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is None:
            self._back_end_connections = self._env_int(
                "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
            )
        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm: `str`
            The name of a digest algorithm or the empty string if no algorithm
            is configured.
        """
        if self._digest_algorithm is None:
            digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
            # Only accept algorithms we know how to ask the server for.
            self._digest_algorithm = digest if digest in self.ACCEPTED_DIGESTS else ""
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) uses this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is None:
            # The presence of the variable is what matters, not its value.
            self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.
        """
        if self._timeout is None:
            self._timeout = (self.DEFAULT_TIMEOUT_CONNECT, self.DEFAULT_TIMEOUT_READ)
            try:
                candidate = (
                    float(os.environ.get("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT)),
                    float(os.environ.get("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ)),
                )
                # Both values must be valid numbers, otherwise keep the
                # default pair.
                if not math.isnan(candidate[0]) and not math.isnan(candidate[1]):
                    self._timeout = candidate
            except ValueError:
                pass
        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is None:
            self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is None:
            self._backoff_min = self._env_float("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN)
        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is None:
            self._backoff_max = self._env_float("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX)
        return self._backoff_max

219 

220 

@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    try:
        ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        verify: bool | str = ca_cert_bundle if ca_cert_bundle else True
        resp = requests.options(str(path), verify=verify, stream=False)
        if resp.status_code not in (requests.codes.ok, requests.codes.created):
            raise ValueError(
                f"Unexpected response to OPTIONS request for {path}, status: {resp.status_code} "
                f"{resp.reason}"
            )

        # A webDAV server must advertise at least compliance class "1" in
        # the "DAV" response header. We don't use locks, so a server
        # complying to class 1 is enough for our purposes.
        #
        # Compliance classes are documented in
        # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #   DAV: 1, 2
        #   DAV: 1, <http://apache.org/dav/propset/fs/1>
        dav_header = resp.headers.get("DAV")
        if dav_header is None:
            return False
        # Convert to str to keep mypy happy.
        return "1" in str(dav_header).replace(" ", "").split(",")
    except requests.exceptions.SSLError as e:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify a bundle of certificate authorities you trust which are "
            "not included in the default set of trusted authorities of your "
            "system."
        )
        raise e

273 

274 

275# Tuple (path, block_size) pointing to the location of a local directory 

276# to save temporary files and the block size of the underlying file system. 

277_TMPDIR: tuple[str, int] | None = None 

278 

279 

280def _get_temp_dir() -> tuple[str, int]: 

281 """Return the temporary directory path and block size. 

282 

283 This function caches its results in _TMPDIR. 

284 """ 

285 global _TMPDIR 

286 if _TMPDIR: 

287 return _TMPDIR 

288 

289 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or 

290 # 'TMPDIR', if defined. Otherwise use current working directory. 

291 tmpdir = os.getcwd() 

292 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")): 

293 if dir and os.path.isdir(dir): 

294 tmpdir = dir 

295 break 

296 

297 # Compute the block size as 256 blocks of typical size 

298 # (i.e. 4096 bytes) or 10 times the file system block size, 

299 # whichever is higher. This is a reasonable compromise between 

300 # using memory for buffering and the number of system calls 

301 # issued to read from or write to temporary files. 

302 fsstats = os.statvfs(tmpdir) 

303 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))) 

304 

305 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if not os.path.isfile(token):
            # The token value was passed in directly; nothing else to do.
            return

        # The argument is a path to a token file: record its absolute path,
        # enforce owner-only access, then load its contents.
        self._path = os.path.abspath(token)
        if not _is_protected(self._path):
            raise PermissionError(
                f"Bearer token file at {self._path} must be protected for access only by its owner"
            )
        self._refresh()

    def _refresh(self) -> None:
        """Reload the token from its backing file (if any) when that file
        was modified after the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        if self._token:
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

349 

350 

class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of urllib3 connection pools to keep (one pool per remote
        host).
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Lower bound of the interval from which the retry backoff factor is
        randomly selected (seconds).
    backoff_max : `float`, optional
        Upper bound of that interval (seconds). When not strictly greater
        than ``backoff_min`` it is replaced by ``backoff_min + 1.0``.
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # Number of connection pools to keep: there is one pool per remote
        # host. See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of connections per remote host to persist in each
        # connection pool. See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the
        # exponential backoff factor when retrying requests (seconds).
        # Ensure max is strictly greater than min.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Returns
        -------
        session : `requests.Session`
            The cached (or newly created) session for rpath's endpoint.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        # If so, this takes precedence over client certificates.
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

563 

564 

565class HttpResourcePath(ResourcePath): 

566 """General HTTP(S) resource. 

567 

568 Notes 

569 ----- 

570 In order to configure the behavior of instances of this class, the 

571 environment variables below are inspected: 

572 

573 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a 

574 "Expect: 100-Continue" header will be added to all HTTP PUT requests. 

575 This header is required by some servers to detect if the client 

576 knows how to handle redirections. In case of redirection, the body 

577 of the PUT request is sent to the redirected location and not to 

578 the front end server. 

579 

580 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a 

581 numeric value, they are interpreted as the number of seconds to wait 

582 for establishing a connection with the server and for reading its 

583 response, respectively. 

584 

585 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and 

586 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number 

587 of connections to attempt to persist with both the front end servers 

588 and the back end servers. 

589 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and 

590 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS. 

591 

592 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to 

593 ask the server to compute for every file's content sent to the server 

594 via a PUT request. No digest is requested if this variable is not set 

595 or is set to an invalid value. 

596 Valid values are those in ACCEPTED_DIGESTS. 

597 """ 

598 

599 _is_webdav: bool | None = None 

600 

601 # Configuration items for this class instances. 

602 _config = HttpResourcePathConfig() 

603 

604 # The session for metadata requests is used for interacting with 

605 # the front end servers for requests such as PROPFIND, HEAD, etc. Those 

606 # interactions are typically served by the front end servers. We want to 

607 # keep the connection to the front end servers open, to reduce the cost 

608 # associated to TCP and TLS handshaking for each new request. 

609 _metadata_session_store = SessionStore( 

610 num_pools=5, 

611 max_persistent_connections=_config.front_end_connections, 

612 backoff_min=_config.backoff_min, 

613 backoff_max=_config.backoff_max, 

614 ) 

615 

616 # The data session is used for interaction with the front end servers which 

617 # typically redirect to the back end servers for serving our PUT and GET 

618 # requests. We attempt to keep a single connection open with the front end 

619 # server, if possible. This depends on how the server behaves and the 

620 # kind of request. Some servers close the connection when redirecting 

621 # the client to a back end server, for instance when serving a PUT 

622 # request. 

623 _data_session_store = SessionStore( 

624 num_pools=25, 

625 max_persistent_connections=_config.back_end_connections, 

626 backoff_min=_config.backoff_min, 

627 backoff_max=_config.backoff_max, 

628 ) 

629 

630 # Process ID which created the session stores above. We need to store this 

631 # to replace sessions created by a parent process and inherited by a 

632 # child process after a fork, to avoid confusing the SSL layer. 

633 _pid: int = -1 

634 

635 @property 

636 def metadata_session(self) -> requests.Session: 

637 """Client session to send requests which do not require upload or 

638 download of data, i.e. mostly metadata requests. 

639 """ 

640 if hasattr(self, "_metadata_session"): 

641 if HttpResourcePath._pid == os.getpid(): 

642 return self._metadata_session 

643 else: 

644 # The metadata session we have in cache was likely created by 

645 # a parent process. Discard all the sessions in that store. 

646 self._metadata_session_store.clear() 

647 

648 # Retrieve a new metadata session. 

649 HttpResourcePath._pid = os.getpid() 

650 self._metadata_session: requests.Session = self._metadata_session_store.get(self) 

651 return self._metadata_session 

652 

653 @property 

654 def data_session(self) -> requests.Session: 

655 """Client session for uploading and downloading data.""" 

656 if hasattr(self, "_data_session"): 

657 if HttpResourcePath._pid == os.getpid(): 

658 return self._data_session 

659 else: 

660 # The data session we have in cache was likely created by 

661 # a parent process. Discard all the sessions in that store. 

662 self._data_session_store.clear() 

663 

664 # Retrieve a new data session. 

665 HttpResourcePath._pid = os.getpid() 

666 self._data_session: requests.Session = self._data_session_store.get(self) 

667 return self._data_session 

668 

669 def _clear_sessions(self) -> None: 

670 """Close the socket connections that are still open. 

671 

672 Used only in test suites to avoid warnings. 

673 """ 

674 self._metadata_session_store.clear() 

675 self._data_session_store.clear() 

676 

677 if hasattr(self, "_metadata_session"): 

678 delattr(self, "_metadata_session") 

679 

680 if hasattr(self, "_data_session"): 

681 delattr(self, "_data_session") 

682 

683 @property 

684 def is_webdav_endpoint(self) -> bool: 

685 """Check if the current endpoint implements WebDAV features. 

686 

687 This is stored per URI but cached by root so there is 

688 only one check per hostname. 

689 """ 

690 if self._is_webdav is not None: 

691 return self._is_webdav 

692 

693 self._is_webdav = _is_webdav_endpoint(self.root_uri()) 

694 return self._is_webdav 

695 

696 def exists(self) -> bool: 

697 """Check that a remote HTTP resource exists.""" 

698 log.debug("Checking if resource exists: %s", self.geturl()) 

699 if not self.is_webdav_endpoint: 

700 # The remote is a plain HTTP server. Let's attempt a HEAD 

701 # request, even if the behavior for such a request against a 

702 # directory is not specified, so it depends on the server 

703 # implementation. 

704 resp = self.metadata_session.head( 

705 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

706 ) 

707 return resp.status_code == requests.codes.ok # 200 

708 

709 # The remote endpoint is a webDAV server: send a PROPFIND request 

710 # to determine if it exists. 

711 resp = self._propfind() 

712 if resp.status_code == requests.codes.multi_status: # 207 

713 prop = _parse_propfind_response_body(resp.text)[0] 

714 return prop.exists 

715 else: # 404 Not Found 

716 return False 

717 

718 def size(self) -> int: 

719 """Return the size of the remote resource in bytes.""" 

720 if self.dirLike: 

721 return 0 

722 

723 if not self.is_webdav_endpoint: 

724 # The remote is a plain HTTP server. Send a HEAD request to 

725 # retrieve the size of the resource. 

726 resp = self.metadata_session.head( 

727 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

728 ) 

729 if resp.status_code == requests.codes.ok: # 200 

730 if "Content-Length" in resp.headers: 

731 return int(resp.headers["Content-Length"]) 

732 else: 

733 raise ValueError( 

734 f"Response to HEAD request to {self} does not contain 'Content-Length' header" 

735 ) 

736 elif resp.status_code == requests.codes.not_found: 

737 raise FileNotFoundError( 

738 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

739 ) 

740 else: 

741 raise ValueError( 

742 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} " 

743 f"{resp.reason}" 

744 ) 

745 

746 # The remote is a webDAV server: send a PROPFIND request to retrieve 

747 # the size of the resource. Sizes are only meaningful for files. 

748 resp = self._propfind() 

749 if resp.status_code == requests.codes.multi_status: # 207 

750 prop = _parse_propfind_response_body(resp.text)[0] 

751 if prop.is_file: 

752 return prop.size 

753 elif prop.is_directory: 

754 raise IsADirectoryError( 

755 f"Resource {self} is reported by server as a directory but has a file path" 

756 ) 

757 else: 

758 raise FileNotFoundError(f"Resource {self} does not exist") 

759 else: # 404 Not Found 

760 raise FileNotFoundError( 

761 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

762 ) 

763 

764 def mkdir(self) -> None: 

765 """Create the directory resource if it does not already exist.""" 

766 # Creating directories is only available on WebDAV back ends. 

767 if not self.is_webdav_endpoint: 

768 raise NotImplementedError( 

769 f"Creation of directory {self} is not implemented by plain HTTP servers" 

770 ) 

771 

772 if not self.dirLike: 

773 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}") 

774 

775 # Check if the target directory already exists. 

776 resp = self._propfind() 

777 if resp.status_code == requests.codes.multi_status: # 207 

778 prop = _parse_propfind_response_body(resp.text)[0] 

779 if prop.exists: 

780 if prop.is_directory: 

781 return 

782 else: 

783 # A file exists at this path 

784 raise NotADirectoryError( 

785 f"Can not create a directory for {self} because a file already exists at that path" 

786 ) 

787 

788 # Target directory does not exist. Create it and its ancestors as 

789 # needed. We need to test if parent URL is different from self URL, 

790 # otherwise we could be stuck in a recursive loop 

791 # where self == parent. 

792 if self.geturl() != self.parent().geturl(): 

793 self.parent().mkdir() 

794 

795 log.debug("Creating new directory: %s", self.geturl()) 

796 self._mkcol() 

797 

    def remove(self) -> None:
        """Remove the resource.

        Delegates to the ``_delete`` helper, which presumably issues an
        HTTP DELETE request for this URL -- confirm against ``_delete``'s
        implementation.
        """
        self._delete()

801 

802 def read(self, size: int = -1) -> bytes: 

803 """Open the resource and return the contents in bytes. 

804 

805 Parameters 

806 ---------- 

807 size : `int`, optional 

808 The number of bytes to read. Negative or omitted indicates 

809 that all data should be read. 

810 """ 

811 # Use the data session as a context manager to ensure that the 

812 # network connections to both the front end and back end servers are 

813 # closed after downloading the data. 

814 log.debug("Reading from remote resource: %s", self.geturl()) 

815 stream = True if size > 0 else False 

816 with self.data_session as session: 

817 with time_this(log, msg="GET %s", args=(self,)): 

818 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout) 

819 

820 if resp.status_code != requests.codes.ok: # 200 

821 raise FileNotFoundError( 

822 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}" 

823 ) 

824 if not stream: 

825 return resp.content 

826 else: 

827 return next(resp.iter_content(chunk_size=size)) 

828 

829 def write(self, data: bytes, overwrite: bool = True) -> None: 

830 """Write the supplied bytes to the new resource. 

831 

832 Parameters 

833 ---------- 

834 data : `bytes` 

835 The bytes to write to the resource. The entire contents of the 

836 resource will be replaced. 

837 overwrite : `bool`, optional 

838 If `True` the resource will be overwritten if it exists. Otherwise 

839 the write will fail. 

840 """ 

841 log.debug("Writing to remote resource: %s", self.geturl()) 

842 if not overwrite: 

843 if self.exists(): 

844 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled") 

845 

846 # Ensure the parent directory exists. 

847 self.parent().mkdir() 

848 

849 # Upload the data. 

850 log.debug("Writing data to remote resource: %s", self.geturl()) 

851 self._put(data=data) 

852 

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            If `True`, an existing destination will be overwritten; otherwise
            `FileExistsError` is raised when the destination exists.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if ``transfer`` is not one of the supported modes.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()

917 

918 def walk( 

919 self, file_filter: str | re.Pattern | None = None 

920 ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]: 

921 """Walk the directory tree returning matching files and directories. 

922 

923 Parameters 

924 ---------- 

925 file_filter : `str` or `re.Pattern`, optional 

926 Regex to filter out files from the list before it is returned. 

927 

928 Yields 

929 ------ 

930 dirpath : `ResourcePath` 

931 Current directory being examined. 

932 dirnames : `list` of `str` 

933 Names of subdirectories within dirpath. 

934 filenames : `list` of `str` 

935 Names of all the files within dirpath. 

936 """ 

937 if not self.dirLike: 

938 raise ValueError("Can not walk a non-directory URI") 

939 

940 # Walking directories is only available on WebDAV back ends. 

941 if not self.is_webdav_endpoint: 

942 raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers") 

943 

944 if isinstance(file_filter, str): 

945 file_filter = re.compile(file_filter) 

946 

947 resp = self._propfind(depth="1") 

948 if resp.status_code == requests.codes.multi_status: # 207 

949 files: list[str] = [] 

950 dirs: list[str] = [] 

951 

952 for prop in _parse_propfind_response_body(resp.text): 

953 if prop.is_file: 

954 files.append(prop.name) 

955 elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")): 

956 # Only include the names of sub-directories not the name of 

957 # the directory being walked. 

958 dirs.append(prop.name) 

959 

960 if file_filter is not None: 

961 files = [f for f in files if file_filter.search(f)] 

962 

963 if not dirs and not files: 

964 return 

965 else: 

966 yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files 

967 

968 for dir in dirs: 

969 new_uri = self.join(dir, forceDirectory=True) 

970 yield from new_uri.walk(file_filter) 

971 

    def _as_local(self) -> tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the server does not respond with "200 OK".
        ValueError
            Raised if the downloaded size does not match the Content-Length
            header (only checked for non-encoded responses).
        """
        # Use the session as a context manager to ensure that connections
        # to both the front end and back end servers are closed after the
        # download operation is finished.
        with self.data_session as session:
            # Stream the download so large files are written chunk by chunk
            # instead of being held fully in memory.
            resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
            if resp.status_code != requests.codes.ok:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
                )

            # The buffering value doubles as the download chunk size so that
            # reads and writes are aligned.
            tmpdir, buffering = _get_temp_dir()
            # delete=False: the caller takes ownership of the temporary file.
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                # -1 signals that the server did not provide Content-Length.
                expected_length = int(resp.headers.get("Content-Length", "-1"))
                with time_this(
                    log,
                    msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                    args=(self, expected_length, tmpFile.name, buffering),
                    mem_usage=self._config.collect_memory_usage,
                    mem_unit=u.mebibyte,
                ):
                    content_length = 0
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
                        content_length += len(chunk)

            # Check that the expected and actual content lengths match. Perform
            # this check only when the contents of the file was not encoded by
            # the server (iter_content yields decoded bytes, so the byte count
            # would not match Content-Length for encoded responses).
            if "Content-Encoding" not in resp.headers:
                if expected_length >= 0 and expected_length != content_length:
                    raise ValueError(
                        f"Size of downloaded file does not match value in Content-Length header for {self}: "
                        f"expecting {expected_length} and got {content_length} bytes"
                    )

            return tmpFile.name, True

1020 

1021 def _send_webdav_request( 

1022 self, 

1023 method: str, 

1024 url: str | None = None, 

1025 headers: dict[str, str] = {}, 

1026 body: str | None = None, 

1027 session: requests.Session | None = None, 

1028 timeout: tuple[float, float] | None = None, 

1029 ) -> requests.Response: 

1030 """Send a webDAV request and correctly handle redirects. 

1031 

1032 Parameters 

1033 ---------- 

1034 method : `str` 

1035 The mthod of the HTTP request to be sent, e.g. PROPFIND, MKCOL. 

1036 headers : `dict`, optional 

1037 A dictionary of key-value pairs (both strings) to include as 

1038 headers in the request. 

1039 body: `str`, optional 

1040 The body of the request. 

1041 

1042 Notes 

1043 ----- 

1044 This way of sending webDAV requests is necessary for handling 

1045 redirection ourselves, since the 'requests' package changes the method 

1046 of the redirected request when the server responds with status 302 and 

1047 the method of the original request is not HEAD (which is the case for 

1048 webDAV requests). 

1049 

1050 That means that when the webDAV server we interact with responds with 

1051 a redirection to a PROPFIND or MKCOL request, the request gets 

1052 converted to a GET request when sent to the redirected location. 

1053 

1054 See `requests.sessions.SessionRedirectMixin.rebuild_method()` in 

1055 https://github.com/psf/requests/blob/main/requests/sessions.py 

1056 

1057 This behavior of the 'requests' package is meant to be compatible with 

1058 what is specified in RFC 9110: 

1059 

1060 https://www.rfc-editor.org/rfc/rfc9110#name-302-found 

1061 

1062 For our purposes, we do need to follow the redirection and send a new 

1063 request using the same HTTP verb. 

1064 """ 

1065 if url is None: 

1066 url = self.geturl() 

1067 

1068 if session is None: 

1069 session = self.metadata_session 

1070 

1071 if timeout is None: 

1072 timeout = self._config.timeout 

1073 

1074 with time_this( 

1075 log, 

1076 msg="%s %s", 

1077 args=( 

1078 method, 

1079 url, 

1080 ), 

1081 mem_usage=self._config.collect_memory_usage, 

1082 mem_unit=u.mebibyte, 

1083 ): 

1084 for _ in range(max_redirects := 5): 

1085 resp = session.request( 

1086 method, 

1087 url, 

1088 data=body, 

1089 headers=headers, 

1090 stream=False, 

1091 timeout=timeout, 

1092 allow_redirects=False, 

1093 ) 

1094 if resp.is_redirect: 

1095 url = resp.headers["Location"] 

1096 else: 

1097 return resp 

1098 

1099 # We reached the maximum allowed number of redirects. 

1100 # Stop trying. 

1101 raise ValueError( 

1102 f"Could not get a response to {method} request for {self} after " 

1103 f"{max_redirects} redirections" 

1104 ) 

1105 

1106 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response: 

1107 """Send a PROPFIND webDAV request and return the response. 

1108 

1109 Parameters 

1110 ---------- 

1111 body : `str`, optional 

1112 The body of the PROPFIND request to send to the server. If 

1113 provided, it is expected to be a XML document. 

1114 depth : `str`, optional 

1115 The value of the 'Depth' header to include in the request. 

1116 

1117 Returns 

1118 ------- 

1119 response : `requests.Response` 

1120 Response to the PROPFIND request. 

1121 

1122 Notes 

1123 ----- 

1124 It raises `ValueError` if the status code of the PROPFIND request 

1125 is different from "207 Multistatus" or "404 Not Found". 

1126 """ 

1127 if body is None: 

1128 # Request only the DAV live properties we are explicitly interested 

1129 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified' 

1130 # and 'displayname'. 

1131 body = ( 

1132 """<?xml version="1.0" encoding="utf-8" ?>""" 

1133 """<D:propfind xmlns:D="DAV:"><D:prop>""" 

1134 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>""" 

1135 """</D:prop></D:propfind>""" 

1136 ) 

1137 headers = { 

1138 "Depth": depth, 

1139 "Content-Type": 'application/xml; charset="utf-8"', 

1140 "Content-Length": str(len(body)), 

1141 } 

1142 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body) 

1143 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found): 

1144 return resp 

1145 else: 

1146 raise ValueError( 

1147 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} " 

1148 f"{resp.reason}" 

1149 ) 

1150 

1151 def _options(self) -> requests.Response: 

1152 """Send a OPTIONS webDAV request for this resource.""" 

1153 resp = self._send_webdav_request("OPTIONS") 

1154 if resp.status_code in (requests.codes.ok, requests.codes.created): 

1155 return resp 

1156 

1157 raise ValueError( 

1158 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} " f"{resp.reason}" 

1159 ) 

1160 

1161 def _head(self) -> requests.Response: 

1162 """Send a HEAD webDAV request for this resource.""" 

1163 return self._send_webdav_request("HEAD") 

1164 

1165 def _mkcol(self) -> None: 

1166 """Send a MKCOL webDAV request to create a collection. The collection 

1167 may already exist. 

1168 """ 

1169 resp = self._send_webdav_request("MKCOL") 

1170 if resp.status_code == requests.codes.created: # 201 

1171 return 

1172 

1173 if resp.status_code == requests.codes.method_not_allowed: # 405 

1174 # The remote directory already exists 

1175 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl()) 

1176 else: 

1177 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}") 

1178 

1179 def _delete(self) -> None: 

1180 """Send a DELETE webDAV request for this resource.""" 

1181 log.debug("Deleting %s ...", self.geturl()) 

1182 

1183 # If this is a directory, ensure the remote is a webDAV server because 

1184 # plain HTTP servers don't support DELETE requests on non-file 

1185 # paths. 

1186 if self.dirLike and not self.is_webdav_endpoint: 

1187 raise NotImplementedError( 

1188 f"Deletion of directory {self} is not implemented by plain HTTP servers" 

1189 ) 

1190 

1191 # Deleting non-empty directories may take some time, so increase 

1192 # the timeout for getting a response from the server. 

1193 timeout = self._config.timeout 

1194 if self.dirLike: 

1195 timeout = (timeout[0], timeout[1] * 100) 

1196 resp = self._send_webdav_request("DELETE", timeout=timeout) 

1197 if resp.status_code in ( 

1198 requests.codes.ok, 

1199 requests.codes.accepted, 

1200 requests.codes.no_content, 

1201 requests.codes.not_found, 

1202 ): 

1203 # We can get a "404 Not Found" error when the file or directory 

1204 # does not exist or when the DELETE request was retried several 

1205 # times and a previous attempt actually deleted the resource. 

1206 # Therefore we consider that a "Not Found" response is not an 

1207 # error since we reached the state desired by the user. 

1208 return 

1209 else: 

1210 # TODO: the response to a DELETE request against a webDAV server 

1211 # may be multistatus. If so, we need to parse the reponse body to 

1212 # determine more precisely the reason of the failure (e.g. a lock) 

1213 # and provide a more helpful error message. 

1214 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}") 

1215 

1216 def _copy_via_local(self, src: ResourcePath) -> None: 

1217 """Replace the contents of this resource with the contents of a remote 

1218 resource by using a local temporary file. 

1219 

1220 Parameters 

1221 ---------- 

1222 src : `HttpResourcePath` 

1223 The source of the contents to copy to `self`. 

1224 """ 

1225 with src.as_local() as local_uri: 

1226 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri) 

1227 with open(local_uri.ospath, "rb") as f: 

1228 self._put(data=f) 

1229 

1230 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None: 

1231 """Send a COPY or MOVE webDAV request to copy or replace the contents 

1232 of this resource with the contents of another resource located in the 

1233 same server. 

1234 

1235 Parameters 

1236 ---------- 

1237 method : `str` 

1238 The method to perform. Valid values are "COPY" or "MOVE" (in 

1239 uppercase). 

1240 

1241 src : `HttpResourcePath` 

1242 The source of the contents to move to `self`. 

1243 """ 

1244 headers = {"Destination": self.geturl()} 

1245 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session) 

1246 if resp.status_code in (requests.codes.created, requests.codes.no_content): 

1247 return 

1248 

1249 if resp.status_code == requests.codes.multi_status: 

1250 tree = eTree.fromstring(resp.content) 

1251 status_element = tree.find("./{DAV:}response/{DAV:}status") 

1252 status = status_element.text if status_element is not None else "unknown" 

1253 error = tree.find("./{DAV:}response/{DAV:}error") 

1254 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}") 

1255 else: 

1256 raise ValueError( 

1257 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}" 

1258 ) 

1259 

    def _copy(self, src: HttpResourcePath) -> None:
        """Send a COPY webDAV request to replace the contents of this resource
        (if any) with the contents of another resource located in the same
        server.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to copy to `self`.
        """
        # Neither dCache nor XrootD currently implement the COPY
        # webDAV method as documented in
        # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
        # (See issues DM-37603 and DM-37651 for details)
        #
        # For the time being, we use a temporary local file to
        # perform the copy client side.
        # TODO: when those 2 issues above are solved remove the 3 lines below.
        # NOTE: 'must_use_local' is deliberately kept as a flag (rather than
        # deleting the server-side COPY call below) so that the server-side
        # path can be re-enabled by flipping it once the issues are fixed.
        must_use_local = True
        if must_use_local:
            return self._copy_via_local(src)

        return self._copy_or_move("COPY", src)

1283 

1284 def _move(self, src: HttpResourcePath) -> None: 

1285 """Send a MOVE webDAV request to replace the contents of this resource 

1286 with the contents of another resource located in the same server. 

1287 

1288 Parameters 

1289 ---------- 

1290 src : `HttpResourcePath` 

1291 The source of the contents to move to `self`. 

1292 """ 

1293 return self._copy_or_move("MOVE", src) 

1294 

    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the final PUT does not succeed (status other than
            200, 201 or 204).
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
            # If the server redirected us, upload directly to the redirected
            # location; otherwise keep the original URL.
            if resp.is_redirect:
                url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithhms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")

1380 

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        """Yield a resource handle for this URL.

        For read modes, if the server advertises byte-range support
        (``Accept-Ranges: bytes``), a remote range-reading handle is yielded;
        otherwise fall back to the base implementation.
        """
        resp = self._head()
        # Range reads are only possible if the HEAD succeeded and the server
        # explicitly advertises byte-range support.
        accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
        handle: ResourceHandleProtocol
        if mode in ("rb", "r") and accepts_range:
            handle = HttpReadResourceHandle(
                mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
            )
            if mode == "r":
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
            else:
                yield handle
        else:
            # Write modes, or a server without range support: delegate to the
            # base-class implementation (typically via a local temporary file).
            with super()._openImpl(mode, encoding=encoding) as http_handle:
                yield http_handle

1404 

1405 

def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug("   method=%s", resp.request.method)
    log.debug("   URL=%s", resp.request.url)
    log.debug("   headers=%s", resp.request.headers)
    if resp.request.method == "PUT":
        # PUT bodies can be large binary payloads; don't log them.
        log.debug("   body=<data>")
    elif resp.request.body is None:
        log.debug("   body=<empty>")
    else:
        log.debug("   body=%r", resp.request.body[:120])

    log.debug("Response:")
    log.debug("   status_code=%d", resp.status_code)
    log.debug("   headers=%s", resp.headers)
    if not resp.content:
        log.debug("   body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        # Idiomatic single lookup instead of 'in' test plus subscript.
        log.debug("   body=%r", resp.content)
    else:
        log.debug("   body=%r", resp.content[:80])

1439 

1440 

1441def _is_protected(filepath: str) -> bool: 

1442 """Return true if the permissions of file at filepath only allow for access 

1443 by its owner. 

1444 

1445 Parameters 

1446 ---------- 

1447 filepath : `str` 

1448 Path of a local file. 

1449 """ 

1450 if not os.path.isfile(filepath): 

1451 return False 

1452 mode = stat.S_IMODE(os.stat(filepath).st_mode) 

1453 owner_accessible = bool(mode & stat.S_IRWXU) 

1454 group_accessible = bool(mode & stat.S_IRWXG) 

1455 other_accessible = bool(mode & stat.S_IRWXO) 

1456 return owner_accessible and not group_accessible and not other_accessible 

1457 

1458 

def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV
    PROPFIND request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request.

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'response' element found in ``body``.

    Raises
    ------
    ValueError
        Raised if no 'response' element could be extracted from ``body``;
        at least one response is expected.

    Notes
    -----
    A response body to a PROPFIND request has the following shape (indented
    for readability):

        <?xml version="1.0" encoding="UTF-8"?>
        <D:multistatus xmlns:D="DAV:">
          <D:response>
            <D:href>path/to/resource</D:href>
            <D:propstat>
              <D:prop>
                <D:resourcetype><D:collection xmlns:D="DAV:"/></D:resourcetype>
                <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
                <D:getcontentlength>12345</D:getcontentlength>
              </D:prop>
              <D:status>HTTP/1.1 200 OK</D:status>
            </D:propstat>
          </D:response>
          <D:response>...</D:response>
        </D:multistatus>
    """
    # Extract the relevant properties from every 'response' element.
    multistatus = eTree.fromstring(body.strip())
    properties = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]
    if not properties:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return properties

1520 

1521 

1522class DavProperty: 

1523 """Helper class to encapsulate select live DAV properties of a single 

1524 resource, as retrieved via a PROPFIND request. 

1525 """ 

1526 

1527 # Regular expression to compare against the 'status' element of a 

1528 # PROPFIND response's 'propstat' element. 

1529 _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE) 

1530 

1531 def __init__(self, response: eTree.Element | None): 

1532 self._href: str = "" 

1533 self._displayname: str = "" 

1534 self._collection: bool = False 

1535 self._getlastmodified: str = "" 

1536 self._getcontentlength: int = -1 

1537 

1538 if response is not None: 

1539 self._parse(response) 

1540 

1541 def _parse(self, response: eTree.Element) -> None: 

1542 # Extract 'href'. 

1543 if (element := response.find("./{DAV:}href")) is not None: 

1544 # We need to use "str(element.text)"" instead of "element.text" to 

1545 # keep mypy happy. 

1546 self._href = str(element.text).strip() 

1547 else: 

1548 raise ValueError( 

1549 f"Property 'href' expected but not found in PROPFIND response: " 

1550 f"{eTree.tostring(response, encoding='unicode')}" 

1551 ) 

1552 

1553 for propstat in response.findall("./{DAV:}propstat"): 

1554 # Only extract properties of interest with status OK. 

1555 status = propstat.find("./{DAV:}status") 

1556 if status is None or not self._status_ok_rex.match(str(status.text)): 

1557 continue 

1558 

1559 for prop in propstat.findall("./{DAV:}prop"): 

1560 # Parse "collection". 

1561 if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None: 

1562 self._collection = True 

1563 

1564 # Parse "getlastmodified". 

1565 if (element := prop.find("./{DAV:}getlastmodified")) is not None: 

1566 self._getlastmodified = str(element.text) 

1567 

1568 # Parse "getcontentlength". 

1569 if (element := prop.find("./{DAV:}getcontentlength")) is not None: 

1570 self._getcontentlength = int(str(element.text)) 

1571 

1572 # Parse "displayname". 

1573 if (element := prop.find("./{DAV:}displayname")) is not None: 

1574 self._displayname = str(element.text) 

1575 

1576 # Some webDAV servers don't include the 'displayname' property in the 

1577 # response so try to infer it from the value of the 'href' property. 

1578 # Depending on the server the href value may end with '/'. 

1579 if not self._displayname: 

1580 self._displayname = os.path.basename(self._href.rstrip("/")) 

1581 

1582 # Force a size of 0 for collections. 

1583 if self._collection: 

1584 self._getcontentlength = 0 

1585 

1586 @property 

1587 def exists(self) -> bool: 

1588 # It is either a directory or a file with length of at least zero 

1589 return self._collection or self._getcontentlength >= 0 

1590 

1591 @property 

1592 def is_directory(self) -> bool: 

1593 return self._collection 

1594 

1595 @property 

1596 def is_file(self) -> bool: 

1597 return not self._collection 

1598 

1599 @property 

1600 def size(self) -> int: 

1601 return self._getcontentlength 

1602 

1603 @property 

1604 def name(self) -> str: 

1605 return self._displayname 

1606 

1607 @property 

1608 def href(self) -> str: 

1609 return self._href