Coverage for python/lsst/resources/http.py: 20%

561 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-18 02:06 -0700

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import math 

21import os 

22import os.path 

23import random 

24import re 

25import stat 

26import tempfile 

27import xml.etree.ElementTree as eTree 

28from typing import TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast 

29 

30import requests 

31from astropy import units as u 

32from lsst.utils.timer import time_this 

33from requests.adapters import HTTPAdapter 

34from requests.auth import AuthBase 

35from urllib3.util.retry import Retry 

36 

37from ._resourceHandles import ResourceHandleProtocol 

38from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle 

39from ._resourcePath import ResourcePath 

40 

41if TYPE_CHECKING: 

42 from .utils import TransactionProtocol 

43 

44log = logging.getLogger(__name__) 

45 

46 

class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    All values are lazily initialized from environment variables the first
    time the corresponding property is accessed, then cached on the instance
    for subsequent accesses.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Lazily-initialized caches backing the properties below. ``None`` means
    # "not yet computed".
    _front_end_connections: Optional[int] = None
    _back_end_connections: Optional[int] = None
    _digest_algorithm: Optional[str] = None
    _send_expect_on_put: Optional[bool] = None
    _timeout: Optional[tuple[float, float]] = None
    _collect_memory_usage: Optional[bool] = None
    _backoff_min: Optional[float] = None
    _backoff_max: Optional[float] = None

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Return the integer value of environment variable `name`, or
        `default` if the variable is unset or cannot be parsed.
        """
        try:
            return int(os.environ.get(name, default))
        except ValueError:
            return default

    @staticmethod
    def _env_float(name: str, default: float) -> float:
        """Return the float value of environment variable `name`, or
        `default` if the variable is unset, cannot be parsed or is NaN.
        """
        try:
            value = float(os.environ.get(name, default))
        except ValueError:
            return default
        # NaN would poison timeout/backoff computations downstream.
        return default if math.isnan(value) else value

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is None:
            self._front_end_connections = self._env_int(
                "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
            )
        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is None:
            self._back_end_connections = self._env_int(
                "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
            )
        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm: `str`
            The name of a digest algorithm or the empty string if no algorithm
            is configured.
        """
        if self._digest_algorithm is None:
            digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
            # Only keep digests we know the server side understands.
            self._digest_algorithm = digest if digest in self.ACCEPTED_DIGESTS else ""
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) uses this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is None:
            # Mere presence of the variable activates the behavior; its value
            # is ignored.
            self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.
        """
        if self._timeout is None:
            # Each timeout independently falls back to its own default, so a
            # single misconfigured variable does not discard the other one.
            self._timeout = (
                self._env_float("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT),
                self._env_float("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ),
            )
        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is None:
            self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is None:
            self._backoff_min = self._env_float("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN)
        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is None:
            self._backoff_max = self._env_float("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX)
        return self._backoff_max

226 

227 

@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    ValueError
        If the server responds to the OPTIONS request with an unexpected
        status code.
    requests.exceptions.SSLError
        If the server certificate cannot be verified.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    try:
        ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True
        resp = requests.options(str(path), verify=verify, stream=False)
        if resp.status_code not in (requests.codes.ok, requests.codes.created):
            raise ValueError(
                f"Unexpected response to OPTIONS request for {path}, status: {resp.status_code} "
                f"{resp.reason}"
            )

        # Check that "1" is part of the value of the "DAV" header. We don't
        # use locks, so a server complying to class 1 is enough for our
        # purposes. All webDAV servers must advertise at least compliance
        # class "1".
        #
        # Compliance classes are documented in
        # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #    DAV: 1, 2
        #    DAV: 1, <http://apache.org/dav/propset/fs/1>
        if "DAV" not in resp.headers:
            return False
        # Convert to str to keep mypy happy
        compliance_class = str(resp.headers.get("DAV"))
        return "1" in compliance_class.replace(" ", "").split(",")
    except requests.exceptions.SSLError:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify a bundle of certificate authorities you trust which are "
            "not included in the default set of trusted authorities of your "
            "system."
        )
        # Bare raise preserves the original exception and its traceback,
        # unlike the previous `raise e` form.
        raise

280 

281 

# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system.
# Lazily initialized and cached by _get_temp_dir(); None until first use.
_TMPDIR: Optional[tuple[str, int]] = None

285 

286 

def _get_temp_dir() -> tuple[str, int]:
    """Return the directory to use for temporary files together with a
    buffer size suited to that directory's file system.

    The result is computed once and memoized in the module-level _TMPDIR.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # Honor 'LSST_RESOURCES_TMPDIR' then 'TMPDIR' when one of them points
    # to an existing directory; otherwise fall back to the current working
    # directory.
    tmpdir = os.getcwd()
    for candidate in (os.getenv("LSST_RESOURCES_TMPDIR"), os.getenv("TMPDIR")):
        if candidate and os.path.isdir(candidate):
            tmpdir = candidate
            break

    # Buffer size: 256 blocks of typical size (4096 bytes) or 10 file
    # system blocks, whichever is larger. This is a reasonable compromise
    # between the memory used for buffering and the number of system calls
    # issued to read from or write to temporary files.
    block_size = max(10 * os.statvfs(tmpdir).f_bsize, 256 * 4096)
    _TMPDIR = (tmpdir, block_size)
    return _TMPDIR

311 

312 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = None
        self._path = None
        self._mtime: float = -1.0
        if not token:
            # No token configured: requests will be sent unauthenticated.
            return

        self._token = token
        if os.path.isfile(token):
            # The argument is a path to a token file: record its absolute
            # location and check it is readable only by its owner.
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
        # Load the token contents from file, if it is file-backed.
        self._refresh()

    def _refresh(self) -> None:
        """Reload the token from its backing file (if any) when that file was
        modified after the last time it was read.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            self._mtime = mtime
            with open(self._path) as f:
                self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        if self._token:
            # Pick up any rotated token before stamping the header.
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

356 

357 

class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep (one pool per remote host).
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Lower bound of the interval from which the retry backoff factor is
        randomly drawn (seconds).
    backoff_max : `float`, optional
        Upper bound of that interval (seconds). If not strictly greater than
        ``backoff_min``, an upper bound of ``backoff_min + 1.0`` is used
        instead.
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # Number of connection pools to keep: there is one pool per remote
        # host. See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of connections per remote host to persist in each
        # connection pool. See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the
        # exponential backoff factor when retrying requests (seconds).
        # Guarantee that the upper bound is strictly above the lower bound.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """

        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Returns
        -------
        session : `requests.Session`
            The cached session for the endpoint of ``rpath``, created on
            first use.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

571 

572 

class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    In order to configure the behavior of instances of this class, the
    environment variables below are inspected:

    - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
      "Expect: 100-Continue" header will be added to all HTTP PUT requests.
      This header is required by some servers to detect if the client
      knows how to handle redirections. In case of redirection, the body
      of the PUT request is sent to the redirected location and not to
      the front end server.

    - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a
      numeric value, they are interpreted as the number of seconds to wait
      for establishing a connection with the server and for reading its
      response, respectively.

    - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and
      LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number
      of connections to attempt to persist with both the front end servers
      and the back end servers.
      Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and
      DEFAULT_BACKEND_PERSISTENT_CONNECTIONS.

    - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to
      ask the server to compute for every file's content sent to the server
      via a PUT request. No digest is requested if this variable is not set
      or is set to an invalid value.
      Valid values are those in ACCEPTED_DIGESTS.
    """

    # Cached result of the WebDAV-capability check for this URI; None means
    # the check has not been performed yet (see is_webdav_endpoint).
    _is_webdav: Optional[bool] = None

    # Configuration items shared by all instances of this class.
    _config = HttpResourcePathConfig()

    # The session for metadata requests is used for interacting with
    # the front end servers for requests such as PROPFIND, HEAD, etc. Those
    # interactions are typically served by the front end servers. We want to
    # keep the connection to the front end servers open, to reduce the cost
    # associated to TCP and TLS handshaking for each new request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # The data session is used for interaction with the front end servers which
    # typically redirect to the back end servers for serving our PUT and GET
    # requests. We attempt to keep a single connection open with the front end
    # server, if possible. This depends on how the server behaves and the
    # kind of request. Some servers close the connection when redirecting
    # the client to a back end server, for instance when serving a PUT
    # request.
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. We need to store this
    # to replace sessions created by a parent process and inherited by a
    # child process after a fork, to avoid confusing the SSL layer.
    _pid: int = -1

642 

643 @property 

644 def metadata_session(self) -> requests.Session: 

645 """Client session to send requests which do not require upload or 

646 download of data, i.e. mostly metadata requests. 

647 """ 

648 

649 if hasattr(self, "_metadata_session"): 

650 if HttpResourcePath._pid == os.getpid(): 

651 return self._metadata_session 

652 else: 

653 # The metadata session we have in cache was likely created by 

654 # a parent process. Discard all the sessions in that store. 

655 self._metadata_session_store.clear() 

656 

657 # Retrieve a new metadata session. 

658 HttpResourcePath._pid = os.getpid() 

659 self._metadata_session: requests.Session = self._metadata_session_store.get(self) 

660 return self._metadata_session 

661 

662 @property 

663 def data_session(self) -> requests.Session: 

664 """Client session for uploading and downloading data.""" 

665 

666 if hasattr(self, "_data_session"): 

667 if HttpResourcePath._pid == os.getpid(): 

668 return self._data_session 

669 else: 

670 # The data session we have in cache was likely created by 

671 # a parent process. Discard all the sessions in that store. 

672 self._data_session_store.clear() 

673 

674 # Retrieve a new data session. 

675 HttpResourcePath._pid = os.getpid() 

676 self._data_session: requests.Session = self._data_session_store.get(self) 

677 return self._data_session 

678 

679 def _clear_sessions(self) -> None: 

680 """Internal method to close the socket connections still open. Used 

681 only in test suites to avoid warnings. 

682 """ 

683 self._metadata_session_store.clear() 

684 self._data_session_store.clear() 

685 

686 if hasattr(self, "_metadata_session"): 

687 delattr(self, "_metadata_session") 

688 

689 if hasattr(self, "_data_session"): 

690 delattr(self, "_data_session") 

691 

692 @property 

693 def is_webdav_endpoint(self) -> bool: 

694 """Check if the current endpoint implements WebDAV features. 

695 

696 This is stored per URI but cached by root so there is 

697 only one check per hostname. 

698 """ 

699 if self._is_webdav is not None: 

700 return self._is_webdav 

701 

702 self._is_webdav = _is_webdav_endpoint(self.root_uri()) 

703 return self._is_webdav 

704 

705 def exists(self) -> bool: 

706 """Check that a remote HTTP resource exists.""" 

707 log.debug("Checking if resource exists: %s", self.geturl()) 

708 if not self.is_webdav_endpoint: 

709 # The remote is a plain HTTP server. Let's attempt a HEAD 

710 # request, even if the behavior for such a request against a 

711 # directory is not specified, so it depends on the server 

712 # implementation. 

713 resp = self.metadata_session.head( 

714 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

715 ) 

716 return resp.status_code == requests.codes.ok # 200 

717 

718 # The remote endpoint is a webDAV server: send a PROPFIND request 

719 # to determine if it exists. 

720 resp = self._propfind() 

721 if resp.status_code == requests.codes.multi_status: # 207 

722 prop = _parse_propfind_response_body(resp.text)[0] 

723 return prop.exists 

724 else: # 404 Not Found 

725 return False 

726 

727 def size(self) -> int: 

728 """Return the size of the remote resource in bytes.""" 

729 if self.dirLike: 

730 return 0 

731 

732 if not self.is_webdav_endpoint: 

733 # The remote is a plain HTTP server. Send a HEAD request to 

734 # retrieve the size of the resource. 

735 resp = self.metadata_session.head( 

736 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False 

737 ) 

738 if resp.status_code == requests.codes.ok: # 200 

739 if "Content-Length" in resp.headers: 

740 return int(resp.headers["Content-Length"]) 

741 else: 

742 raise ValueError( 

743 f"Response to HEAD request to {self} does not contain 'Content-Length' header" 

744 ) 

745 elif resp.status_code == requests.codes.not_found: 

746 raise FileNotFoundError( 

747 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

748 ) 

749 else: 

750 raise ValueError( 

751 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} " 

752 f"{resp.reason}" 

753 ) 

754 

755 # The remote is a webDAV server: send a PROPFIND request to retrieve 

756 # the size of the resource. Sizes are only meaningful for files. 

757 resp = self._propfind() 

758 if resp.status_code == requests.codes.multi_status: # 207 

759 prop = _parse_propfind_response_body(resp.text)[0] 

760 if prop.is_file: 

761 return prop.size 

762 elif prop.is_directory: 

763 raise IsADirectoryError( 

764 f"Resource {self} is reported by server as a directory but has a file path" 

765 ) 

766 else: 

767 raise FileNotFoundError(f"Resource {self} does not exist") 

768 else: # 404 Not Found 

769 raise FileNotFoundError( 

770 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}" 

771 ) 

772 

773 def mkdir(self) -> None: 

774 """Create the directory resource if it does not already exist.""" 

775 # Creating directories is only available on WebDAV back ends. 

776 if not self.is_webdav_endpoint: 

777 raise NotImplementedError( 

778 f"Creation of directory {self} is not implemented by plain HTTP servers" 

779 ) 

780 

781 if not self.dirLike: 

782 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}") 

783 

784 # Check if the target directory already exists. 

785 resp = self._propfind() 

786 if resp.status_code == requests.codes.multi_status: # 207 

787 prop = _parse_propfind_response_body(resp.text)[0] 

788 if prop.exists: 

789 if prop.is_directory: 

790 return 

791 else: 

792 # A file exists at this path 

793 raise NotADirectoryError( 

794 f"Can not create a directory for {self} because a file already exists at that path" 

795 ) 

796 

797 # Target directory does not exist. Create it and its ancestors as 

798 # needed. We need to test if parent URL is different from self URL, 

799 # otherwise we could be stuck in a recursive loop 

800 # where self == parent. 

801 if self.geturl() != self.parent().geturl(): 

802 self.parent().mkdir() 

803 

804 log.debug("Creating new directory: %s", self.geturl()) 

805 self._mkcol() 

806 

    def remove(self) -> None:
        """Remove the resource by delegating to the `_delete` helper."""
        self._delete()

810 

811 def read(self, size: int = -1) -> bytes: 

812 """Open the resource and return the contents in bytes. 

813 

814 Parameters 

815 ---------- 

816 size : `int`, optional 

817 The number of bytes to read. Negative or omitted indicates 

818 that all data should be read. 

819 """ 

820 

821 # Use the data session as a context manager to ensure that the 

822 # network connections to both the front end and back end servers are 

823 # closed after downloading the data. 

824 log.debug("Reading from remote resource: %s", self.geturl()) 

825 stream = True if size > 0 else False 

826 with self.data_session as session: 

827 with time_this(log, msg="GET %s", args=(self,)): 

828 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout) 

829 

830 if resp.status_code != requests.codes.ok: # 200 

831 raise FileNotFoundError( 

832 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}" 

833 ) 

834 if not stream: 

835 return resp.content 

836 else: 

837 return next(resp.iter_content(chunk_size=size)) 

838 

839 def write(self, data: bytes, overwrite: bool = True) -> None: 

840 """Write the supplied bytes to the new resource. 

841 

842 Parameters 

843 ---------- 

844 data : `bytes` 

845 The bytes to write to the resource. The entire contents of the 

846 resource will be replaced. 

847 overwrite : `bool`, optional 

848 If `True` the resource will be overwritten if it exists. Otherwise 

849 the write will fail. 

850 """ 

851 log.debug("Writing to remote resource: %s", self.geturl()) 

852 if not overwrite: 

853 if self.exists(): 

854 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled") 

855 

856 # Ensure the parent directory exists. 

857 self.parent().mkdir() 

858 

859 # Upload the data. 

860 log.debug("Writing data to remote resource: %s", self.geturl()) 

861 self._put(data=data) 

862 

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes in ``self.transferModes``; "move" removes the source after
            a successful transfer and "auto" is resolved to
            ``self.transferDefault``.
        overwrite : `bool`, optional
            If `False` (the default), raise `FileExistsError` when the
            destination already exists.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the requested transfer mode is not supported.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()

927 

    def walk(
        self, file_filter: Optional[Union[str, re.Pattern]] = None
    ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not directory-like.
        NotImplementedError
            Raised if the remote endpoint is not a webDAV server.
        """
        if not self.dirLike:
            raise ValueError("Can not walk a non-directory URI")

        # Walking directories is only available on WebDAV back ends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        # A 'Depth: 1' PROPFIND returns the properties of this directory and
        # of its immediate children. A "404 Not Found" response (the only
        # other status _propfind returns) makes this generator yield nothing.
        resp = self._propfind(depth="1")
        if resp.status_code == requests.codes.multi_status:  # 207
            files: List[str] = []
            dirs: List[str] = []

            for prop in _parse_propfind_response_body(resp.text):
                if prop.is_file:
                    files.append(prop.name)
                elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
                    # Only include the names of sub-directories not the name of
                    # the directory being walked.
                    dirs.append(prop.name)

            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]

            if not dirs and not files:
                return
            else:
                yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

            # Recurse into each sub-directory, depth first.
            for dir in dirs:
                new_uri = self.join(dir, forceDirectory=True)
                yield from new_uri.walk(file_filter)

979 

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the resource could not be downloaded.
        ValueError
            Raised if the size of the downloaded file does not match the
            Content-Length header sent by the server.
        """
        # Use the session as a context manager to ensure that connections
        # to both the front end and back end servers are closed after the
        # download operation is finished.
        with self.data_session as session:
            resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
            if resp.status_code != requests.codes.ok:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
                )

            tmpdir, buffering = _get_temp_dir()
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                # A missing Content-Length header is signalled by -1.
                expected_length = int(resp.headers.get("Content-Length", "-1"))
                with time_this(
                    log,
                    msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                    args=(self, expected_length, tmpFile.name, buffering),
                    mem_usage=self._config.collect_memory_usage,
                    mem_unit=u.mebibyte,
                ):
                    # Stream the body to disk in chunks, counting the bytes
                    # actually received.
                    content_length = 0
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
                        content_length += len(chunk)

            # Check that the expected and actual content lengths match. Perform
            # this check only when the contents of the file was not encoded by
            # the server.
            if "Content-Encoding" not in resp.headers:
                if expected_length >= 0 and expected_length != content_length:
                    raise ValueError(
                        f"Size of downloaded file does not match value in Content-Length header for {self}: "
                        f"expecting {expected_length} and got {content_length} bytes"
                    )

            return tmpFile.name, True

1029 

1030 def _send_webdav_request( 

1031 self, 

1032 method: str, 

1033 url: Optional[str] = None, 

1034 headers: dict[str, str] = {}, 

1035 body: Optional[str] = None, 

1036 session: Optional[requests.Session] = None, 

1037 timeout: Optional[tuple[float, float]] = None, 

1038 ) -> requests.Response: 

1039 """Send a webDAV request and correctly handle redirects. 

1040 

1041 Parameters 

1042 ---------- 

1043 method : `str` 

1044 The mthod of the HTTP request to be sent, e.g. PROPFIND, MKCOL. 

1045 headers : `dict`, optional 

1046 A dictionary of key-value pairs (both strings) to include as 

1047 headers in the request. 

1048 body: `str`, optional 

1049 The body of the request. 

1050 

1051 Notes 

1052 ----- 

1053 This way of sending webDAV requests is necessary for handling 

1054 redirection ourselves, since the 'requests' package changes the method 

1055 of the redirected request when the server responds with status 302 and 

1056 the method of the original request is not HEAD (which is the case for 

1057 webDAV requests). 

1058 

1059 That means that when the webDAV server we interact with responds with 

1060 a redirection to a PROPFIND or MKCOL request, the request gets 

1061 converted to a GET request when sent to the redirected location. 

1062 

1063 See `requests.sessions.SessionRedirectMixin.rebuild_method()` in 

1064 https://github.com/psf/requests/blob/main/requests/sessions.py 

1065 

1066 This behavior of the 'requests' package is meant to be compatible with 

1067 what is specified in RFC 9110: 

1068 

1069 https://www.rfc-editor.org/rfc/rfc9110#name-302-found 

1070 

1071 For our purposes, we do need to follow the redirection and send a new 

1072 request using the same HTTP verb. 

1073 """ 

1074 if url is None: 

1075 url = self.geturl() 

1076 

1077 if session is None: 

1078 session = self.metadata_session 

1079 

1080 if timeout is None: 

1081 timeout = self._config.timeout 

1082 

1083 with time_this( 

1084 log, 

1085 msg="%s %s", 

1086 args=( 

1087 method, 

1088 url, 

1089 ), 

1090 mem_usage=self._config.collect_memory_usage, 

1091 mem_unit=u.mebibyte, 

1092 ): 

1093 for _ in range(max_redirects := 5): 

1094 resp = session.request( 

1095 method, 

1096 url, 

1097 data=body, 

1098 headers=headers, 

1099 stream=False, 

1100 timeout=timeout, 

1101 allow_redirects=False, 

1102 ) 

1103 if resp.is_redirect: 

1104 url = resp.headers["Location"] 

1105 else: 

1106 return resp 

1107 

1108 # We reached the maximum allowed number of redirects. 

1109 # Stop trying. 

1110 raise ValueError( 

1111 f"Could not get a response to {method} request for {self} after " 

1112 f"{max_redirects} redirections" 

1113 ) 

1114 

1115 def _propfind(self, body: Optional[str] = None, depth: str = "0") -> requests.Response: 

1116 """Send a PROPFIND webDAV request and return the response. 

1117 

1118 Parameters 

1119 ---------- 

1120 body : `str`, optional 

1121 The body of the PROPFIND request to send to the server. If 

1122 provided, it is expected to be a XML document. 

1123 depth : `str`, optional 

1124 The value of the 'Depth' header to include in the request. 

1125 

1126 Returns 

1127 ------- 

1128 response : `requests.Response` 

1129 Response to the PROPFIND request. 

1130 

1131 Notes 

1132 ----- 

1133 It raises `ValueError` if the status code of the PROPFIND request 

1134 is different from "207 Multistatus" or "404 Not Found". 

1135 """ 

1136 if body is None: 

1137 # Request only the DAV live properties we are explicitly interested 

1138 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified' 

1139 # and 'displayname'. 

1140 body = ( 

1141 """<?xml version="1.0" encoding="utf-8" ?>""" 

1142 """<D:propfind xmlns:D="DAV:"><D:prop>""" 

1143 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>""" 

1144 """</D:prop></D:propfind>""" 

1145 ) 

1146 headers = { 

1147 "Depth": depth, 

1148 "Content-Type": 'application/xml; charset="utf-8"', 

1149 "Content-Length": str(len(body)), 

1150 } 

1151 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body) 

1152 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found): 

1153 return resp 

1154 else: 

1155 raise ValueError( 

1156 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} " 

1157 f"{resp.reason}" 

1158 ) 

1159 

1160 def _options(self) -> requests.Response: 

1161 """Send a OPTIONS webDAV request for this resource.""" 

1162 resp = self._send_webdav_request("OPTIONS") 

1163 if resp.status_code in (requests.codes.ok, requests.codes.created): 

1164 return resp 

1165 

1166 raise ValueError( 

1167 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} " f"{resp.reason}" 

1168 ) 

1169 

1170 def _head(self) -> requests.Response: 

1171 """Send a HEAD webDAV request for this resource.""" 

1172 

1173 return self._send_webdav_request("HEAD") 

1174 

1175 def _mkcol(self) -> None: 

1176 """Send a MKCOL webDAV request to create a collection. The collection 

1177 may already exist. 

1178 """ 

1179 resp = self._send_webdav_request("MKCOL") 

1180 if resp.status_code == requests.codes.created: # 201 

1181 return 

1182 

1183 if resp.status_code == requests.codes.method_not_allowed: # 405 

1184 # The remote directory already exists 

1185 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl()) 

1186 else: 

1187 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}") 

1188 

1189 def _delete(self) -> None: 

1190 """Send a DELETE webDAV request for this resource.""" 

1191 

1192 log.debug("Deleting %s ...", self.geturl()) 

1193 

1194 # If this is a directory, ensure the remote is a webDAV server because 

1195 # plain HTTP servers don't support DELETE requests on non-file 

1196 # paths. 

1197 if self.dirLike and not self.is_webdav_endpoint: 

1198 raise NotImplementedError( 

1199 f"Deletion of directory {self} is not implemented by plain HTTP servers" 

1200 ) 

1201 

1202 # Deleting non-empty directories may take some time, so increase 

1203 # the timeout for getting a response from the server. 

1204 timeout = self._config.timeout 

1205 if self.dirLike: 

1206 timeout = (timeout[0], timeout[1] * 100) 

1207 resp = self._send_webdav_request("DELETE", timeout=timeout) 

1208 if resp.status_code in ( 

1209 requests.codes.ok, 

1210 requests.codes.accepted, 

1211 requests.codes.no_content, 

1212 requests.codes.not_found, 

1213 ): 

1214 # We can get a "404 Not Found" error when the file or directory 

1215 # does not exist or when the DELETE request was retried several 

1216 # times and a previous attempt actually deleted the resource. 

1217 # Therefore we consider that a "Not Found" response is not an 

1218 # error since we reached the state desired by the user. 

1219 return 

1220 else: 

1221 # TODO: the response to a DELETE request against a webDAV server 

1222 # may be multistatus. If so, we need to parse the reponse body to 

1223 # determine more precisely the reason of the failure (e.g. a lock) 

1224 # and provide a more helpful error message. 

1225 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}") 

1226 

1227 def _copy_via_local(self, src: ResourcePath) -> None: 

1228 """Replace the contents of this resource with the contents of a remote 

1229 resource by using a local temporary file. 

1230 

1231 Parameters 

1232 ---------- 

1233 src : `HttpResourcePath` 

1234 The source of the contents to copy to `self`. 

1235 """ 

1236 with src.as_local() as local_uri: 

1237 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri) 

1238 with open(local_uri.ospath, "rb") as f: 

1239 self._put(data=f) 

1240 

1241 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None: 

1242 """Send a COPY or MOVE webDAV request to copy or replace the contents 

1243 of this resource with the contents of another resource located in the 

1244 same server. 

1245 

1246 Parameters 

1247 ---------- 

1248 method : `str` 

1249 The method to perform. Valid values are "COPY" or "MOVE" (in 

1250 uppercase). 

1251 

1252 src : `HttpResourcePath` 

1253 The source of the contents to move to `self`. 

1254 """ 

1255 headers = {"Destination": self.geturl()} 

1256 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session) 

1257 if resp.status_code in (requests.codes.created, requests.codes.no_content): 

1258 return 

1259 

1260 if resp.status_code == requests.codes.multi_status: 

1261 tree = eTree.fromstring(resp.content) 

1262 status_element = tree.find("./{DAV:}response/{DAV:}status") 

1263 status = status_element.text if status_element is not None else "unknown" 

1264 error = tree.find("./{DAV:}response/{DAV:}error") 

1265 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}") 

1266 else: 

1267 raise ValueError( 

1268 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}" 

1269 ) 

1270 

    def _copy(self, src: HttpResourcePath) -> None:
        """Send a COPY webDAV request to replace the contents of this resource
        (if any) with the contents of another resource located in the same
        server.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to copy to `self`.
        """
        # Neither dCache nor XrootD currently implement the COPY
        # webDAV method as documented in
        # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
        # (See issues DM-37603 and DM-37651 for details)
        #
        # For the time being, we use a temporary local file to
        # perform the copy client side.
        # TODO: when those 2 issues above are solved remove the 3 lines below.
        must_use_local = True
        if must_use_local:
            return self._copy_via_local(src)

        # Unreachable until the server-side COPY issues above are resolved.
        return self._copy_or_move("COPY", src)

1294 

1295 def _move(self, src: HttpResourcePath) -> None: 

1296 """Send a MOVE webDAV request to replace the contents of this resource 

1297 with the contents of another resource located in the same server. 

1298 

1299 Parameters 

1300 ---------- 

1301 src : `HttpResourcePath` 

1302 The source of the contents to move to `self`. 

1303 """ 

1304 return self._copy_or_move("MOVE", src) 

1305 

    def _put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the server does not accept the uploaded data.
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                # Follow at most one redirection to the back end server.
                if resp.is_redirect:
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithhms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: Optional[dict[str, str]] = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")

1391 

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: Optional[str] = None,
    ) -> Iterator[ResourceHandleProtocol]:
        """Yield a handle for accessing this resource, using HTTP range
        requests for reads when the server supports them.

        Parameters
        ----------
        mode : `str`, optional
            Mode to open the handle with. Only "r" and "rb" can use the
            range-request fast path; other modes fall back to the base
            implementation.
        encoding : `str`, optional
            Text encoding used to wrap the handle when ``mode`` is "r".
        """
        resp = self._head()
        # Only servers advertising byte-range support ("Accept-Ranges:
        # bytes") can serve partial reads directly over HTTP.
        accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
        handle: ResourceHandleProtocol
        if mode in ("rb", "r") and accepts_range:
            handle = HttpReadResourceHandle(
                mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
            )
            if mode == "r":
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
            else:
                yield handle
        else:
            # Fall back to the base implementation when range requests are
            # unavailable or the mode is not read-only.
            with super()._openImpl(mode, encoding=encoding) as http_handle:
                yield http_handle

1415 

1416 

def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    req = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", req.method)
    log.debug(" URL=%s", req.url)
    log.debug(" headers=%s", req.headers)
    # PUT bodies may be large binary payloads, so never log them.
    if req.method == "PUT":
        log.debug(" body=<data>")
    elif req.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", req.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        # Plain-text bodies are logged in full; anything else is truncated.
        log.debug(" body=%r", resp.content)
    else:
        log.debug(" body=%r", resp.content[:80])

1450 

1451 

def _is_protected(filepath: str) -> bool:
    """Return true if the permissions of file at filepath only allow for access
    by its owner.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.
    """
    if not os.path.isfile(filepath):
        return False
    permissions = stat.S_IMODE(os.stat(filepath).st_mode)
    # The owner must have some access bits set...
    if not permissions & stat.S_IRWXU:
        return False
    # ...while group and others must have none.
    return not permissions & (stat.S_IRWXG | stat.S_IRWXO)

1468 

1469 

def _parse_propfind_response_body(body: str) -> List[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'response' element found in the body.

    Raises
    ------
    ValueError
        Raised if `body` contains no 'response' element.
    """
    # The body is a DAV 'multistatus' document: a sequence of 'response'
    # elements, each holding an 'href' and one or more 'propstat' elements
    # with the properties of a single resource, e.g. (indented for
    # readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #   <D:response>
    #     <D:href>path/to/resource</D:href>
    #     <D:propstat>
    #       <D:prop>
    #         <D:resourcetype><D:collection xmlns:D="DAV:"/></D:resourcetype>
    #         <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
    #         <D:getcontentlength>12345</D:getcontentlength>
    #       </D:prop>
    #       <D:status>HTTP/1.1 200 OK</D:status>
    #     </D:propstat>
    #   </D:response>
    #   <D:response>...</D:response>
    # </D:multistatus>

    # Scan all the 'response' elements and extract the relevant properties.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]

    if not responses:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")

    return responses

1531 

1532 

class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.
    """

    # Matches the 'status' line of a PROPFIND response's 'propstat' element
    # when the server reports success for the enclosed properties.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: Optional[eTree.Element]):
        # Defaults describe an unknown resource: empty href and name, not a
        # collection, unknown modification time, size of -1.
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    def _parse(self, response: eTree.Element) -> None:
        """Populate the attributes of this object from a single DAV
        'response' element."""
        href = response.find("./{DAV:}href")
        if href is None:
            raise ValueError(
                f"Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )
        # str() is needed because Element.text is Optional (keeps mypy happy).
        self._href = str(href.text).strip()

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties whose group has an OK status.
            status = propstat.find("./{DAV:}status")
            if status is None or not self._status_ok_rex.match(str(status.text)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # A 'resourcetype' containing 'collection' marks a directory.
                if prop.find("./{DAV:}resourcetype/{DAV:}collection") is not None:
                    self._collection = True

                if (lastmod := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = str(lastmod.text)

                if (length := prop.find("./{DAV:}getcontentlength")) is not None:
                    self._getcontentlength = int(str(length.text))

                if (display := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = str(display.text)

        # Some webDAV servers don't include the 'displayname' property in the
        # response, so fall back to the last path component of 'href' (which,
        # depending on the server, may end with '/').
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Collections are always reported with a size of 0.
        if self._collection:
            self._getcontentlength = 0

    @property
    def exists(self) -> bool:
        # A collection, or a file whose length is known (at least zero).
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        # True when the server reported this resource as a collection.
        return self._collection

    @property
    def is_file(self) -> bool:
        # Anything that is not a collection is considered a file.
        return not self._collection

    @property
    def size(self) -> int:
        # Size in bytes; -1 when unknown, 0 for collections.
        return self._getcontentlength

    @property
    def name(self) -> str:
        # Display name, possibly inferred from 'href'.
        return self._displayname

    @property
    def href(self) -> str:
        # The 'href' value exactly as sent by the server (stripped).
        return self._href