Coverage for python/lsst/resources/http.py: 15%

428 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-03-09 03:06 -0800

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("HttpResourcePath",) 

15 

16import contextlib 

17import functools 

18import io 

19import logging 

20import os 

21import os.path 

22import random 

23import re 

24import stat 

25import tempfile 

26import xml.etree.ElementTree as eTree 

27from typing import TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast 

28 

29import requests 

30from astropy import units as u 

31from lsst.utils.timer import time_this 

32from requests.adapters import HTTPAdapter 

33from requests.auth import AuthBase 

34from urllib3.util.retry import Retry 

35 

36from ._resourceHandles import ResourceHandleProtocol 

37from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle 

38from ._resourcePath import ResourcePath 

39 

40if TYPE_CHECKING: 40 ↛ 41line 40 didn't jump to line 41, because the condition on line 40 was never true

41 from .utils import TransactionProtocol 

42 

# Module-level logger shared by every helper and class in this file.
log = logging.getLogger(__name__)


# Default timeouts for all HTTP requests, in seconds.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300

# Allow for network timeouts to be set in the environment. The result is a
# (connect, read) pair of integers, in seconds, which is passed as the
# ``timeout`` argument of the requests issued by this module. A value in the
# environment which is not an integer makes ``int()`` raise `ValueError` at
# import time.
TIMEOUT = (
    int(os.environ.get("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)),
    int(os.environ.get("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)),
)

# Should we send a "Expect: 100-continue" header on PUT requests?
# The "Expect: 100-continue" header is used by some servers (e.g. dCache)
# as an indication that the client knows how to handle redirects to
# the specific server that will actually receive the data for PUT
# requests.
# Only the presence of the variable matters: its value is not inspected.
_SEND_EXPECT_HEADER_ON_PUT = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ

62 

63 

@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        Re-raised, after logging a hint about ``LSST_HTTP_CACERT_BUNDLE``,
        if the certificate of the server can not be verified.

    Notes
    -----
    Results are memoized by `functools.lru_cache`, keyed on ``path``.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Use the CA bundle given in the environment, if any, otherwise fall
    # back to the default verification.
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True

    try:
        # Always set a timeout: without one this request could block
        # forever if the server never answers.
        resp = requests.options(str(path), verify=verify, stream=True, timeout=TIMEOUT)
    except requests.exceptions.SSLError:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify a bundle of certificate authorities you trust which are "
            "not included in the default set of trusted authorities of your "
            "system."
        )
        raise

    try:
        # Check that "1" is part of the value of the "DAV" header. We don't
        # use locks, so a server complying to class 1 is enough for our
        # purposes. All webDAV servers must advertise at least compliance
        # class "1".
        #
        # Compliance classes are documented in
        #    http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #   DAV: 1, 2
        #   DAV: 1, <http://apache.org/dav/propset/fs/1>
        if "DAV" not in resp.headers:
            return False

        # Convert to str to keep mypy happy
        compliance_class = str(resp.headers.get("DAV"))
        return "1" in compliance_class.replace(" ", "").split(",")
    finally:
        # The request was streamed and we only need the headers, so close
        # the response explicitly to release its connection.
        resp.close()

111 

112 

# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system.
_TMPDIR: Optional[Tuple[str, int]] = None


def _get_temp_dir() -> Tuple[str, int]:
    """Return the temporary directory path and block size.

    This function caches its results in _TMPDIR.

    Returns
    -------
    path : `str`
        Directory in which to create temporary files.
    block_size : `int`
        Size, in bytes, to use for buffering when reading from or writing
        to temporary files in that directory.
    """
    global _TMPDIR
    if _TMPDIR is not None:
        return _TMPDIR

    # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
    # 'TMPDIR', if defined. Otherwise use current working directory.
    # The first variable which names an existing directory wins.
    tmpdir = os.getcwd()
    for candidate in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
        if candidate and os.path.isdir(candidate):
            tmpdir = candidate
            break

    # Compute the block size as 256 blocks of typical size
    # (i.e. 4096 bytes) or 10 times the file system block size,
    # whichever is higher. This is a reasonable compromise between
    # using memory for buffering and the number of system calls
    # issued to read from or write to temporary files.
    fsstats = os.statvfs(tmpdir)
    _TMPDIR = (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))
    return _TMPDIR

142 

143 

class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file which
        `_is_protected` reports as not protected.
    """

    def __init__(self, token: str):
        # Value of the token to send, and path of the file it was read
        # from (only set when ``token`` names an existing file).
        self._token: Optional[str] = None
        self._path: Optional[str] = None
        # Modification time of the token file when we last read it; the
        # initial negative value forces the first read.
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        if (mtime := os.stat(self._path).st_mtime) > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            self._mtime = mtime
            with open(self._path) as f:
                # Only trailing newlines are stripped from the file contents.
                self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to ``req``, if we hold a token.

        The token file, if any, is re-read first when it has been modified
        since the last read. The request is returned unmodified when there
        is no token.
        """
        if self._token:
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req

187 

188 

class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # The key of the dictionary is a root URI and the value is the
        # session
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ResourcePath, persist: bool = True) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            if `True`, make the network connection with the front end server
            of the endpoint persistent. The session is configured with the
            intent of not persisting connections to the back end servers,
            whatever the value of this argument. This argument is only
            used when the session for the endpoint is first created.

        Returns
        -------
        session : `requests.Session`
            The session cached for the root URI of ``rpath``.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one
            self._sessions[root_uri] = self._make_session(rpath, persist)
        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath, persist: bool) -> requests.Session:
        """Make a new session configured from values from the environment.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the endpoint the session is created for.
        persist : `bool`
            Whether to persist a connection to the front end server.

        Returns
        -------
        session : `requests.Session`
            The newly configured session.

        Raises
        ------
        PermissionError
            Raised if the bearer token file or the client private key file
            is not protected.
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is set.
        """
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s (persist connection=%s)...", root_uri, persist)

        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=3,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds)
            backoff_factor=5.0 + random.random(),
            # How many times to retry on bad status codes
            status=3,
            # HTTP status codes that we should force a retry on
            status_forcelist=[
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ],
        )

        # Persist a single connection to the front end server, if required
        num_connections = 1 if persist else 0
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=1, pool_maxsize=num_connections, pool_block=False, max_retries=retries
            ),
        )

        # Prevent persisting connections to back-end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(pool_connections=1, pool_maxsize=0, pool_block=False, max_retries=retries),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session

344 

345 

346class HttpResourcePath(ResourcePath): 

347 """General HTTP(S) resource. 

348 

349 Notes 

350 ----- 

351 In order to configure the behavior of the object, one environment variable 

352 is inspected: 

353 

354 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a 

355 "Expect: 100-Continue" header will be added to all HTTP PUT requests. 

356 This header is required by some servers to detect if the client 

357 knows how to handle redirections. In case of redirection, the body 

358 of the PUT request is sent to the redirected location and not to 

359 the front end server. 

360 """ 

361 

362 _is_webdav: Optional[bool] = None 

363 _sessions_store = SessionStore() 

364 _put_sessions_store = SessionStore() 

365 

366 # Use a session exclusively for PUT requests and another session for 

367 # all other requests. PUT requests may be redirected and in that case 

368 # the server may close the persisted connection. If that is the case 

369 # only the connection persisted for PUT requests will be closed and 

370 # the other persisted connection will be kept alive and reused for 

371 # other requests. 

372 

@property
def session(self) -> requests.Session:
    """Client session to address remote resource for all HTTP methods but
    PUT.

    The session is fetched from the class-level ``_sessions_store`` on
    first access and then remembered on this instance.
    """
    if hasattr(self, "_session"):
        return self._session

    self._session: requests.Session = self._sessions_store.get(self)
    return self._session

383 

@property
def put_session(self) -> requests.Session:
    """Client session for uploading data to the remote resource.

    The session is fetched from the class-level ``_put_sessions_store`` on
    first access and then remembered on this instance.
    """
    if hasattr(self, "_put_session"):
        return self._put_session

    self._put_session: requests.Session = self._put_sessions_store.get(self)
    return self._put_session

392 

@property
def is_webdav_endpoint(self) -> bool:
    """Check if the current endpoint implements WebDAV features.

    This is stored per URI but cached by root so there is
    only one check per hostname.
    """
    if self._is_webdav is not None:
        return self._is_webdav

    # The check is made against the root URI, not against this path.
    self._is_webdav = _is_webdav_endpoint(self.root_uri())
    return self._is_webdav

405 

def exists(self) -> bool:
    """Check that a remote HTTP resource exists.

    Returns
    -------
    exists : `bool`
        For a plain HTTP server, `True` only if a HEAD request is answered
        with status 200. For a webDAV server, the ``exists`` flag of the
        first entry of the PROPFIND response, or `False` if the response
        is not a multi-status one.
    """
    log.debug("Checking if resource exists: %s", self.geturl())
    if not self.is_webdav_endpoint:
        # The remote is a plain HTTP server. Let's attempt a HEAD
        # request, even if the behavior for such a request against a
        # directory is not specified, so it depends on the server
        # implementation.
        resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True, stream=True)
        return resp.status_code == requests.codes.ok  # 200

    # The remote endpoint is a webDAV server: send a PROPFIND request
    # to determine if it exists.
    resp = self._propfind()
    if resp.status_code == requests.codes.multi_status:  # 207
        prop = _parse_propfind_response_body(resp.text)[0]
        return prop.exists
    else:  # 404 Not Found
        return False

425 

def size(self) -> int:
    """Return the size of the remote resource in bytes.

    Returns
    -------
    size : `int`
        Size of the resource. Always 0 for directory-like paths, for which
        no request is sent.

    Raises
    ------
    FileNotFoundError
        Raised if the server reports that the resource does not exist.
    IsADirectoryError
        Raised if a webDAV server reports a directory at this file path.
    ValueError
        Raised if a plain HTTP server answers the HEAD request with an
        unexpected status or without a 'Content-Length' header.
    """
    if self.dirLike:
        return 0

    if not self.is_webdav_endpoint:
        # The remote is a plain HTTP server. Send a HEAD request to
        # retrieve the size of the resource.
        resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True, stream=True)
        if resp.status_code == requests.codes.ok:  # 200
            if "Content-Length" in resp.headers:
                return int(resp.headers["Content-Length"])
            else:
                raise ValueError(
                    f"Response to HEAD request to {self} does not contain 'Content-Length' header"
                )
        elif resp.status_code == requests.codes.not_found:
            raise FileNotFoundError(
                f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
            )
        else:
            raise ValueError(
                f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
                f"{resp.reason}"
            )

    # The remote is a webDAV server: send a PROPFIND request to retrieve
    # the size of the resource. Sizes are only meaningful for files.
    resp = self._propfind()
    if resp.status_code == requests.codes.multi_status:  # 207
        prop = _parse_propfind_response_body(resp.text)[0]
        if prop.is_file:
            return prop.size
        elif prop.is_directory:
            raise IsADirectoryError(
                f"Resource {self} is reported by server as a directory but has a file path"
            )
        else:
            raise FileNotFoundError(f"Resource {self} does not exist")
    else:  # 404 Not Found
        raise FileNotFoundError(
            f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
        )

469 

def mkdir(self) -> None:
    """Create the directory resource if it does not already exist.

    Missing ancestors are created first, by calling ``mkdir`` on the
    parent.

    Raises
    ------
    NotImplementedError
        Raised if the remote endpoint is not a webDAV server.
    NotADirectoryError
        Raised if this path is file-like or if a file already exists at
        this path.
    """
    # Creating directories is only available on WebDAV backends.
    if not self.is_webdav_endpoint:
        raise NotImplementedError(
            f"Creation of directory {self} is not implemented by plain HTTP servers"
        )

    if not self.dirLike:
        raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

    # Check if the target directory already exists.
    resp = self._propfind()
    if resp.status_code == requests.codes.multi_status:  # 207
        prop = _parse_propfind_response_body(resp.text)[0]
        if prop.exists:
            if prop.is_directory:
                return
            else:
                # A file exists at this path
                raise NotADirectoryError(
                    f"Can not create a directory for {self} because a file already exists at that path"
                )

    # Target directory does not exist. Create it and its ancestors as
    # needed. We need to test if parent URL is different from self URL,
    # otherwise we could be stuck in a recursive loop
    # where self == parent.
    if self.geturl() != self.parent().geturl():
        self.parent().mkdir()

    log.debug("Creating new directory: %s", self.geturl())
    self._mkcol()

503 

def remove(self) -> None:
    """Remove the resource.

    This delegates to ``_delete`` and propagates whatever it raises.
    """
    self._delete()

507 

def read(self, size: int = -1) -> bytes:
    """Open the resource and return the contents in bytes.

    Parameters
    ----------
    size : `int`, optional
        The number of bytes to read. Negative, zero or omitted indicates
        that all data should be read.

    Returns
    -------
    data : `bytes`
        The whole body of the response if ``size`` is not positive,
        otherwise the first chunk of the streamed body, requested with a
        chunk size of ``size``. Empty if the body is empty.

    Raises
    ------
    FileNotFoundError
        Raised if the status of the response is not 200.
    """
    log.debug("Reading from remote resource: %s", self.geturl())

    # Only stream the body when the caller asked for a bounded read.
    stream = size > 0
    with time_this(log, msg="GET %s", args=(self,)):
        resp = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)

    try:
        if resp.status_code != requests.codes.ok:  # 200
            raise FileNotFoundError(
                f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
            )
        if not stream:
            return resp.content

        # An empty body produces no chunk at all: return empty bytes
        # instead of letting StopIteration escape to the caller.
        return next(resp.iter_content(chunk_size=size), b"")
    finally:
        if stream:
            # We may not have consumed the whole body, so the connection
            # must be released explicitly.
            resp.close()

530 

def write(self, data: bytes, overwrite: bool = True) -> None:
    """Write the supplied bytes to the new resource.

    Parameters
    ----------
    data : `bytes`
        The bytes to write to the resource. The entire contents of the
        resource will be replaced.
    overwrite : `bool`, optional
        If `True` the resource will be overwritten if it exists. Otherwise
        the write will fail.

    Raises
    ------
    FileExistsError
        Raised if the resource exists and ``overwrite`` is `False`.
    """
    log.debug("Writing to remote resource: %s", self.geturl())
    # The existence check costs a request, so only do it when needed.
    if not overwrite and self.exists():
        raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")

    # Ensure the parent directory exists.
    self.parent().mkdir()

    # Upload the data.
    log.debug("Writing data to remote resource: %s", self.geturl())
    self._put(data=data)

554 

def transfer_from(
    self,
    src: ResourcePath,
    transfer: str = "copy",
    overwrite: bool = False,
    transaction: Optional[TransactionProtocol] = None,
) -> None:
    """Transfer the contents of a source resource to this resource.

    Parameters
    ----------
    src : `ResourcePath`
        Source URI.
    transfer : `str`
        Mode to use for transferring the resource. It must be one of
        ``self.transferModes``; "auto" is replaced by
        ``self.transferDefault`` and "move" removes the source after the
        transfer.
    overwrite : `bool`, optional
        If `False` (the default) the transfer fails if this resource
        already exists.
    transaction : `~lsst.resources.utils.TransactionProtocol`, optional
        Currently unused.

    Raises
    ------
    ValueError
        Raised if ``transfer`` is not a supported transfer mode.
    FileExistsError
        Raised if this resource exists and ``overwrite`` is `False`.
    """
    # Fail early to prevent delays if remote resources are requested.
    if transfer not in self.transferModes:
        raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

    # Existence checks cost time so do not call this unless we know
    # that debugging is enabled.
    if log.isEnabledFor(logging.DEBUG):
        log.debug(
            "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
            src,
            src.exists(),
            self,
            self.exists(),
            transfer,
        )

    # Short circuit immediately if the URIs are identical.
    if self == src:
        log.debug(
            "Target and destination URIs are identical: %s, returning immediately."
            " No further action required.",
            self,
        )
        return

    if not overwrite and self.exists():
        raise FileExistsError(f"Destination path {self} already exists.")

    if transfer == "auto":
        transfer = self.transferDefault

    # We can use webDAV 'COPY' or 'MOVE' if both the current and source
    # resources are located in the same server.
    if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
        log.debug("Transfer from %s to %s directly", src, self)
        return self._move(src) if transfer == "move" else self._copy(src)

    # For resources of different classes or for plain HTTP resources we can
    # perform the copy or move operation by downloading to a local file
    # and uploading to the destination.
    self._copy_via_local(src)

    # This was an explicit move, try to remove the source.
    if transfer == "move":
        src.remove()

619 

def walk(
    self, file_filter: Optional[Union[str, re.Pattern]] = None
) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
    """Walk the directory tree returning matching files and directories.

    Parameters
    ----------
    file_filter : `str` or `re.Pattern`, optional
        Regex to filter out files from the list before it is returned.

    Yields
    ------
    dirpath : `ResourcePath`
        Current directory being examined.
    dirnames : `list` of `str`
        Names of subdirectories within dirpath.
    filenames : `list` of `str`
        Names of all the files within dirpath.

    Raises
    ------
    ValueError
        Raised if this path is not directory-like.
    NotImplementedError
        Raised if the remote endpoint is not a webDAV server.
    """
    if not self.dirLike:
        raise ValueError("Can not walk a non-directory URI")

    # Walking directories is only available on WebDAV backends.
    if not self.is_webdav_endpoint:
        raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

    if isinstance(file_filter, str):
        file_filter = re.compile(file_filter)

    resp = self._propfind(depth="1")
    if resp.status_code != requests.codes.multi_status:  # 207
        # Nothing to walk: the generator yields nothing at all.
        return

    files: List[str] = []
    dirs: List[str] = []

    for prop in _parse_propfind_response_body(resp.text):
        if prop.is_file:
            files.append(prop.name)
        elif not self.path.endswith(prop.href):
            # Only include the names of sub-directories not the
            # directory being walked.
            dirs.append(prop.name)

    if file_filter is not None:
        files = [f for f in files if file_filter.search(f)]

    # The filter applies to files only, so an entry with no matching file
    # but with sub-directories is still reported.
    if not dirs and not files:
        return

    yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

    for dirname in dirs:
        new_uri = self.join(dirname, forceDirectory=True)
        yield from new_uri.walk(file_filter)

671 

def _as_local(self) -> Tuple[str, bool]:
    """Download object over HTTP and place in temporary directory.

    Returns
    -------
    path : `str`
        Path to local temporary file.
    temporary : `bool`
        Always returns `True`. This is always a temporary file.

    Raises
    ------
    FileNotFoundError
        Raised if the status of the response to the GET request is not
        200.
    """
    resp = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
    if resp.status_code != requests.codes.ok:
        raise FileNotFoundError(
            f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
        )

    # The same size is used for buffering the local file and for the
    # chunks read from the network.
    tmpdir, buffering = _get_temp_dir()
    # delete=False: the file must outlive this function.
    with tempfile.NamedTemporaryFile(
        suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
    ) as tmpFile:
        with time_this(
            log,
            msg="GET %s [length=%s] to local file %s [chunk_size=%d]",
            args=(self, resp.headers.get("Content-Length"), tmpFile.name, buffering),
            mem_usage=True,
            mem_unit=u.mebibyte,
        ):
            for chunk in resp.iter_content(chunk_size=buffering):
                tmpFile.write(chunk)

    return tmpFile.name, True

703 

def _send_webdav_request(
    self,
    method: str,
    url: Optional[str] = None,
    headers: Optional[dict[str, str]] = None,
    body: Optional[str] = None,
) -> requests.Response:
    """Send a webDAV request and correctly handle redirects.

    Parameters
    ----------
    method : `str`
        The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
    url : `str`, optional
        The URL to send the request to. Defaults to the URL of this
        resource.
    headers : `dict`, optional
        A dictionary of key-value pairs (both strings) to include as
        headers in the request. Defaults to no extra headers.
    body: `str`, optional
        The body of the request.

    Returns
    -------
    response : `requests.Response`
        The first response which is not a redirection.

    Raises
    ------
    ValueError
        Raised if the maximum number of redirections is reached.

    Notes
    -----
    This way of sending webDAV requests is necessary for handling
    redirection ourselves, since the 'requests' package changes the method
    of the redirected request when the server responds with status 302 and
    the method of the original request is not HEAD (which is the case for
    webDAV requests).

    That means that when the webDAV server we interact with responds with
    a redirection to a PROPFIND or MKCOL request, the request gets
    converted to a GET request when sent to the redirected location.

    See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

    This behavior of the 'requests' package is meant to be compatible with
    what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

    For our purposes, we do need to follow the redirection and send a new
    request using the same HTTP verb.
    """
    if url is None:
        url = self.geturl()

    # Build a fresh dictionary per call rather than sharing one mutable
    # default object between all the calls.
    if headers is None:
        headers = {}

    max_redirects = 5
    with time_this(
        log,
        msg="%s %s",
        args=(
            method,
            url,
        ),
        mem_usage=True,
        mem_unit=u.mebibyte,
    ):
        for _ in range(max_redirects):
            resp = self.session.request(
                method,
                url,
                data=body,
                headers=headers,
                stream=True,
                timeout=TIMEOUT,
                allow_redirects=False,
            )
            if resp.is_redirect:
                # Retry the same method against the redirected location.
                url = resp.headers["Location"]
            else:
                return resp

    # We reached the maximum allowed number of redirects. Stop trying.
    raise ValueError(
        f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
    )

774 

def _propfind(self, body: Optional[str] = None, depth: str = "0") -> requests.Response:
    """Send a PROPFIND webDAV request and return the response.

    Parameters
    ----------
    body : `str`, optional
        The body of the PROPFIND request to send to the server. If
        provided, it is expected to be a XML document.
    depth : `str`, optional
        The value of the 'Depth' header to include in the request.

    Returns
    -------
    response : `requests.Response`
        Response to the PROPFIND request.

    Notes
    -----
    It raises `ValueError` if the status code of the PROPFIND request
    is different from "207 Multistatus" or "404 Not Found".
    """
    if body is None:
        # Request only the DAV live properties we are explicitly interested
        # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
        # and 'displayname'.
        body = (
            """<?xml version="1.0" encoding="utf-8" ?>"""
            """<D:propfind xmlns:D="DAV:"><D:prop>"""
            """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
            """</D:prop></D:propfind>"""
        )
    headers = {
        "Depth": depth,
        "Content-Type": 'application/xml; charset="utf-8"',
        "Content-Length": str(len(body)),
    }
    resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
    if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
        return resp
    else:
        raise ValueError(
            f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
            f"{resp.reason}"
        )

819 

def _options(self) -> requests.Response:
    """Send a OPTIONS webDAV request for this resource.

    Returns
    -------
    response : `requests.Response`
        The response, returned without inspecting its status.
    """
    return self._send_webdav_request("OPTIONS")

824 

def _head(self) -> requests.Response:
    """Send a HEAD webDAV request for this resource.

    Returns
    -------
    response : `requests.Response`
        The response, returned without inspecting its status.
    """
    return self._send_webdav_request("HEAD")

829 

def _mkcol(self) -> None:
    """Send a MKCOL webDAV request to create a collection. The collection
    may already exist.

    Raises
    ------
    ValueError
        Raised if the status of the response is neither "201 Created" nor
        "405 Method Not Allowed".
    """
    resp = self._send_webdav_request("MKCOL")
    if resp.status_code == requests.codes.created:  # 201
        return

    if resp.status_code == requests.codes.method_not_allowed:  # 405
        # The remote directory already exists
        log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
    else:
        raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")

843 

def _delete(self) -> None:
    """Send a DELETE webDAV request for this resource.

    Raises
    ------
    NotImplementedError
        Raised if this path is directory-like and the remote endpoint is
        not a webDAV server.
    FileNotFoundError
        Raised if the server responds with "404 Not Found".
    ValueError
        Raised for any other status which is not 200, 202 or 204.
    """
    log.debug("Deleting %s ...", self.geturl())

    # If this is a directory, ensure the remote is a webDAV server because
    # plain HTTP servers don't support DELETE requests on non-file
    # paths.
    if self.dirLike and not self.is_webdav_endpoint:
        raise NotImplementedError(
            f"Deletion of directory {self} is not implemented by plain HTTP servers"
        )

    resp = self._send_webdav_request("DELETE")
    if resp.status_code in (requests.codes.ok, requests.codes.accepted, requests.codes.no_content):
        return
    elif resp.status_code == requests.codes.not_found:
        raise FileNotFoundError(
            f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
        )
    else:
        # TODO: the response to a DELETE request against a webDAV server
        # may be multistatus. If so, we need to parse the response body to
        # determine more precisely the reason of the failure (e.g. a lock)
        # and provide a more helpful error message.
        raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")

870 

def _copy_via_local(self, src: ResourcePath) -> None:
    """Overwrite this resource with the contents of ``src``, staging the
    data in a local file.

    Parameters
    ----------
    src : `ResourcePath`
        The resource whose contents are uploaded to `self`.
    """
    with src.as_local() as staged_uri:
        log.debug("Transfer from %s to %s via local file %s", src, self, staged_uri)
        # Stream the staged file as the body of the PUT request.
        with open(staged_uri.ospath, "rb") as staged_file:
            self._put(data=staged_file)

884 

def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
    """Send a COPY or MOVE webDAV request to copy or replace the contents
    of this resource with the contents of another resource located in the
    same server.

    Parameters
    ----------
    method : `str`
        The method to perform. Valid values are "COPY" or "MOVE" (in
        uppercase).

    src : `HttpResourcePath`
        The source of the contents to move to `self`.

    Raises
    ------
    ValueError
        Raised if the server answers with a status other than
        201 (created) or 204 (no content). For a 207 (multi-status)
        answer, the message carries the first status and the serialized
        first ``DAV:error`` element found in the response body.
    """
    headers = {"Destination": self.geturl()}
    resp = self._send_webdav_request(method, url=src.geturl(), headers=headers)
    if resp.status_code in (requests.codes.created, requests.codes.no_content):
        return

    if resp.status_code != requests.codes.multi_status:
        raise ValueError(
            f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
        )

    # The details of the failure are in the multi-status body.
    tree = eTree.fromstring(resp.content)

    status = "unknown"
    status_element = tree.find("./{DAV:}response/{DAV:}status")
    if status_element is not None and status_element.text:
        status = status_element.text.strip()

    # Serialize the error element: interpolating the Element object itself
    # would only show its repr ("<Element ... at 0x...>"), which tells the
    # user nothing about the cause of the failure.
    error = "unknown"
    error_element = tree.find("./{DAV:}response/{DAV:}error")
    if error_element is not None:
        error = eTree.tostring(error_element, encoding="unicode").strip()

    raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")

914 

def _copy(self, src: HttpResourcePath) -> None:
    """Replace the contents of this resource (if any) with the contents of
    another resource located in the same server.

    Parameters
    ----------
    src : `HttpResourcePath`
        The source of the contents to copy to `self`.

    Notes
    -----
    The copy is currently always performed client side, through a
    temporary local file, and no COPY request is sent.
    """
    # Neither dCache nor XrootD currently implement the COPY webDAV method
    # as documented in
    #     http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
    # (see issues DM-37603 and DM-37651 for details), so the server-side
    # path below is disabled for the time being.
    # TODO: when those 2 issues are solved, remove the flag and the
    # client-side fallback so that the COPY request is always sent.
    server_side_copy_usable = False
    if server_side_copy_usable:
        return self._copy_or_move("COPY", src)

    return self._copy_via_local(src)

938 

def _move(self, src: HttpResourcePath) -> None:
    """Replace the contents of this resource with the contents of another
    resource located in the same server, using a MOVE webDAV request.

    Parameters
    ----------
    src : `HttpResourcePath`
        The source of the contents to move to `self`.
    """
    webdav_method = "MOVE"
    return self._copy_or_move(webdav_method, src)

949 

def _put(self, data: Union[BinaryIO, bytes]) -> None:
    """Upload data to this resource with an HTTP PUT request, following at
    most one server redirection.

    Parameters
    ----------
    data : `Union[BinaryIO, bytes]`
        The data to be included in the body of the PUT request.

    Raises
    ------
    ValueError
        Raised if the upload request is answered with a status other than
        200 (ok), 201 (created) or 204 (no content).
    """
    # Step 1: discover the final URL for this upload by sending a PUT
    # request with no content. A single server redirection is followed.
    probe_headers = {"Content-Length": "0"}
    if _SEND_EXPECT_HEADER_ON_PUT:
        probe_headers["Expect"] = "100-continue"

    target_url = self.geturl()

    log.debug("Sending empty PUT request to %s", target_url)
    with time_this(log, msg="PUT (no data) %s", args=(target_url,), mem_usage=True, mem_unit=u.mebibyte):
        probe_response = self.session.request(
            "PUT",
            target_url,
            data=None,
            headers=probe_headers,
            stream=True,
            timeout=TIMEOUT,
            allow_redirects=False,
        )
        if probe_response.is_redirect:
            target_url = probe_response.headers["Location"]

    # Step 2: upload the data to the final destination using the PUT
    # session.
    log.debug("Uploading data to %s", target_url)
    accepted_codes = (requests.codes.ok, requests.codes.created, requests.codes.no_content)
    with time_this(log, msg="PUT %s", args=(target_url,), mem_usage=True, mem_unit=u.mebibyte):
        upload_response = self.put_session.put(
            target_url, data=data, stream=True, timeout=TIMEOUT, allow_redirects=False
        )
        if upload_response.status_code not in accepted_codes:
            raise ValueError(
                f"Can not write file {self}, status: {upload_response.status_code} {upload_response.reason}"
            )

981 

@contextlib.contextmanager
def _openImpl(
    self,
    mode: str = "r",
    *,
    encoding: Optional[str] = None,
) -> Iterator[ResourceHandleProtocol]:
    """Open this resource and yield a file-like handle to it.

    Parameters
    ----------
    mode : `str`, optional
        Open mode. Modes "r" and "rb" get a `HttpReadResourceHandle` when
        the server advertises support for byte ranges; everything else is
        delegated to the parent class implementation.
    encoding : `str`, optional
        Encoding used to wrap the handle when ``mode`` is "r", or forwarded
        to the parent class implementation.

    Yields
    ------
    handle : `ResourceHandleProtocol`
        The handle to read from or write to.
    """
    head_response = self._head()
    supports_byte_ranges = (
        head_response.status_code == requests.codes.ok
        and head_response.headers.get("Accept-Ranges") == "bytes"
    )

    if mode not in ("rb", "r") or not supports_byte_ranges:
        # Not a read mode, or no support for partial reads: let the parent
        # class handle it.
        with super()._openImpl(mode, encoding=encoding) as http_handle:
            yield http_handle
        return

    handle: ResourceHandleProtocol = HttpReadResourceHandle(
        mode, log, url=self.geturl(), session=self.session, timeout=TIMEOUT
    )
    if mode == "rb":
        yield handle
    else:
        # cast because the protocol is compatible, but does not have
        # BytesIO in the inheritance tree
        yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)

1005 

1006 

def _dump_response(resp: requests.Response) -> None:
    """Write a HTTP or webDAV request and its response to the debug log.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log. The request which produced it is taken from
        ``resp.request``.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request

    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    if request.method == "PUT":
        # The body of a PUT request is never shown.
        log.debug(" body=<data>")
    elif request.body is not None:
        # Only show the first 120 items of the request body.
        log.debug(" body=%r", request.body[:120])
    else:
        log.debug(" body=<empty>")

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        # Plain text bodies are shown in full.
        log.debug(" body=%r", resp.content)
    else:
        # For other content types only the first 80 bytes are shown.
        log.debug(" body=%r", resp.content[:80])

1040 

1041 

def _is_protected(filepath: str) -> bool:
    """Tell whether the permissions of a local file restrict access to its
    owner only.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.

    Returns
    -------
    protected : `bool`
        `True` if ``filepath`` is a regular file with at least one
        permission bit set for its owner and none set for group or others.
        `False` otherwise, including when ``filepath`` is not a file.
    """
    if not os.path.isfile(filepath):
        return False

    permissions = stat.S_IMODE(os.stat(filepath).st_mode)

    # Any permission granted to group or others disqualifies the file.
    if permissions & (stat.S_IRWXG | stat.S_IRWXO):
        return False

    return bool(permissions & stat.S_IRWXU)

1058 

1059 

def _parse_propfind_response_body(body: str) -> List[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`
        One `DavProperty` per 'response' element found in `body`, in
        document order.

    Raises
    ------
    ValueError
        Raised if `body` contains no 'response' element.

    Notes
    -----
    It is expected that there is at least one response in `body`, otherwise
    this function raises.
    """
    # A response body to a PROPFIND request is of the form (indented for
    # readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #     <D:response>
    #         <D:href>path/to/resource</D:href>
    #         <D:propstat>
    #             <D:prop>
    #                 <D:resourcetype>
    #                     <D:collection xmlns:D="DAV:"/>
    #                 </D:resourcetype>
    #                 <D:getlastmodified>
    #                     Fri, 27 Jan 2023 13:59:01 GMT
    #                 </D:getlastmodified>
    #                 <D:getcontentlength>
    #                     12345
    #                 </D:getcontentlength>
    #             </D:prop>
    #             <D:status>
    #                 HTTP/1.1 200 OK
    #             </D:status>
    #         </D:propstat>
    #     </D:response>
    #     <D:response>
    #         ...
    #     </D:response>
    #     <D:response>
    #         ...
    #     </D:response>
    # </D:multistatus>

    # Scan all the 'response' elements and extract the relevant properties
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(response) for response in multistatus.findall("./{DAV:}response")]

    if not responses:
        # Could not find any 'response' element. Report the body itself:
        # there is no parsed response to show in this case.
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")

    return responses

1121 

1122 

class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.

    Parameters
    ----------
    response : `xml.etree.ElementTree.Element` or `None`
        A 'response' element of the body of a PROPFIND response. If `None`,
        all the properties keep their default values and the resource is
        reported as not existing.
    """

    # Regular expression to compare against the 'status' element of a
    # PROPFIND response's 'propstat' element.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: Optional[eTree.Element]):
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        # A negative length means "no content length reported".
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    @staticmethod
    def _text(element: eTree.Element) -> str:
        """Return the text of ``element``, or an empty string when the
        element is empty (e.g. ``<D:displayname/>``).
        """
        return element.text if element.text is not None else ""

    def _parse(self, response: eTree.Element) -> None:
        """Extract the properties of interest from a 'response' element.

        Only properties found in a 'propstat' element with status 200 are
        retained. Empty elements leave the corresponding property at its
        default value instead of storing the literal string "None".
        """
        # Extract 'href'
        if (element := response.find("./{DAV:}href")) is not None:
            self._href = self._text(element).strip()

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties of interest with status OK.
            status = propstat.find("./{DAV:}status")
            if status is None or not self._status_ok_rex.match(self._text(status)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # Parse "collection".
                if prop.find("./{DAV:}resourcetype/{DAV:}collection") is not None:
                    self._collection = True

                # Parse "getlastmodified".
                if (element := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = self._text(element)

                # Parse "getcontentlength". An empty element carries no
                # length: keep the default rather than failing on int("").
                if (element := prop.find("./{DAV:}getcontentlength")) is not None:
                    if length := self._text(element).strip():
                        self._getcontentlength = int(length)

                # Parse "displayname".
                if (element := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = self._text(element)

    @property
    def exists(self) -> bool:
        # It is either a directory or a file with length of at least zero
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        return self._collection

    @property
    def is_file(self) -> bool:
        return self._getcontentlength >= 0

    @property
    def size(self) -> int:
        # Only valid if is_file is True
        return self._getcontentlength

    @property
    def name(self) -> str:
        return self._displayname

    @property
    def href(self) -> str:
        return self._href