Coverage for python/lsst/resources/http.py: 20%
562 statements
« prev ^ index » next — coverage.py v7.2.7, created at 2023-07-10 09:42 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import math
21import os
22import os.path
23import random
24import re
25import stat
26import tempfile
27import xml.etree.ElementTree as eTree
28from collections.abc import Iterator
29from typing import TYPE_CHECKING, BinaryIO, cast
31import requests
32from astropy import units as u
33from lsst.utils.timer import time_this
34from requests.adapters import HTTPAdapter
35from requests.auth import AuthBase
36from urllib3.util.retry import Retry
38from ._resourceHandles import ResourceHandleProtocol
39from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
40from ._resourcePath import ResourcePath
42if TYPE_CHECKING:
43 from .utils import TransactionProtocol
45log = logging.getLogger(__name__)
class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    Each configuration item is read lazily from an environment variable on
    first access and then cached, so the environment is inspected at most
    once per item per instance.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms.
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Cached configuration values; None means "not initialized yet".
    _front_end_connections: int | None = None
    _back_end_connections: int | None = None
    _digest_algorithm: str | None = None
    _send_expect_on_put: bool | None = None
    _timeout: tuple[float, float] | None = None
    _collect_memory_usage: bool | None = None
    _backoff_min: float | None = None
    _backoff_max: float | None = None

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is not None:
            return self._front_end_connections

        try:
            self._front_end_connections = int(
                os.environ.get(
                    "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
                )
            )
        except ValueError:
            # Non-numeric value in the environment: fall back to the default.
            self._front_end_connections = self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS

        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is not None:
            return self._back_end_connections

        try:
            self._back_end_connections = int(
                os.environ.get(
                    "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
                )
            )
        except ValueError:
            # Non-numeric value in the environment: fall back to the default.
            self._back_end_connections = self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS

        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm: `str`
            The name of a digest algorithm or the empty string if no algorithm
            is configured.
        """
        if self._digest_algorithm is not None:
            return self._digest_algorithm

        # Only accept one of the known digest names (case-insensitively);
        # anything else disables digests.
        digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
        if digest not in self.ACCEPTED_DIGESTS:
            digest = ""

        self._digest_algorithm = digest
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) uses this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is not None:
            return self._send_expect_on_put

        # Presence of the variable is what matters, not its value.
        self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.
        """
        if self._timeout is not None:
            return self._timeout

        self._timeout = (self.DEFAULT_TIMEOUT_CONNECT, self.DEFAULT_TIMEOUT_READ)
        try:
            timeout = (
                float(os.environ.get("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT)),
                float(os.environ.get("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ)),
            )
            # Reject NaN values, which float() happily parses.
            if not math.isnan(timeout[0]) and not math.isnan(timeout[1]):
                self._timeout = timeout
        except ValueError:
            # Keep the defaults when the environment values are not numeric.
            pass

        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is not None:
            return self._collect_memory_usage

        # Presence of the variable is what matters, not its value.
        self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is not None:
            return self._backoff_min

        self._backoff_min = self.DEFAULT_BACKOFF_MIN
        try:
            backoff_min = float(os.environ.get("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN))
            # Reject NaN values, which float() happily parses.
            if not math.isnan(backoff_min):
                self._backoff_min = backoff_min
        except ValueError:
            # Keep the default when the environment value is not numeric.
            pass

        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is not None:
            return self._backoff_max

        self._backoff_max = self.DEFAULT_BACKOFF_MAX
        try:
            backoff_max = float(os.environ.get("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX))
            # Reject NaN values, which float() happily parses.
            if not math.isnan(backoff_max):
                self._backoff_max = backoff_max
        except ValueError:
            # Keep the default when the environment value is not numeric.
            pass

        return self._backoff_max
@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    try:
        # Honor a custom CA bundle when configured in the environment.
        ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        verify: bool | str = ca_cert_bundle if ca_cert_bundle else True
        resp = requests.options(str(path), verify=verify, stream=False)
        if resp.status_code not in (requests.codes.ok, requests.codes.created):
            raise ValueError(
                f"Unexpected response to OPTIONS request for {path}, status: {resp.status_code} "
                f"{resp.reason}"
            )

        # A WebDAV server must advertise its compliance classes in the "DAV"
        # response header; class "1" is sufficient for our purposes since we
        # don't use locks, and every WebDAV server must support at least
        # class "1". See
        # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Example header values:
        #     DAV: 1, 2
        #     DAV: 1, <http://apache.org/dav/propset/fs/1>
        if "DAV" not in resp.headers:
            return False

        # Convert to str to keep mypy happy.
        compliance_classes = str(resp.headers.get("DAV")).replace(" ", "").split(",")
        return "1" in compliance_classes
    except requests.exceptions.SSLError as e:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify a bundle of certificate authorities you trust which are "
            "not included in the default set of trusted authorities of your "
            "system."
        )
        raise e
275# Tuple (path, block_size) pointing to the location of a local directory
276# to save temporary files and the block size of the underlying file system.
277_TMPDIR: tuple[str, int] | None = None
280def _get_temp_dir() -> tuple[str, int]:
281 """Return the temporary directory path and block size.
283 This function caches its results in _TMPDIR.
284 """
285 global _TMPDIR
286 if _TMPDIR:
287 return _TMPDIR
289 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
290 # 'TMPDIR', if defined. Otherwise use current working directory.
291 tmpdir = os.getcwd()
292 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
293 if dir and os.path.isdir(dir):
294 tmpdir = dir
295 break
297 # Compute the block size as 256 blocks of typical size
298 # (i.e. 4096 bytes) or 10 times the file system block size,
299 # whichever is higher. This is a reasonable compromise between
300 # using memory for buffering and the number of system calls
301 # issued to read from or write to temporary files.
302 fsstats = os.statvfs(tmpdir)
303 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096)))
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        # When the argument names an existing file, treat it as a token file:
        # remember its absolute path, require owner-only permissions and load
        # its current contents.
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # File unchanged since the last read: keep the cached token.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        if self._token:
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a reusable HTTP client session per endpoint."""

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Map from a root URI to the session shared by all paths under that
        # endpoint.
        self._sessions: dict[str, requests.Session] = {}

        # Number of urllib3 connection pools to keep (one pool per remote
        # host). See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of connections per remote host persisted in each
        # connection pool. See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Bounds of the interval from which the exponential backoff factor
        # is drawn when retrying requests (seconds). Ensure the upper bound
        # is strictly greater than the lower one.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Closing the sessions should close their idle network connections;
        # the Requests API exposes no way to force-close the underlying
        # sockets.
        for cached in self._sessions.values():
            cached.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Notes
        -----
        A single session is created and cached per endpoint, so any two
        paths under the same endpoint share a session: for instance,
        "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path" share one session, whereas
        "https://www.example.org" and "https://www.example.org:12345" get
        distinct sessions because the port numbers differ.

        Session configuration is read from these environment variables:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        endpoint = str(rpath.root_uri())
        if endpoint not in self._sessions:
            # First request for this endpoint: build and cache a session.
            self._sessions[endpoint] = self._make_session(rpath)

        return self._sessions[endpoint]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)

        # Random jitter in [backoff_min, backoff_max) prevents many clients
        # from retrying in lockstep and overwhelming the server.
        jitter = self._backoff_min + (self._backoff_max - self._backoff_min) * random.random()
        retries = Retry(
            # Total number of retries to allow; takes precedence over the
            # individual counts below.
            total=6,
            # Connection-related errors.
            connect=3,
            # Read errors.
            read=3,
            # Backoff factor applied between attempts after the second try
            # (seconds).
            backoff_factor=jitter,
            # Bad status codes.
            status=5,
            # Uppercased HTTP verbs to retry on: only idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # Status codes that force a retry.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Honor any Retry-After header attached to the codes above.
            respect_retry_after_header=True,
        )

        # Persist the configured number of connections to the front end
        # server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist connections to back end servers, which may differ
        # from request to request: systematically persisting them could
        # exhaust their capacity when thousands of clients connect
        # simultaneously.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Over insecure HTTP we neither send bearer tokens nor authenticate
        # the remote server.
        if rpath.scheme != "https":
            return session

        # Server authentication: optionally trust a specific CA bundle.
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Client authentication, bearer token first.
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Otherwise try client certificate plus private key; both variables
        # must be initialized together.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    Instances of this class are configured through the environment variables
    below:

    - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
      "Expect: 100-Continue" header will be added to all HTTP PUT requests.
      This header is required by some servers to detect if the client
      knows how to handle redirections. In case of redirection, the body
      of the PUT request is sent to the redirected location and not to
      the front end server.

    - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a
      numeric value, they are interpreted as the number of seconds to wait
      for establishing a connection with the server and for reading its
      response, respectively.

    - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and
      LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number
      of connections to attempt to persist with both the front end servers
      and the back end servers.
      Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and
      DEFAULT_BACKEND_PERSISTENT_CONNECTIONS.

    - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to
      ask the server to compute for every file's content sent to the server
      via a PUT request. No digest is requested if this variable is not set
      or is set to an invalid value.
      Valid values are those in ACCEPTED_DIGESTS.
    """

    # Cached result of the WebDAV-capability probe for this URI (None until
    # first checked).
    _is_webdav: bool | None = None

    # Configuration items shared by all instances of this class.
    _config = HttpResourcePathConfig()

    # Session store for metadata requests (PROPFIND, HEAD, etc.), which are
    # typically served by the front end servers. Keeping these connections
    # open avoids paying the TCP and TLS handshake cost on every request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Session store for data requests (PUT, GET), which the front end servers
    # typically redirect to back end servers. We attempt to keep a single
    # connection open with the front end server when possible; whether that
    # works depends on the server and the kind of request, since some servers
    # close the connection when redirecting the client (e.g. for PUT).
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. Stored so that
    # sessions inherited from a parent process after a fork can be replaced,
    # to avoid confusing the SSL layer.
    _pid: int = -1
635 @property
636 def metadata_session(self) -> requests.Session:
637 """Client session to send requests which do not require upload or
638 download of data, i.e. mostly metadata requests.
639 """
640 if hasattr(self, "_metadata_session"):
641 if HttpResourcePath._pid == os.getpid():
642 return self._metadata_session
643 else:
644 # The metadata session we have in cache was likely created by
645 # a parent process. Discard all the sessions in that store.
646 self._metadata_session_store.clear()
648 # Retrieve a new metadata session.
649 HttpResourcePath._pid = os.getpid()
650 self._metadata_session: requests.Session = self._metadata_session_store.get(self)
651 return self._metadata_session
653 @property
654 def data_session(self) -> requests.Session:
655 """Client session for uploading and downloading data."""
656 if hasattr(self, "_data_session"):
657 if HttpResourcePath._pid == os.getpid():
658 return self._data_session
659 else:
660 # The data session we have in cache was likely created by
661 # a parent process. Discard all the sessions in that store.
662 self._data_session_store.clear()
664 # Retrieve a new data session.
665 HttpResourcePath._pid = os.getpid()
666 self._data_session: requests.Session = self._data_session_store.get(self)
667 return self._data_session
669 def _clear_sessions(self) -> None:
670 """Close the socket connections that are still open.
672 Used only in test suites to avoid warnings.
673 """
674 self._metadata_session_store.clear()
675 self._data_session_store.clear()
677 if hasattr(self, "_metadata_session"):
678 delattr(self, "_metadata_session")
680 if hasattr(self, "_data_session"):
681 delattr(self, "_data_session")
683 @property
684 def is_webdav_endpoint(self) -> bool:
685 """Check if the current endpoint implements WebDAV features.
687 This is stored per URI but cached by root so there is
688 only one check per hostname.
689 """
690 if self._is_webdav is not None:
691 return self._is_webdav
693 self._is_webdav = _is_webdav_endpoint(self.root_uri())
694 return self._is_webdav
696 def exists(self) -> bool:
697 """Check that a remote HTTP resource exists."""
698 log.debug("Checking if resource exists: %s", self.geturl())
699 if not self.is_webdav_endpoint:
700 # The remote is a plain HTTP server. Let's attempt a HEAD
701 # request, even if the behavior for such a request against a
702 # directory is not specified, so it depends on the server
703 # implementation.
704 resp = self.metadata_session.head(
705 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
706 )
707 return resp.status_code == requests.codes.ok # 200
709 # The remote endpoint is a webDAV server: send a PROPFIND request
710 # to determine if it exists.
711 resp = self._propfind()
712 if resp.status_code == requests.codes.multi_status: # 207
713 prop = _parse_propfind_response_body(resp.text)[0]
714 return prop.exists
715 else: # 404 Not Found
716 return False
718 def size(self) -> int:
719 """Return the size of the remote resource in bytes."""
720 if self.dirLike:
721 return 0
723 if not self.is_webdav_endpoint:
724 # The remote is a plain HTTP server. Send a HEAD request to
725 # retrieve the size of the resource.
726 resp = self.metadata_session.head(
727 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
728 )
729 if resp.status_code == requests.codes.ok: # 200
730 if "Content-Length" in resp.headers:
731 return int(resp.headers["Content-Length"])
732 else:
733 raise ValueError(
734 f"Response to HEAD request to {self} does not contain 'Content-Length' header"
735 )
736 elif resp.status_code == requests.codes.not_found:
737 raise FileNotFoundError(
738 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
739 )
740 else:
741 raise ValueError(
742 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
743 f"{resp.reason}"
744 )
746 # The remote is a webDAV server: send a PROPFIND request to retrieve
747 # the size of the resource. Sizes are only meaningful for files.
748 resp = self._propfind()
749 if resp.status_code == requests.codes.multi_status: # 207
750 prop = _parse_propfind_response_body(resp.text)[0]
751 if prop.is_file:
752 return prop.size
753 elif prop.is_directory:
754 raise IsADirectoryError(
755 f"Resource {self} is reported by server as a directory but has a file path"
756 )
757 else:
758 raise FileNotFoundError(f"Resource {self} does not exist")
759 else: # 404 Not Found
760 raise FileNotFoundError(
761 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
762 )
764 def mkdir(self) -> None:
765 """Create the directory resource if it does not already exist."""
766 # Creating directories is only available on WebDAV back ends.
767 if not self.is_webdav_endpoint:
768 raise NotImplementedError(
769 f"Creation of directory {self} is not implemented by plain HTTP servers"
770 )
772 if not self.dirLike:
773 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")
775 # Check if the target directory already exists.
776 resp = self._propfind()
777 if resp.status_code == requests.codes.multi_status: # 207
778 prop = _parse_propfind_response_body(resp.text)[0]
779 if prop.exists:
780 if prop.is_directory:
781 return
782 else:
783 # A file exists at this path
784 raise NotADirectoryError(
785 f"Can not create a directory for {self} because a file already exists at that path"
786 )
788 # Target directory does not exist. Create it and its ancestors as
789 # needed. We need to test if parent URL is different from self URL,
790 # otherwise we could be stuck in a recursive loop
791 # where self == parent.
792 if self.geturl() != self.parent().geturl():
793 self.parent().mkdir()
795 log.debug("Creating new directory: %s", self.geturl())
796 self._mkcol()
    def remove(self) -> None:
        """Remove the resource.

        Delegates to ``self._delete()``; any error signaling is whatever that
        helper raises.
        """
        self._delete()
802 def read(self, size: int = -1) -> bytes:
803 """Open the resource and return the contents in bytes.
805 Parameters
806 ----------
807 size : `int`, optional
808 The number of bytes to read. Negative or omitted indicates
809 that all data should be read.
810 """
811 # Use the data session as a context manager to ensure that the
812 # network connections to both the front end and back end servers are
813 # closed after downloading the data.
814 log.debug("Reading from remote resource: %s", self.geturl())
815 stream = True if size > 0 else False
816 with self.data_session as session:
817 with time_this(log, msg="GET %s", args=(self,)):
818 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout)
820 if resp.status_code != requests.codes.ok: # 200
821 raise FileNotFoundError(
822 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
823 )
824 if not stream:
825 return resp.content
826 else:
827 return next(resp.iter_content(chunk_size=size))
829 def write(self, data: bytes, overwrite: bool = True) -> None:
830 """Write the supplied bytes to the new resource.
832 Parameters
833 ----------
834 data : `bytes`
835 The bytes to write to the resource. The entire contents of the
836 resource will be replaced.
837 overwrite : `bool`, optional
838 If `True` the resource will be overwritten if it exists. Otherwise
839 the write will fail.
840 """
841 log.debug("Writing to remote resource: %s", self.geturl())
842 if not overwrite:
843 if self.exists():
844 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
846 # Ensure the parent directory exists.
847 self.parent().mkdir()
849 # Upload the data.
850 log.debug("Writing data to remote resource: %s", self.geturl())
851 self._put(data=data)
853 def transfer_from(
854 self,
855 src: ResourcePath,
856 transfer: str = "copy",
857 overwrite: bool = False,
858 transaction: TransactionProtocol | None = None,
859 ) -> None:
860 """Transfer the current resource to a Webdav repository.
862 Parameters
863 ----------
864 src : `ResourcePath`
865 Source URI.
866 transfer : `str`
867 Mode to use for transferring the resource. Supports the following
868 options: copy.
869 transaction : `~lsst.resources.utils.TransactionProtocol`, optional
870 Currently unused.
871 """
872 # Fail early to prevent delays if remote resources are requested.
873 if transfer not in self.transferModes:
874 raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")
876 # Existence checks cost time so do not call this unless we know
877 # that debugging is enabled.
878 if log.isEnabledFor(logging.DEBUG):
879 log.debug(
880 "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
881 src,
882 src.exists(),
883 self,
884 self.exists(),
885 transfer,
886 )
888 # Short circuit immediately if the URIs are identical.
889 if self == src:
890 log.debug(
891 "Target and destination URIs are identical: %s, returning immediately."
892 " No further action required.",
893 self,
894 )
895 return
897 if not overwrite and self.exists():
898 raise FileExistsError(f"Destination path {self} already exists.")
900 if transfer == "auto":
901 transfer = self.transferDefault
903 # We can use webDAV 'COPY' or 'MOVE' if both the current and source
904 # resources are located in the same server.
905 if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
906 log.debug("Transfer from %s to %s directly", src, self)
907 return self._move(src) if transfer == "move" else self._copy(src)
909 # For resources of different classes or for plain HTTP resources we can
910 # perform the copy or move operation by downloading to a local file
911 # and uploading to the destination.
912 self._copy_via_local(src)
914 # This was an explicit move, try to remove the source.
915 if transfer == "move":
916 src.remove()
918 def walk(
919 self, file_filter: str | re.Pattern | None = None
920 ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
921 """Walk the directory tree returning matching files and directories.
923 Parameters
924 ----------
925 file_filter : `str` or `re.Pattern`, optional
926 Regex to filter out files from the list before it is returned.
928 Yields
929 ------
930 dirpath : `ResourcePath`
931 Current directory being examined.
932 dirnames : `list` of `str`
933 Names of subdirectories within dirpath.
934 filenames : `list` of `str`
935 Names of all the files within dirpath.
936 """
937 if not self.dirLike:
938 raise ValueError("Can not walk a non-directory URI")
940 # Walking directories is only available on WebDAV back ends.
941 if not self.is_webdav_endpoint:
942 raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")
944 if isinstance(file_filter, str):
945 file_filter = re.compile(file_filter)
947 resp = self._propfind(depth="1")
948 if resp.status_code == requests.codes.multi_status: # 207
949 files: list[str] = []
950 dirs: list[str] = []
952 for prop in _parse_propfind_response_body(resp.text):
953 if prop.is_file:
954 files.append(prop.name)
955 elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
956 # Only include the names of sub-directories not the name of
957 # the directory being walked.
958 dirs.append(prop.name)
960 if file_filter is not None:
961 files = [f for f in files if file_filter.search(f)]
963 if not dirs and not files:
964 return
965 else:
966 yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files
968 for dir in dirs:
969 new_uri = self.join(dir, forceDirectory=True)
970 yield from new_uri.walk(file_filter)
972 def _as_local(self) -> tuple[str, bool]:
973 """Download object over HTTP and place in temporary directory.
975 Returns
976 -------
977 path : `str`
978 Path to local temporary file.
979 temporary : `bool`
980 Always returns `True`. This is always a temporary file.
981 """
982 # Use the session as a context manager to ensure that connections
983 # to both the front end and back end servers are closed after the
984 # download operation is finished.
985 with self.data_session as session:
986 resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
987 if resp.status_code != requests.codes.ok:
988 raise FileNotFoundError(
989 f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
990 )
992 tmpdir, buffering = _get_temp_dir()
993 with tempfile.NamedTemporaryFile(
994 suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
995 ) as tmpFile:
996 expected_length = int(resp.headers.get("Content-Length", "-1"))
997 with time_this(
998 log,
999 msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
1000 args=(self, expected_length, tmpFile.name, buffering),
1001 mem_usage=self._config.collect_memory_usage,
1002 mem_unit=u.mebibyte,
1003 ):
1004 content_length = 0
1005 for chunk in resp.iter_content(chunk_size=buffering):
1006 tmpFile.write(chunk)
1007 content_length += len(chunk)
1009 # Check that the expected and actual content lengths match. Perform
1010 # this check only when the contents of the file was not encoded by
1011 # the server.
1012 if "Content-Encoding" not in resp.headers:
1013 if expected_length >= 0 and expected_length != content_length:
1014 raise ValueError(
1015 f"Size of downloaded file does not match value in Content-Length header for {self}: "
1016 f"expecting {expected_length} and got {content_length} bytes"
1017 )
1019 return tmpFile.name, True
1021 def _send_webdav_request(
1022 self,
1023 method: str,
1024 url: str | None = None,
1025 headers: dict[str, str] = {},
1026 body: str | None = None,
1027 session: requests.Session | None = None,
1028 timeout: tuple[float, float] | None = None,
1029 ) -> requests.Response:
1030 """Send a webDAV request and correctly handle redirects.
1032 Parameters
1033 ----------
1034 method : `str`
1035 The mthod of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
1036 headers : `dict`, optional
1037 A dictionary of key-value pairs (both strings) to include as
1038 headers in the request.
1039 body: `str`, optional
1040 The body of the request.
1042 Notes
1043 -----
1044 This way of sending webDAV requests is necessary for handling
1045 redirection ourselves, since the 'requests' package changes the method
1046 of the redirected request when the server responds with status 302 and
1047 the method of the original request is not HEAD (which is the case for
1048 webDAV requests).
1050 That means that when the webDAV server we interact with responds with
1051 a redirection to a PROPFIND or MKCOL request, the request gets
1052 converted to a GET request when sent to the redirected location.
1054 See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
1055 https://github.com/psf/requests/blob/main/requests/sessions.py
1057 This behavior of the 'requests' package is meant to be compatible with
1058 what is specified in RFC 9110:
1060 https://www.rfc-editor.org/rfc/rfc9110#name-302-found
1062 For our purposes, we do need to follow the redirection and send a new
1063 request using the same HTTP verb.
1064 """
1065 if url is None:
1066 url = self.geturl()
1068 if session is None:
1069 session = self.metadata_session
1071 if timeout is None:
1072 timeout = self._config.timeout
1074 with time_this(
1075 log,
1076 msg="%s %s",
1077 args=(
1078 method,
1079 url,
1080 ),
1081 mem_usage=self._config.collect_memory_usage,
1082 mem_unit=u.mebibyte,
1083 ):
1084 for _ in range(max_redirects := 5):
1085 resp = session.request(
1086 method,
1087 url,
1088 data=body,
1089 headers=headers,
1090 stream=False,
1091 timeout=timeout,
1092 allow_redirects=False,
1093 )
1094 if resp.is_redirect:
1095 url = resp.headers["Location"]
1096 else:
1097 return resp
1099 # We reached the maximum allowed number of redirects.
1100 # Stop trying.
1101 raise ValueError(
1102 f"Could not get a response to {method} request for {self} after "
1103 f"{max_redirects} redirections"
1104 )
1106 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response:
1107 """Send a PROPFIND webDAV request and return the response.
1109 Parameters
1110 ----------
1111 body : `str`, optional
1112 The body of the PROPFIND request to send to the server. If
1113 provided, it is expected to be a XML document.
1114 depth : `str`, optional
1115 The value of the 'Depth' header to include in the request.
1117 Returns
1118 -------
1119 response : `requests.Response`
1120 Response to the PROPFIND request.
1122 Notes
1123 -----
1124 It raises `ValueError` if the status code of the PROPFIND request
1125 is different from "207 Multistatus" or "404 Not Found".
1126 """
1127 if body is None:
1128 # Request only the DAV live properties we are explicitly interested
1129 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
1130 # and 'displayname'.
1131 body = (
1132 """<?xml version="1.0" encoding="utf-8" ?>"""
1133 """<D:propfind xmlns:D="DAV:"><D:prop>"""
1134 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
1135 """</D:prop></D:propfind>"""
1136 )
1137 headers = {
1138 "Depth": depth,
1139 "Content-Type": 'application/xml; charset="utf-8"',
1140 "Content-Length": str(len(body)),
1141 }
1142 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
1143 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
1144 return resp
1145 else:
1146 raise ValueError(
1147 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
1148 f"{resp.reason}"
1149 )
1151 def _options(self) -> requests.Response:
1152 """Send a OPTIONS webDAV request for this resource."""
1153 resp = self._send_webdav_request("OPTIONS")
1154 if resp.status_code in (requests.codes.ok, requests.codes.created):
1155 return resp
1157 raise ValueError(
1158 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} " f"{resp.reason}"
1159 )
1161 def _head(self) -> requests.Response:
1162 """Send a HEAD webDAV request for this resource."""
1163 return self._send_webdav_request("HEAD")
1165 def _mkcol(self) -> None:
1166 """Send a MKCOL webDAV request to create a collection. The collection
1167 may already exist.
1168 """
1169 resp = self._send_webdav_request("MKCOL")
1170 if resp.status_code == requests.codes.created: # 201
1171 return
1173 if resp.status_code == requests.codes.method_not_allowed: # 405
1174 # The remote directory already exists
1175 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
1176 else:
1177 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")
1179 def _delete(self) -> None:
1180 """Send a DELETE webDAV request for this resource."""
1181 log.debug("Deleting %s ...", self.geturl())
1183 # If this is a directory, ensure the remote is a webDAV server because
1184 # plain HTTP servers don't support DELETE requests on non-file
1185 # paths.
1186 if self.dirLike and not self.is_webdav_endpoint:
1187 raise NotImplementedError(
1188 f"Deletion of directory {self} is not implemented by plain HTTP servers"
1189 )
1191 # Deleting non-empty directories may take some time, so increase
1192 # the timeout for getting a response from the server.
1193 timeout = self._config.timeout
1194 if self.dirLike:
1195 timeout = (timeout[0], timeout[1] * 100)
1196 resp = self._send_webdav_request("DELETE", timeout=timeout)
1197 if resp.status_code in (
1198 requests.codes.ok,
1199 requests.codes.accepted,
1200 requests.codes.no_content,
1201 requests.codes.not_found,
1202 ):
1203 # We can get a "404 Not Found" error when the file or directory
1204 # does not exist or when the DELETE request was retried several
1205 # times and a previous attempt actually deleted the resource.
1206 # Therefore we consider that a "Not Found" response is not an
1207 # error since we reached the state desired by the user.
1208 return
1209 else:
1210 # TODO: the response to a DELETE request against a webDAV server
1211 # may be multistatus. If so, we need to parse the reponse body to
1212 # determine more precisely the reason of the failure (e.g. a lock)
1213 # and provide a more helpful error message.
1214 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")
1216 def _copy_via_local(self, src: ResourcePath) -> None:
1217 """Replace the contents of this resource with the contents of a remote
1218 resource by using a local temporary file.
1220 Parameters
1221 ----------
1222 src : `HttpResourcePath`
1223 The source of the contents to copy to `self`.
1224 """
1225 with src.as_local() as local_uri:
1226 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
1227 with open(local_uri.ospath, "rb") as f:
1228 self._put(data=f)
1230 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
1231 """Send a COPY or MOVE webDAV request to copy or replace the contents
1232 of this resource with the contents of another resource located in the
1233 same server.
1235 Parameters
1236 ----------
1237 method : `str`
1238 The method to perform. Valid values are "COPY" or "MOVE" (in
1239 uppercase).
1241 src : `HttpResourcePath`
1242 The source of the contents to move to `self`.
1243 """
1244 headers = {"Destination": self.geturl()}
1245 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
1246 if resp.status_code in (requests.codes.created, requests.codes.no_content):
1247 return
1249 if resp.status_code == requests.codes.multi_status:
1250 tree = eTree.fromstring(resp.content)
1251 status_element = tree.find("./{DAV:}response/{DAV:}status")
1252 status = status_element.text if status_element is not None else "unknown"
1253 error = tree.find("./{DAV:}response/{DAV:}error")
1254 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
1255 else:
1256 raise ValueError(
1257 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
1258 )
1260 def _copy(self, src: HttpResourcePath) -> None:
1261 """Send a COPY webDAV request to replace the contents of this resource
1262 (if any) with the contents of another resource located in the same
1263 server.
1265 Parameters
1266 ----------
1267 src : `HttpResourcePath`
1268 The source of the contents to copy to `self`.
1269 """
1270 # Neither dCache nor XrootD currently implement the COPY
1271 # webDAV method as documented in
1272 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
1273 # (See issues DM-37603 and DM-37651 for details)
1274 #
1275 # For the time being, we use a temporary local file to
1276 # perform the copy client side.
1277 # TODO: when those 2 issues above are solved remove the 3 lines below.
1278 must_use_local = True
1279 if must_use_local:
1280 return self._copy_via_local(src)
1282 return self._copy_or_move("COPY", src)
1284 def _move(self, src: HttpResourcePath) -> None:
1285 """Send a MOVE webDAV request to replace the contents of this resource
1286 with the contents of another resource located in the same server.
1288 Parameters
1289 ----------
1290 src : `HttpResourcePath`
1291 The source of the contents to move to `self`.
1292 """
1293 return self._copy_or_move("MOVE", src)
1295 def _put(self, data: BinaryIO | bytes) -> None:
1296 """Perform an HTTP PUT request and handle redirection.
1298 Parameters
1299 ----------
1300 data : `Union[BinaryIO, bytes]`
1301 The data to be included in the body of the PUT request.
1302 """
1303 # Retrieve the final URL for this upload by sending a PUT request with
1304 # no content. Follow a single server redirection to retrieve the
1305 # final URL.
1306 headers = {"Content-Length": "0"}
1307 if self._config.send_expect_on_put:
1308 headers["Expect"] = "100-continue"
1310 url = self.geturl()
1312 # Use the session as a context manager to ensure the underlying
1313 # connections are closed after finishing uploading the data.
1314 with self.data_session as session:
1315 # Send an empty PUT request to get redirected to the final
1316 # destination.
1317 log.debug("Sending empty PUT request to %s", url)
1318 with time_this(
1319 log,
1320 msg="PUT (no data) %s",
1321 args=(url,),
1322 mem_usage=self._config.collect_memory_usage,
1323 mem_unit=u.mebibyte,
1324 ):
1325 resp = session.request(
1326 "PUT",
1327 url,
1328 data=None,
1329 headers=headers,
1330 stream=False,
1331 timeout=self._config.timeout,
1332 allow_redirects=False,
1333 )
1334 if resp.is_redirect:
1335 url = resp.headers["Location"]
1337 # Upload the data to the final destination.
1338 log.debug("Uploading data to %s", url)
1340 # Ask the server to compute and record a checksum of the uploaded
1341 # file contents, for later integrity checks. Since we don't compute
1342 # the digest ourselves while uploading the data, we cannot control
1343 # after the request is complete that the data we uploaded is
1344 # identical to the data recorded by the server, but at least the
1345 # server has recorded a digest of the data it stored.
1346 #
1347 # See RFC-3230 for details and
1348 # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
1349 # for the list of supported digest algorithhms.
1350 # In addition, note that not all servers implement this RFC so
1351 # the checksum may not be computed by the server.
1352 put_headers: dict[str, str] | None = None
1353 if digest := self._config.digest_algorithm:
1354 put_headers = {"Want-Digest": digest}
1356 with time_this(
1357 log,
1358 msg="PUT %s",
1359 args=(url,),
1360 mem_usage=self._config.collect_memory_usage,
1361 mem_unit=u.mebibyte,
1362 ):
1363 resp = session.request(
1364 "PUT",
1365 url,
1366 data=data,
1367 headers=put_headers,
1368 stream=False,
1369 timeout=self._config.timeout,
1370 allow_redirects=False,
1371 )
1372 if resp.status_code in (
1373 requests.codes.ok,
1374 requests.codes.created,
1375 requests.codes.no_content,
1376 ):
1377 return
1378 else:
1379 raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")
1381 @contextlib.contextmanager
1382 def _openImpl(
1383 self,
1384 mode: str = "r",
1385 *,
1386 encoding: str | None = None,
1387 ) -> Iterator[ResourceHandleProtocol]:
1388 resp = self._head()
1389 accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
1390 handle: ResourceHandleProtocol
1391 if mode in ("rb", "r") and accepts_range:
1392 handle = HttpReadResourceHandle(
1393 mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
1394 )
1395 if mode == "r":
1396 # cast because the protocol is compatible, but does not have
1397 # BytesIO in the inheritance tree
1398 yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
1399 else:
1400 yield handle
1401 else:
1402 with super()._openImpl(mode, encoding=encoding) as http_handle:
1403 yield http_handle
def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    if request.method == "PUT":
        # Uploaded payloads may be huge; never dump them.
        log.debug(" body=<data>")
    elif request.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", request.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        log.debug(" body=%r", resp.content[:80])
1441def _is_protected(filepath: str) -> bool:
1442 """Return true if the permissions of file at filepath only allow for access
1443 by its owner.
1445 Parameters
1446 ----------
1447 filepath : `str`
1448 Path of a local file.
1449 """
1450 if not os.path.isfile(filepath):
1451 return False
1452 mode = stat.S_IMODE(os.stat(filepath).st_mode)
1453 owner_accessible = bool(mode & stat.S_IRWXU)
1454 group_accessible = bool(mode & stat.S_IRWXG)
1455 other_accessible = bool(mode & stat.S_IRWXO)
1456 return owner_accessible and not group_accessible and not other_accessible
def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'response' element found in `body`.

    Raises
    ------
    ValueError
        Raised when no 'response' element could be extracted from `body`.

    Notes
    -----
    A response body to a PROPFIND request has the following shape (indented
    for readability):

        <?xml version="1.0" encoding="UTF-8"?>
        <D:multistatus xmlns:D="DAV:">
          <D:response>
            <D:href>path/to/resource</D:href>
            <D:propstat>
              <D:prop>
                <D:resourcetype>
                  <D:collection xmlns:D="DAV:"/>
                </D:resourcetype>
                <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
                <D:getcontentlength>12345</D:getcontentlength>
              </D:prop>
              <D:status>HTTP/1.1 200 OK</D:status>
            </D:propstat>
          </D:response>
          <D:response>
            ...
          </D:response>
        </D:multistatus>
    """
    # Extract the relevant properties from every 'response' element.
    multistatus = eTree.fromstring(body.strip())
    properties = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]

    if not properties:
        # At least one response is expected; an empty result means the body
        # could not be parsed meaningfully.
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return properties
1522class DavProperty:
1523 """Helper class to encapsulate select live DAV properties of a single
1524 resource, as retrieved via a PROPFIND request.
1525 """
1527 # Regular expression to compare against the 'status' element of a
1528 # PROPFIND response's 'propstat' element.
1529 _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)
1531 def __init__(self, response: eTree.Element | None):
1532 self._href: str = ""
1533 self._displayname: str = ""
1534 self._collection: bool = False
1535 self._getlastmodified: str = ""
1536 self._getcontentlength: int = -1
1538 if response is not None:
1539 self._parse(response)
1541 def _parse(self, response: eTree.Element) -> None:
1542 # Extract 'href'.
1543 if (element := response.find("./{DAV:}href")) is not None:
1544 # We need to use "str(element.text)"" instead of "element.text" to
1545 # keep mypy happy.
1546 self._href = str(element.text).strip()
1547 else:
1548 raise ValueError(
1549 f"Property 'href' expected but not found in PROPFIND response: "
1550 f"{eTree.tostring(response, encoding='unicode')}"
1551 )
1553 for propstat in response.findall("./{DAV:}propstat"):
1554 # Only extract properties of interest with status OK.
1555 status = propstat.find("./{DAV:}status")
1556 if status is None or not self._status_ok_rex.match(str(status.text)):
1557 continue
1559 for prop in propstat.findall("./{DAV:}prop"):
1560 # Parse "collection".
1561 if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None:
1562 self._collection = True
1564 # Parse "getlastmodified".
1565 if (element := prop.find("./{DAV:}getlastmodified")) is not None:
1566 self._getlastmodified = str(element.text)
1568 # Parse "getcontentlength".
1569 if (element := prop.find("./{DAV:}getcontentlength")) is not None:
1570 self._getcontentlength = int(str(element.text))
1572 # Parse "displayname".
1573 if (element := prop.find("./{DAV:}displayname")) is not None:
1574 self._displayname = str(element.text)
1576 # Some webDAV servers don't include the 'displayname' property in the
1577 # response so try to infer it from the value of the 'href' property.
1578 # Depending on the server the href value may end with '/'.
1579 if not self._displayname:
1580 self._displayname = os.path.basename(self._href.rstrip("/"))
1582 # Force a size of 0 for collections.
1583 if self._collection:
1584 self._getcontentlength = 0
1586 @property
1587 def exists(self) -> bool:
1588 # It is either a directory or a file with length of at least zero
1589 return self._collection or self._getcontentlength >= 0
1591 @property
1592 def is_directory(self) -> bool:
1593 return self._collection
1595 @property
1596 def is_file(self) -> bool:
1597 return not self._collection
1599 @property
1600 def size(self) -> int:
1601 return self._getcontentlength
1603 @property
1604 def name(self) -> str:
1605 return self._displayname
1607 @property
1608 def href(self) -> str:
1609 return self._href