Coverage for python/lsst/resources/http.py: 20%
559 statements
« prev ^ index » next — coverage.py v7.2.3, created at 2023-04-19 03:38 -0700
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import math
21import os
22import os.path
23import random
24import re
25import stat
26import tempfile
27import xml.etree.ElementTree as eTree
28from typing import TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast
30import requests
31from astropy import units as u
32from lsst.utils.timer import time_this
33from requests.adapters import HTTPAdapter
34from requests.auth import AuthBase
35from urllib3.util.retry import Retry
37from ._resourceHandles import ResourceHandleProtocol
38from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
39from ._resourcePath import ResourcePath
41if TYPE_CHECKING:
42 from .utils import TransactionProtocol
44log = logging.getLogger(__name__)
class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    Each configuration value is lazily read from an environment variable on
    first access and cached on the instance; invalid values silently fall
    back to the documented defaults.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Cached values, populated on first access of the matching property.
    _front_end_connections: Optional[int] = None
    _back_end_connections: Optional[int] = None
    _digest_algorithm: Optional[str] = None
    _send_expect_on_put: Optional[bool] = None
    _timeout: Optional[tuple[float, float]] = None
    _collect_memory_usage: Optional[bool] = None
    _backoff_min: Optional[float] = None
    _backoff_max: Optional[float] = None

    @staticmethod
    def _int_from_env(name: str, default: int) -> int:
        """Return the integer value of environment variable `name`, or
        `default` when the variable is unset or not a valid integer.
        """
        try:
            return int(os.environ.get(name, default))
        except ValueError:
            return default

    @staticmethod
    def _float_from_env(name: str, default: float) -> float:
        """Return the float value of environment variable `name`, or
        `default` when the variable is unset, not a valid float, or NaN.
        """
        try:
            value = float(os.environ.get(name, default))
        except ValueError:
            return default
        # NaN would silently break timeout/backoff arithmetic downstream.
        return default if math.isnan(value) else value

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is None:
            self._front_end_connections = self._int_from_env(
                "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
            )
        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is None:
            self._back_end_connections = self._int_from_env(
                "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
            )
        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm: `str`
            The name of a digest algorithm or the empty string if no algorithm
            is configured.
        """
        if self._digest_algorithm is None:
            digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
            # Only keep digests the server-side implementations accept.
            self._digest_algorithm = digest if digest in self.ACCEPTED_DIGESTS else ""
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) use this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is None:
            # Presence of the variable is enough; its value is irrelevant.
            self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.
        """
        if self._timeout is None:
            self._timeout = (self.DEFAULT_TIMEOUT_CONNECT, self.DEFAULT_TIMEOUT_READ)
            # Both values must parse and be non-NaN; a partially configured
            # timeout pair is ignored and both defaults are kept.
            try:
                timeout = (
                    float(os.environ.get("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT)),
                    float(os.environ.get("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ)),
                )
                if not math.isnan(timeout[0]) and not math.isnan(timeout[1]):
                    self._timeout = timeout
            except ValueError:
                pass
        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is None:
            self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is None:
            self._backoff_min = self._float_from_env("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN)
        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is None:
            self._backoff_max = self._float_from_env("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX)
        return self._backoff_max
@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    ValueError
        If the server responds to the OPTIONS request with an unexpected
        status code.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    try:
        ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True
        resp = requests.options(str(path), verify=verify, stream=False)
        if resp.status_code not in (requests.codes.ok, requests.codes.created):
            raise ValueError(
                f"Unexpected response to OPTIONS request for {path}, status: {resp.status_code} "
                f"{resp.reason}"
            )

        # Check that "1" is part of the value of the "DAV" header. We don't
        # use locks, so a server complying to class 1 is enough for our
        # purposes. All webDAV servers must advertise at least compliance
        # class "1".
        #
        # Compliance classes are documented in
        # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #    DAV: 1, 2
        #    DAV: 1, <http://apache.org/dav/propset/fs/1>
        if "DAV" not in resp.headers:
            return False

        # Convert to str to keep mypy happy.
        compliance_class = str(resp.headers.get("DAV"))
        return "1" in compliance_class.replace(" ", "").split(",")
    except requests.exceptions.SSLError:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify a bundle of certificate authorities you trust which are "
            "not included in the default set of trusted authorities of your "
            "system."
        )
        # Bare raise preserves the original exception and traceback.
        raise
# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system.
_TMPDIR: Optional[tuple[str, int]] = None


def _get_temp_dir() -> tuple[str, int]:
    """Return the temporary directory path and block size.

    This function caches its results in _TMPDIR.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
    # 'TMPDIR', if defined. Otherwise use current working directory.
    tmpdir = os.getcwd()
    # Renamed from `dir` to avoid shadowing the builtin.
    for candidate in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
        if candidate and os.path.isdir(candidate):
            tmpdir = candidate
            break

    # Compute the block size as 256 blocks of typical size
    # (i.e. 4096 bytes) or 10 times the file system block size,
    # whichever is higher. This is a reasonable compromise between
    # using memory for buffering and the number of system calls
    # issued to read from or write to temporary files.
    fsstats = os.statvfs(tmpdir)
    return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096)))
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        # `_token` holds the token value; `_path` is set only when the token
        # argument points at a local file, and `_mtime` tracks that file's
        # last-read modification time.
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if self._path is None:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # Nothing changed on disk since the last read.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as token_file:
            self._token = token_file.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        if self._token:
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep (one pool per remote host).
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Lower bound of the interval used to compute the exponential backoff
        factor when retrying requests (seconds).
    backoff_max : `float`, optional
        Upper bound of that interval (seconds). Must be greater than
        `backoff_min`; otherwise `backoff_min + 1.0` is used instead.
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # Number of connection pools to keep: there is one pool per remote
        # host. See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of connections per remote host to persist in each
        # connection pool. See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the
        # exponential backoff factor when retrying requests (seconds).
        # Guarantee that the upper bound is strictly above the lower bound.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that
        same endpoint. For instance, a single session will be cached and
        shared for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                ["COPY", "DELETE", "GET", "HEAD", "MKCOL", "OPTIONS", "PROPFIND", "PUT"]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so,
        # both LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must
        # be initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
573class HttpResourcePath(ResourcePath):
574 """General HTTP(S) resource.
576 Notes
577 -----
578 In order to configure the behavior of instances of this class, the
579 environment variables below are inspected:
581 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
582 "Expect: 100-Continue" header will be added to all HTTP PUT requests.
583 This header is required by some servers to detect if the client
584 knows how to handle redirections. In case of redirection, the body
585 of the PUT request is sent to the redirected location and not to
586 the front end server.
588 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a
589 numeric value, they are interpreted as the number of seconds to wait
590 for establishing a connection with the server and for reading its
591 response, respectively.
593 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and
594 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number
595 of connections to attempt to persist with both the front end servers
596 and the back end servers.
597 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and
598 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS.
600 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to
601 ask the server to compute for every file's content sent to the server
602 via a PUT request. No digest is requested if this variable is not set
603 or is set to an invalid value.
604 Valid values are those in ACCEPTED_DIGESTS.
605 """
607 _is_webdav: Optional[bool] = None
609 # Configuration items for this class instances.
610 _config = HttpResourcePathConfig()
612 # The session for metadata requests is used for interacting with
613 # the front end servers for requests such as PROPFIND, HEAD, etc. Those
614 # interactions are typically served by the front end servers. We want to
615 # keep the connection to the front end servers open, to reduce the cost
616 # associated to TCP and TLS handshaking for each new request.
617 _metadata_session_store = SessionStore(
618 num_pools=5,
619 max_persistent_connections=_config.front_end_connections,
620 backoff_min=_config.backoff_min,
621 backoff_max=_config.backoff_max,
622 )
624 # The data session is used for interaction with the front end servers which
625 # typically redirect to the back end servers for serving our PUT and GET
626 # requests. We attempt to keep a single connection open with the front end
627 # server, if possible. This depends on how the server behaves and the
628 # kind of request. Some servers close the connection when redirecting
629 # the client to a back end server, for instance when serving a PUT
630 # request.
631 _data_session_store = SessionStore(
632 num_pools=25,
633 max_persistent_connections=_config.back_end_connections,
634 backoff_min=_config.backoff_min,
635 backoff_max=_config.backoff_max,
636 )
638 # Process ID which created the sessions above. We need to store this
639 # to replace sessions created by a parent process and inherited by a
640 # child process after a fork, to avoid confusing the SSL layer.
641 _pid: int = -1
643 @property
644 def metadata_session(self) -> requests.Session:
645 """Client session to send requests which do not require upload or
646 download of data, i.e. mostly metadata requests.
647 """
649 if hasattr(self, "_metadata_session") and self._pid == os.getpid():
650 return self._metadata_session
652 # Reset the store in case it was created by another process and
653 # retrieve a session.
654 self._metadata_session_store.clear()
655 self._pid = os.getpid()
656 self._metadata_session: requests.Session = self._metadata_session_store.get(self)
657 return self._metadata_session
659 @property
660 def data_session(self) -> requests.Session:
661 """Client session for uploading and downloading data."""
663 if hasattr(self, "_data_session") and self._pid == os.getpid():
664 return self._data_session
666 # Reset the store in case it was created by another process and
667 # retrieve a session.
668 self._data_session_store.clear()
669 self._pid = os.getpid()
670 self._data_session: requests.Session = self._data_session_store.get(self)
671 return self._data_session
673 def _clear_sessions(self) -> None:
674 """Internal method to close the socket connections still open. Used
675 only in test suites to avoid warnings.
676 """
677 self._metadata_session_store.clear()
678 self._data_session_store.clear()
680 if hasattr(self, "_metadata_session"):
681 delattr(self, "_metadata_session")
683 if hasattr(self, "_data_session"):
684 delattr(self, "_data_session")
686 @property
687 def is_webdav_endpoint(self) -> bool:
688 """Check if the current endpoint implements WebDAV features.
690 This is stored per URI but cached by root so there is
691 only one check per hostname.
692 """
693 if self._is_webdav is not None:
694 return self._is_webdav
696 self._is_webdav = _is_webdav_endpoint(self.root_uri())
697 return self._is_webdav
699 def exists(self) -> bool:
700 """Check that a remote HTTP resource exists."""
701 log.debug("Checking if resource exists: %s", self.geturl())
702 if not self.is_webdav_endpoint:
703 # The remote is a plain HTTP server. Let's attempt a HEAD
704 # request, even if the behavior for such a request against a
705 # directory is not specified, so it depends on the server
706 # implementation.
707 resp = self.metadata_session.head(
708 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
709 )
710 return resp.status_code == requests.codes.ok # 200
712 # The remote endpoint is a webDAV server: send a PROPFIND request
713 # to determine if it exists.
714 resp = self._propfind()
715 if resp.status_code == requests.codes.multi_status: # 207
716 prop = _parse_propfind_response_body(resp.text)[0]
717 return prop.exists
718 else: # 404 Not Found
719 return False
721 def size(self) -> int:
722 """Return the size of the remote resource in bytes."""
723 if self.dirLike:
724 return 0
726 if not self.is_webdav_endpoint:
727 # The remote is a plain HTTP server. Send a HEAD request to
728 # retrieve the size of the resource.
729 resp = self.metadata_session.head(
730 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
731 )
732 if resp.status_code == requests.codes.ok: # 200
733 if "Content-Length" in resp.headers:
734 return int(resp.headers["Content-Length"])
735 else:
736 raise ValueError(
737 f"Response to HEAD request to {self} does not contain 'Content-Length' header"
738 )
739 elif resp.status_code == requests.codes.not_found:
740 raise FileNotFoundError(
741 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
742 )
743 else:
744 raise ValueError(
745 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
746 f"{resp.reason}"
747 )
749 # The remote is a webDAV server: send a PROPFIND request to retrieve
750 # the size of the resource. Sizes are only meaningful for files.
751 resp = self._propfind()
752 if resp.status_code == requests.codes.multi_status: # 207
753 prop = _parse_propfind_response_body(resp.text)[0]
754 if prop.is_file:
755 return prop.size
756 elif prop.is_directory:
757 raise IsADirectoryError(
758 f"Resource {self} is reported by server as a directory but has a file path"
759 )
760 else:
761 raise FileNotFoundError(f"Resource {self} does not exist")
762 else: # 404 Not Found
763 raise FileNotFoundError(
764 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
765 )
767 def mkdir(self) -> None:
768 """Create the directory resource if it does not already exist."""
769 # Creating directories is only available on WebDAV back ends.
770 if not self.is_webdav_endpoint:
771 raise NotImplementedError(
772 f"Creation of directory {self} is not implemented by plain HTTP servers"
773 )
775 if not self.dirLike:
776 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")
778 # Check if the target directory already exists.
779 resp = self._propfind()
780 if resp.status_code == requests.codes.multi_status: # 207
781 prop = _parse_propfind_response_body(resp.text)[0]
782 if prop.exists:
783 if prop.is_directory:
784 return
785 else:
786 # A file exists at this path
787 raise NotADirectoryError(
788 f"Can not create a directory for {self} because a file already exists at that path"
789 )
791 # Target directory does not exist. Create it and its ancestors as
792 # needed. We need to test if parent URL is different from self URL,
793 # otherwise we could be stuck in a recursive loop
794 # where self == parent.
795 if self.geturl() != self.parent().geturl():
796 self.parent().mkdir()
798 log.debug("Creating new directory: %s", self.geturl())
799 self._mkcol()
801 def remove(self) -> None:
802 """Remove the resource."""
803 self._delete()
805 def read(self, size: int = -1) -> bytes:
806 """Open the resource and return the contents in bytes.
808 Parameters
809 ----------
810 size : `int`, optional
811 The number of bytes to read. Negative or omitted indicates
812 that all data should be read.
813 """
815 # Use the data session as a context manager to ensure that the
816 # network connections to both the front end and back end servers are
817 # closed after downloading the data.
818 log.debug("Reading from remote resource: %s", self.geturl())
819 stream = True if size > 0 else False
820 with self.data_session as session:
821 with time_this(log, msg="GET %s", args=(self,)):
822 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout)
824 if resp.status_code != requests.codes.ok: # 200
825 raise FileNotFoundError(
826 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
827 )
828 if not stream:
829 return resp.content
830 else:
831 return next(resp.iter_content(chunk_size=size))
833 def write(self, data: bytes, overwrite: bool = True) -> None:
834 """Write the supplied bytes to the new resource.
836 Parameters
837 ----------
838 data : `bytes`
839 The bytes to write to the resource. The entire contents of the
840 resource will be replaced.
841 overwrite : `bool`, optional
842 If `True` the resource will be overwritten if it exists. Otherwise
843 the write will fail.
844 """
845 log.debug("Writing to remote resource: %s", self.geturl())
846 if not overwrite:
847 if self.exists():
848 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
850 # Ensure the parent directory exists.
851 self.parent().mkdir()
853 # Upload the data.
854 log.debug("Writing data to remote resource: %s", self.geturl())
855 self._put(data=data)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes in ``self.transferModes``; "copy", "move" and "auto" are
            handled explicitly below.
        overwrite : `bool`, optional
            If `True`, allow the destination to be overwritten when it
            already exists. Defaults to `False`, in which case an existing
            destination raises `FileExistsError`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if ``transfer`` is not a supported mode for this scheme.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()
922 def walk(
923 self, file_filter: Optional[Union[str, re.Pattern]] = None
924 ) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
925 """Walk the directory tree returning matching files and directories.
926 Parameters
927 ----------
928 file_filter : `str` or `re.Pattern`, optional
929 Regex to filter out files from the list before it is returned.
930 Yields
931 ------
932 dirpath : `ResourcePath`
933 Current directory being examined.
934 dirnames : `list` of `str`
935 Names of subdirectories within dirpath.
936 filenames : `list` of `str`
937 Names of all the files within dirpath.
938 """
939 if not self.dirLike:
940 raise ValueError("Can not walk a non-directory URI")
942 # Walking directories is only available on WebDAV back ends.
943 if not self.is_webdav_endpoint:
944 raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")
946 if isinstance(file_filter, str):
947 file_filter = re.compile(file_filter)
949 resp = self._propfind(depth="1")
950 if resp.status_code == requests.codes.multi_status: # 207
951 files: List[str] = []
952 dirs: List[str] = []
954 for prop in _parse_propfind_response_body(resp.text):
955 if prop.is_file:
956 files.append(prop.name)
957 elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
958 # Only include the names of sub-directories not the name of
959 # the directory being walked.
960 dirs.append(prop.name)
962 if file_filter is not None:
963 files = [f for f in files if file_filter.search(f)]
965 if not dirs and not files:
966 return
967 else:
968 yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files
970 for dir in dirs:
971 new_uri = self.join(dir, forceDirectory=True)
972 yield from new_uri.walk(file_filter)
974 def _as_local(self) -> Tuple[str, bool]:
975 """Download object over HTTP and place in temporary directory.
977 Returns
978 -------
979 path : `str`
980 Path to local temporary file.
981 temporary : `bool`
982 Always returns `True`. This is always a temporary file.
983 """
985 # Use the session as a context manager to ensure that connections
986 # to both the front end and back end servers are closed after the
987 # download operation is finished.
988 with self.data_session as session:
989 resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
990 if resp.status_code != requests.codes.ok:
991 raise FileNotFoundError(
992 f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
993 )
995 tmpdir, buffering = _get_temp_dir()
996 with tempfile.NamedTemporaryFile(
997 suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
998 ) as tmpFile:
999 expected_length = int(resp.headers.get("Content-Length", "-1"))
1000 with time_this(
1001 log,
1002 msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
1003 args=(self, expected_length, tmpFile.name, buffering),
1004 mem_usage=self._config.collect_memory_usage,
1005 mem_unit=u.mebibyte,
1006 ):
1007 content_length = 0
1008 for chunk in resp.iter_content(chunk_size=buffering):
1009 tmpFile.write(chunk)
1010 content_length += len(chunk)
1012 # Check that the expected and actual content lengths match. Perform
1013 # this check only when the contents of the file was not encoded by
1014 # the server.
1015 if "Content-Encoding" not in resp.headers:
1016 if expected_length >= 0 and expected_length != content_length:
1017 raise ValueError(
1018 f"Size of downloaded file does not match value in Content-Length header for {self}: "
1019 f"expecting {expected_length} and got {content_length} bytes"
1020 )
1022 return tmpFile.name, True
1024 def _send_webdav_request(
1025 self,
1026 method: str,
1027 url: Optional[str] = None,
1028 headers: dict[str, str] = {},
1029 body: Optional[str] = None,
1030 session: Optional[requests.Session] = None,
1031 timeout: Optional[tuple[float, float]] = None,
1032 ) -> requests.Response:
1033 """Send a webDAV request and correctly handle redirects.
1035 Parameters
1036 ----------
1037 method : `str`
1038 The mthod of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
1039 headers : `dict`, optional
1040 A dictionary of key-value pairs (both strings) to include as
1041 headers in the request.
1042 body: `str`, optional
1043 The body of the request.
1045 Notes
1046 -----
1047 This way of sending webDAV requests is necessary for handling
1048 redirection ourselves, since the 'requests' package changes the method
1049 of the redirected request when the server responds with status 302 and
1050 the method of the original request is not HEAD (which is the case for
1051 webDAV requests).
1053 That means that when the webDAV server we interact with responds with
1054 a redirection to a PROPFIND or MKCOL request, the request gets
1055 converted to a GET request when sent to the redirected location.
1057 See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
1058 https://github.com/psf/requests/blob/main/requests/sessions.py
1060 This behavior of the 'requests' package is meant to be compatible with
1061 what is specified in RFC 9110:
1063 https://www.rfc-editor.org/rfc/rfc9110#name-302-found
1065 For our purposes, we do need to follow the redirection and send a new
1066 request using the same HTTP verb.
1067 """
1068 if url is None:
1069 url = self.geturl()
1071 if session is None:
1072 session = self.metadata_session
1074 if timeout is None:
1075 timeout = self._config.timeout
1077 with time_this(
1078 log,
1079 msg="%s %s",
1080 args=(
1081 method,
1082 url,
1083 ),
1084 mem_usage=self._config.collect_memory_usage,
1085 mem_unit=u.mebibyte,
1086 ):
1087 for _ in range(max_redirects := 5):
1088 resp = session.request(
1089 method,
1090 url,
1091 data=body,
1092 headers=headers,
1093 stream=False,
1094 timeout=timeout,
1095 allow_redirects=False,
1096 )
1097 if resp.is_redirect:
1098 url = resp.headers["Location"]
1099 else:
1100 return resp
1102 # We reached the maximum allowed number of redirects.
1103 # Stop trying.
1104 raise ValueError(
1105 f"Could not get a response to {method} request for {self} after "
1106 f"{max_redirects} redirections"
1107 )
1109 def _propfind(self, body: Optional[str] = None, depth: str = "0") -> requests.Response:
1110 """Send a PROPFIND webDAV request and return the response.
1112 Parameters
1113 ----------
1114 body : `str`, optional
1115 The body of the PROPFIND request to send to the server. If
1116 provided, it is expected to be a XML document.
1117 depth : `str`, optional
1118 The value of the 'Depth' header to include in the request.
1120 Returns
1121 -------
1122 response : `requests.Response`
1123 Response to the PROPFIND request.
1125 Notes
1126 -----
1127 It raises `ValueError` if the status code of the PROPFIND request
1128 is different from "207 Multistatus" or "404 Not Found".
1129 """
1130 if body is None:
1131 # Request only the DAV live properties we are explicitly interested
1132 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
1133 # and 'displayname'.
1134 body = (
1135 """<?xml version="1.0" encoding="utf-8" ?>"""
1136 """<D:propfind xmlns:D="DAV:"><D:prop>"""
1137 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
1138 """</D:prop></D:propfind>"""
1139 )
1140 headers = {
1141 "Depth": depth,
1142 "Content-Type": 'application/xml; charset="utf-8"',
1143 "Content-Length": str(len(body)),
1144 }
1145 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
1146 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
1147 return resp
1148 else:
1149 raise ValueError(
1150 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
1151 f"{resp.reason}"
1152 )
1154 def _options(self) -> requests.Response:
1155 """Send a OPTIONS webDAV request for this resource."""
1156 resp = self._send_webdav_request("OPTIONS")
1157 if resp.status_code in (requests.codes.ok, requests.codes.created):
1158 return resp
1160 raise ValueError(
1161 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} " f"{resp.reason}"
1162 )
1164 def _head(self) -> requests.Response:
1165 """Send a HEAD webDAV request for this resource."""
1167 return self._send_webdav_request("HEAD")
1169 def _mkcol(self) -> None:
1170 """Send a MKCOL webDAV request to create a collection. The collection
1171 may already exist.
1172 """
1173 resp = self._send_webdav_request("MKCOL")
1174 if resp.status_code == requests.codes.created: # 201
1175 return
1177 if resp.status_code == requests.codes.method_not_allowed: # 405
1178 # The remote directory already exists
1179 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
1180 else:
1181 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")
1183 def _delete(self) -> None:
1184 """Send a DELETE webDAV request for this resource."""
1186 log.debug("Deleting %s ...", self.geturl())
1188 # If this is a directory, ensure the remote is a webDAV server because
1189 # plain HTTP servers don't support DELETE requests on non-file
1190 # paths.
1191 if self.dirLike and not self.is_webdav_endpoint:
1192 raise NotImplementedError(
1193 f"Deletion of directory {self} is not implemented by plain HTTP servers"
1194 )
1196 # Deleting non-empty directories may take some time, so increase
1197 # the timeout for getting a response from the server.
1198 timeout = self._config.timeout
1199 if self.dirLike:
1200 timeout = (timeout[0], timeout[1] * 100)
1201 resp = self._send_webdav_request("DELETE", timeout=timeout)
1202 if resp.status_code in (
1203 requests.codes.ok,
1204 requests.codes.accepted,
1205 requests.codes.no_content,
1206 requests.codes.not_found,
1207 ):
1208 # We can get a "404 Not Found" error when the file or directory
1209 # does not exist or when the DELETE request was retried several
1210 # times and a previous attempt actually deleted the resource.
1211 # Therefore we consider that a "Not Found" response is not an
1212 # error since we reached the state desired by the user.
1213 return
1214 else:
1215 # TODO: the response to a DELETE request against a webDAV server
1216 # may be multistatus. If so, we need to parse the reponse body to
1217 # determine more precisely the reason of the failure (e.g. a lock)
1218 # and provide a more helpful error message.
1219 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")
1221 def _copy_via_local(self, src: ResourcePath) -> None:
1222 """Replace the contents of this resource with the contents of a remote
1223 resource by using a local temporary file.
1225 Parameters
1226 ----------
1227 src : `HttpResourcePath`
1228 The source of the contents to copy to `self`.
1229 """
1230 with src.as_local() as local_uri:
1231 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
1232 with open(local_uri.ospath, "rb") as f:
1233 self._put(data=f)
1235 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
1236 """Send a COPY or MOVE webDAV request to copy or replace the contents
1237 of this resource with the contents of another resource located in the
1238 same server.
1240 Parameters
1241 ----------
1242 method : `str`
1243 The method to perform. Valid values are "COPY" or "MOVE" (in
1244 uppercase).
1246 src : `HttpResourcePath`
1247 The source of the contents to move to `self`.
1248 """
1249 headers = {"Destination": self.geturl()}
1250 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
1251 if resp.status_code in (requests.codes.created, requests.codes.no_content):
1252 return
1254 if resp.status_code == requests.codes.multi_status:
1255 tree = eTree.fromstring(resp.content)
1256 status_element = tree.find("./{DAV:}response/{DAV:}status")
1257 status = status_element.text if status_element is not None else "unknown"
1258 error = tree.find("./{DAV:}response/{DAV:}error")
1259 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
1260 else:
1261 raise ValueError(
1262 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
1263 )
1265 def _copy(self, src: HttpResourcePath) -> None:
1266 """Send a COPY webDAV request to replace the contents of this resource
1267 (if any) with the contents of another resource located in the same
1268 server.
1270 Parameters
1271 ----------
1272 src : `HttpResourcePath`
1273 The source of the contents to copy to `self`.
1274 """
1275 # Neither dCache nor XrootD currently implement the COPY
1276 # webDAV method as documented in
1277 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
1278 # (See issues DM-37603 and DM-37651 for details)
1279 #
1280 # For the time being, we use a temporary local file to
1281 # perform the copy client side.
1282 # TODO: when those 2 issues above are solved remove the 3 lines below.
1283 must_use_local = True
1284 if must_use_local:
1285 return self._copy_via_local(src)
1287 return self._copy_or_move("COPY", src)
1289 def _move(self, src: HttpResourcePath) -> None:
1290 """Send a MOVE webDAV request to replace the contents of this resource
1291 with the contents of another resource located in the same server.
1293 Parameters
1294 ----------
1295 src : `HttpResourcePath`
1296 The source of the contents to move to `self`.
1297 """
1298 return self._copy_or_move("MOVE", src)
    def _put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request and handle redirection.

        The upload happens in two phases: an empty PUT to discover the final
        upload URL (following at most one redirect), then the actual PUT of
        ``data`` to that URL.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the final PUT does not complete with a success status
            (200, 201 or 204).
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            # "Expect: 100-continue" lets the server redirect us before any
            # payload would be transmitted.
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: Optional[dict[str, str]] = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")
1386 @contextlib.contextmanager
1387 def _openImpl(
1388 self,
1389 mode: str = "r",
1390 *,
1391 encoding: Optional[str] = None,
1392 ) -> Iterator[ResourceHandleProtocol]:
1393 resp = self._head()
1394 accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
1395 handle: ResourceHandleProtocol
1396 if mode in ("rb", "r") and accepts_range:
1397 handle = HttpReadResourceHandle(
1398 mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
1399 )
1400 if mode == "r":
1401 # cast because the protocol is compatible, but does not have
1402 # BytesIO in the inheritance tree
1403 yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
1404 else:
1405 yield handle
1406 else:
1407 with super()._openImpl(mode, encoding=encoding) as http_handle:
1408 yield http_handle
def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    if request.method == "PUT":
        # Request payloads are potentially huge; never dump them.
        log.debug(" body=<data>")
    elif request.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", request.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        # Non-text bodies are truncated to keep the log readable.
        log.debug(" body=%r", resp.content[:80])
1446def _is_protected(filepath: str) -> bool:
1447 """Return true if the permissions of file at filepath only allow for access
1448 by its owner.
1450 Parameters
1451 ----------
1452 filepath : `str`
1453 Path of a local file.
1454 """
1455 if not os.path.isfile(filepath):
1456 return False
1457 mode = stat.S_IMODE(os.stat(filepath).st_mode)
1458 owner_accessible = bool(mode & stat.S_IRWXU)
1459 group_accessible = bool(mode & stat.S_IRWXG)
1460 other_accessible = bool(mode & stat.S_IRWXO)
1461 return owner_accessible and not group_accessible and not other_accessible
def _parse_propfind_response_body(body: str) -> List[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`

    Raises
    ------
    ValueError
        Raised if ``body`` does not contain at least one 'response' element.
    """
    # A PROPFIND response body is a 'multistatus' element holding one
    # 'response' element per resource, e.g. (indented for readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #   <D:response>
    #     <D:href>path/to/resource</D:href>
    #     <D:propstat>
    #       <D:prop>
    #         <D:resourcetype>
    #           <D:collection xmlns:D="DAV:"/>
    #         </D:resourcetype>
    #         <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
    #         <D:getcontentlength>12345</D:getcontentlength>
    #       </D:prop>
    #       <D:status>HTTP/1.1 200 OK</D:status>
    #     </D:propstat>
    #   </D:response>
    #   <D:response>...</D:response>
    # </D:multistatus>

    # Collect the relevant properties of every 'response' element.
    multistatus = eTree.fromstring(body.strip())
    properties = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]

    if not properties:
        # The body could not be parsed into a single response.
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return properties
class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.
    """

    # Matches the 'status' line of a 'propstat' element when the contained
    # properties were successfully retrieved.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: Optional[eTree.Element]):
        # Values extracted from the PROPFIND response; the defaults below
        # mean "absent".
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    def _parse(self, response: eTree.Element) -> None:
        # 'href' is mandatory: fail loudly when it is missing.
        href_node = response.find("./{DAV:}href")
        if href_node is None:
            raise ValueError(
                f"Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )
        # Wrap in str() to keep mypy happy about the Optional text attribute.
        self._href = str(href_node.text).strip()

        for propstat in response.findall("./{DAV:}propstat"):
            # Skip property groups whose retrieval did not succeed.
            status_node = propstat.find("./{DAV:}status")
            if status_node is None or not self._status_ok_rex.match(str(status_node.text)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # A 'collection' child of 'resourcetype' marks a directory.
                if prop.find("./{DAV:}resourcetype/{DAV:}collection") is not None:
                    self._collection = True

                if (node := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = str(node.text)

                if (node := prop.find("./{DAV:}getcontentlength")) is not None:
                    self._getcontentlength = int(str(node.text))

                if (node := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = str(node.text)

        # Some webDAV servers don't include the 'displayname' property in the
        # response, so fall back to the last component of 'href' (which,
        # depending on the server, may end with '/').
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Collections are always reported with size zero.
        if self._collection:
            self._getcontentlength = 0

    @property
    def exists(self) -> bool:
        # Either a collection, or a file whose length was reported (>= 0).
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        # True when the resource is a webDAV collection.
        return self._collection

    @property
    def is_file(self) -> bool:
        # Anything that is not a collection is considered a file.
        return not self._collection

    @property
    def size(self) -> int:
        # Size in bytes; 0 for collections, -1 when unknown.
        return self._getcontentlength

    @property
    def name(self) -> str:
        # Display name reported by the server, or derived from 'href'.
        return self._displayname

    @property
    def href(self) -> str:
        # Raw 'href' value as sent by the server.
        return self._href