Coverage for python/lsst/resources/http.py: 23%
590 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-19 11:17 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-19 11:17 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import math
21import os
22import os.path
23import random
24import re
25import stat
26import tempfile
27from collections.abc import Iterator
28from typing import TYPE_CHECKING, BinaryIO, cast
30try:
31 # Prefer 'defusedxml' (not part of standard library) if available, since
32 # 'xml' is vulnerable to XML bombs.
33 import defusedxml.ElementTree as eTree
34except ImportError:
35 import xml.etree.ElementTree as eTree
37from urllib.parse import parse_qs
39import requests
40from astropy import units as u
41from lsst.utils.timer import time_this
42from requests.adapters import HTTPAdapter
43from requests.auth import AuthBase
44from urllib3.util.retry import Retry
46from ._resourceHandles import ResourceHandleProtocol
47from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle, parse_content_range_header
48from ._resourcePath import ResourcePath
50if TYPE_CHECKING:
51 from .utils import TransactionProtocol
53log = logging.getLogger(__name__)
56def _timeout_from_environment(env_var: str, default_value: float) -> float:
57 """Convert and return a timeout from the value of an environment variable
58 or a default value if the environment variable is not initialized. The
59 value of `env_var` must be a valid `float` otherwise this function raises.
61 Parameters
62 ----------
63 env_var : `str`
64 Environment variable to look for.
65 default_value : `float``
66 Value to return if `env_var` is not defined in the environment.
68 Returns
69 -------
70 _timeout_from_environment : `float`
71 Converted value.
72 """
73 try:
74 timeout = float(os.environ.get(env_var, default_value))
75 except ValueError:
76 raise ValueError(
77 f"Expecting valid timeout value in environment variable {env_var} but found "
78 f"{os.environ.get(env_var)}"
79 ) from None
81 if math.isnan(timeout):
82 raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}")
84 return timeout
87class HttpResourcePathConfig:
88 """Configuration class to encapsulate the configurable items used by class
89 HttpResourcePath.
90 """
92 # Default timeouts for all HTTP requests (seconds).
93 DEFAULT_TIMEOUT_CONNECT = 30.0
94 DEFAULT_TIMEOUT_READ = 1_500.0
96 # Default lower and upper bounds for the backoff interval (seconds).
97 # A value in this interval is randomly selected as the backoff factor when
98 # requests need to be retried.
99 DEFAULT_BACKOFF_MIN = 1.0
100 DEFAULT_BACKOFF_MAX = 3.0
102 # Default number of connections to persist with both the front end and
103 # back end servers.
104 DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
105 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1
107 # Accepted digest algorithms
108 ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")
110 _front_end_connections: int | None = None
111 _back_end_connections: int | None = None
112 _digest_algorithm: str | None = None
113 _send_expect_on_put: bool | None = None
114 _timeout: tuple[float, float] | None = None
115 _collect_memory_usage: bool | None = None
116 _backoff_min: float | None = None
117 _backoff_max: float | None = None
119 @property
120 def front_end_connections(self) -> int:
121 """Number of persistent connections to the front end server."""
122 if self._front_end_connections is not None: 122 ↛ 123line 122 didn't jump to line 123, because the condition on line 122 was never true
123 return self._front_end_connections
125 try:
126 self._front_end_connections = int(
127 os.environ.get(
128 "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
129 )
130 )
131 except ValueError:
132 self._front_end_connections = self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
134 return self._front_end_connections
136 @property
137 def back_end_connections(self) -> int:
138 """Number of persistent connections to the back end servers."""
139 if self._back_end_connections is not None: 139 ↛ 140line 139 didn't jump to line 140, because the condition on line 139 was never true
140 return self._back_end_connections
142 try:
143 self._back_end_connections = int(
144 os.environ.get(
145 "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
146 )
147 )
148 except ValueError:
149 self._back_end_connections = self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
151 return self._back_end_connections
153 @property
154 def digest_algorithm(self) -> str:
155 """Algorithm to ask the server to use for computing and recording
156 digests of each file contents in PUT requests.
158 Returns
159 -------
160 digest_algorithm: `str`
161 The name of a digest algorithm or the empty string if no algotihm
162 is configured.
163 """
164 if self._digest_algorithm is not None:
165 return self._digest_algorithm
167 digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
168 if digest not in self.ACCEPTED_DIGESTS:
169 digest = ""
171 self._digest_algorithm = digest
172 return self._digest_algorithm
174 @property
175 def send_expect_on_put(self) -> bool:
176 """Return True if a "Expect: 100-continue" header is to be sent to
177 the server on each PUT request.
179 Some servers (e.g. dCache) uses this information as an indication that
180 the client knows how to handle redirects to the specific server that
181 will actually receive the data for PUT requests.
182 """
183 if self._send_expect_on_put is not None:
184 return self._send_expect_on_put
186 self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
187 return self._send_expect_on_put
189 @property
190 def timeout(self) -> tuple[float, float]:
191 """Return a tuple with the values of timeouts for connecting to the
192 server and reading its response, respectively. Both values are in
193 seconds.
194 """
195 if self._timeout is not None:
196 return self._timeout
198 self._timeout = (
199 _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT),
200 _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ),
201 )
202 return self._timeout
204 @property
205 def collect_memory_usage(self) -> bool:
206 """Return true if we want to collect memory usage when timing
207 operations against the remote server via the `lsst.utils.time_this`
208 context manager.
209 """
210 if self._collect_memory_usage is not None:
211 return self._collect_memory_usage
213 self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
214 return self._collect_memory_usage
216 @property
217 def backoff_min(self) -> float:
218 """Lower bound of the interval from which a backoff factor is randomly
219 selected when retrying requests (seconds).
220 """
221 if self._backoff_min is not None:
222 return self._backoff_min
224 self._backoff_min = self.DEFAULT_BACKOFF_MIN
225 try:
226 backoff_min = float(os.environ.get("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN))
227 if not math.isnan(backoff_min): 227 ↛ 232line 227 didn't jump to line 232, because the condition on line 227 was never false
228 self._backoff_min = backoff_min
229 except ValueError:
230 pass
232 return self._backoff_min
234 @property
235 def backoff_max(self) -> float:
236 """Upper bound of the interval from which a backoff factor is randomly
237 selected when retrying requests (seconds).
238 """
239 if self._backoff_max is not None:
240 return self._backoff_max
242 self._backoff_max = self.DEFAULT_BACKOFF_MAX
243 try:
244 backoff_max = float(os.environ.get("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX))
245 if not math.isnan(backoff_max): 245 ↛ 250line 245 didn't jump to line 250, because the condition on line 245 was never false
246 self._backoff_max = backoff_max
247 except ValueError:
248 pass
250 return self._backoff_max
@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        If the server certificate cannot be verified against the trusted
        certificate authorities.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Configure retries for the OPTIONS request.
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(
            [
                "OPTIONS",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )

        if resp.status_code not in (requests.codes.ok, requests.codes.created):
            return False

        # Check that "1" is part of the value of the "DAV" header. We don't
        # use locks, so a server complying to class 1 is enough for our
        # purposes. All webDAV servers must advertise at least compliance
        # class "1".
        #
        # Compliance classes are documented in
        # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #    DAV: 1, 2
        #    DAV: 1, <http://apache.org/dav/propset/fs/1>
        if "DAV" not in resp.headers:
            return False

        # Convert to str to keep mypy happy.
        compliance_class = str(resp.headers.get("DAV"))
        return "1" in compliance_class.replace(" ", "").split(",")
    except requests.exceptions.SSLError:
        # Fixed typo in the original message ("tha path" -> "the path").
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        # Bare 'raise' preserves the original traceback (no re-wrapping).
        raise
353# Tuple (path, block_size) pointing to the location of a local directory
354# to save temporary files and the block size of the underlying file system.
355_TMPDIR: tuple[str, int] | None = None
358def _get_temp_dir() -> tuple[str, int]:
359 """Return the temporary directory path and block size.
361 This function caches its results in _TMPDIR.
362 """
363 global _TMPDIR
364 if _TMPDIR:
365 return _TMPDIR
367 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
368 # 'TMPDIR', if defined. Otherwise use current working directory.
369 tmpdir = os.getcwd()
370 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
371 if dir and os.path.isdir(dir):
372 tmpdir = dir
373 break
375 # Compute the block size as 256 blocks of typical size
376 # (i.e. 4096 bytes) or 10 times the file system block size,
377 # whichever is higher. This is a reasonable compromise between
378 # using memory for buffering and the number of system calls
379 # issued to read from or write to temporary files.
380 fsstats = os.statvfs(tmpdir)
381 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096)))
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Either the token value itself, or the path to a local protected
        file which contains the token value.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            # Empty token: this auth object becomes a no-op.
            return

        self._token = token
        if os.path.isfile(token):
            # The argument is a path to a token file: remember it so the
            # token can be re-read when the file changes.
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Re-read the token file (if any) when its modification time is
        more recent than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            self._mtime = mtime
            with open(self._path) as f:
                self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Bearer tokens are only ever attached over secure HTTP.
        url = req.url
        if self._token and url and url.lower().startswith("https://"):
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep: there is one pool per remote
        host.
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Lower bound of the interval used to compute the exponential
        backoff factor when retrying requests (seconds).
    backoff_max : `float`, optional
        Upper bound of the interval used to compute the exponential
        backoff factor when retrying requests (seconds).
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Maps a root URI (scheme + authority) to its cached session.
        self._sessions: dict[str, requests.Session] = {}

        # Number of urllib3 connection pools, one per remote host.
        # See the urllib3 PoolManager documentation:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of persistent connections per pool. See the
        # urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Bounds of the interval from which the exponential backoff factor
        # is drawn when retrying requests (seconds). Guarantee the interval
        # is never empty.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.

        Closing a session should close its idle connections as a side
        effect; the API exposed by Requests offers no way to force-close
        the underlying open sockets.
        """
        for cached in self._sessions.values():
            cached.close()
        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is
            to be retrieved.

        Notes
        -----
        A single session is created per endpoint and then shared by every
        path under that endpoint: "https://www.example.org/path/to/file"
        and "https://www.example.org/any/other/path" use the same cached
        session, whereas "https://www.example.org" and
        "https://www.example.org:12345" use different sessions since their
        port numbers differ.

        The session is configured by inspecting these environment
        variables:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to
          a local file containing a bearer token to be used as the client
          authentication mechanism with all requests. The permissions of
          the token file must be set so that only its owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains
          the client certificate for authenticating to the server. If
          initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file, whose
          permissions must allow access only by its owner.
        """
        root_uri = str(rpath.root_uri())
        session = self._sessions.get(root_uri)
        if session is None:
            # No session cached for this endpoint yet: create one.
            session = self._make_session(rpath)
            self._sessions[root_uri] = session
        return session

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)

        retry_policy = Retry(
            # Total number of retries to allow. Takes precedence over the
            # other counts below.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second
            # try (seconds). A random jitter prevents all the clients from
            # overwhelming the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # Only idempotent requests are retried automatically.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect the Retry-After header on the status codes
            # defined above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end
        # server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retry_policy,
            ),
        )

        # Do not persist the connections to back end servers, which may
        # vary from request to request: systematically persisting
        # connections to those servers may exhaust their capabilities when
        # there are thousands of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retry_policy,
            ),
        )

        # Over insecure HTTP we neither include bearer tokens in the
        # requests nor authenticate the remote server.
        if rpath.scheme != "https":
            return session

        # Server authentication: use a specific CA cert bundle when one is
        # configured.
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Client authentication, by order of precedence: bearer token
        # first...
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # ...then client certificate and private key, which must both be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
658class HttpResourcePath(ResourcePath):
659 """General HTTP(S) resource.
661 Notes
662 -----
663 In order to configure the behavior of instances of this class, the
664 environment variables below are inspected:
666 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
667 "Expect: 100-Continue" header will be added to all HTTP PUT requests.
668 This header is required by some servers to detect if the client
669 knows how to handle redirections. In case of redirection, the body
670 of the PUT request is sent to the redirected location and not to
671 the front end server.
673 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a
674 numeric value, they are interpreted as the number of seconds to wait
675 for establishing a connection with the server and for reading its
676 response, respectively.
678 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and
679 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number
680 of connections to attempt to persist with both the front end servers
681 and the back end servers.
682 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and
683 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS.
685 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to
686 ask the server to compute for every file's content sent to the server
687 via a PUT request. No digest is requested if this variable is not set
688 or is set to an invalid value.
689 Valid values are those in ACCEPTED_DIGESTS.
690 """
    # Lazily-computed flag recording whether this endpoint implements
    # WebDAV features (None until the first check, see is_webdav_endpoint).
    _is_webdav: bool | None = None

    # Configuration items for this class instances.
    _config = HttpResourcePathConfig()

    # The session for metadata requests is used for interacting with
    # the front end servers for requests such as PROPFIND, HEAD, etc. Those
    # interactions are typically served by the front end servers. We want to
    # keep the connection to the front end servers open, to reduce the cost
    # associated to TCP and TLS handshaking for each new request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # The data session is used for interaction with the front end servers which
    # typically redirect to the back end servers for serving our PUT and GET
    # requests. We attempt to keep a single connection open with the front end
    # server, if possible. This depends on how the server behaves and the
    # kind of request. Some servers close the connection when redirecting
    # the client to a back end server, for instance when serving a PUT
    # request.
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. We need to store this
    # to replace sessions created by a parent process and inherited by a
    # child process after a fork, to avoid confusing the SSL layer.
    _pid: int = -1
728 @property
729 def metadata_session(self) -> requests.Session:
730 """Client session to send requests which do not require upload or
731 download of data, i.e. mostly metadata requests.
732 """
733 if hasattr(self, "_metadata_session"):
734 if HttpResourcePath._pid == os.getpid():
735 return self._metadata_session
736 else:
737 # The metadata session we have in cache was likely created by
738 # a parent process. Discard all the sessions in that store.
739 self._metadata_session_store.clear()
741 # Retrieve a new metadata session.
742 HttpResourcePath._pid = os.getpid()
743 self._metadata_session: requests.Session = self._metadata_session_store.get(self)
744 return self._metadata_session
746 @property
747 def data_session(self) -> requests.Session:
748 """Client session for uploading and downloading data."""
749 if hasattr(self, "_data_session"):
750 if HttpResourcePath._pid == os.getpid():
751 return self._data_session
752 else:
753 # The data session we have in cache was likely created by
754 # a parent process. Discard all the sessions in that store.
755 self._data_session_store.clear()
757 # Retrieve a new data session.
758 HttpResourcePath._pid = os.getpid()
759 self._data_session: requests.Session = self._data_session_store.get(self)
760 return self._data_session
762 def _clear_sessions(self) -> None:
763 """Close the socket connections that are still open.
765 Used only in test suites to avoid warnings.
766 """
767 self._metadata_session_store.clear()
768 self._data_session_store.clear()
770 if hasattr(self, "_metadata_session"):
771 delattr(self, "_metadata_session")
773 if hasattr(self, "_data_session"):
774 delattr(self, "_data_session")
776 @property
777 def is_webdav_endpoint(self) -> bool:
778 """Check if the current endpoint implements WebDAV features.
780 This is stored per URI but cached by root so there is
781 only one check per hostname.
782 """
783 if self._is_webdav is not None:
784 return self._is_webdav
786 self._is_webdav = _is_webdav_endpoint(self.root_uri())
787 return self._is_webdav
789 def exists(self) -> bool:
790 """Check that a remote HTTP resource exists."""
791 log.debug("Checking if resource exists: %s", self.geturl())
792 if not self.is_webdav_endpoint:
793 # The remote is a plain HTTP server. Let's attempt a HEAD
794 # request, even if the behavior for such a request against a
795 # directory is not specified, so it depends on the server
796 # implementation.
797 resp = self._head_non_webdav_url()
798 return self._is_successful_non_webdav_head_request(resp)
800 # The remote endpoint is a webDAV server: send a PROPFIND request
801 # to determine if it exists.
802 resp = self._propfind()
803 if resp.status_code == requests.codes.multi_status: # 207
804 prop = _parse_propfind_response_body(resp.text)[0]
805 return prop.exists
806 else: # 404 Not Found
807 return False
    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Raises
        ------
        FileNotFoundError
            If the remote resource does not exist.
        IsADirectoryError
            If this file-like path is reported by the server as a
            directory (webDAV only).
        ValueError
            If the server's response does not allow determining the size.
        """
        # Directory-like URIs are conventionally reported with size zero.
        if self.dirLike:
            return 0

        if not self.is_webdav_endpoint:
            # The remote is a plain HTTP server. Send a HEAD request to
            # retrieve the size of the resource.
            resp = self._head_non_webdav_url()
            if resp.status_code == requests.codes.ok:  # 200
                if "Content-Length" in resp.headers:
                    return int(resp.headers["Content-Length"])
                else:
                    raise ValueError(
                        f"Response to HEAD request to {self} does not contain 'Content-Length' header"
                    )
            elif resp.status_code == requests.codes.partial_content:
                # 206, returned from a GET request with a Range header (used to
                # emulate HEAD for presigned S3 URLs). In this case
                # Content-Length is the length of the Range and not the full
                # length of the file, so we have to parse Content-Range
                # instead.
                content_range_header = resp.headers.get("Content-Range")
                if content_range_header is None:
                    raise ValueError(
                        f"Response to GET request to {self} did not contain 'Content-Range' header"
                    )
                content_range = parse_content_range_header(content_range_header)
                size = content_range.total
                if size is None:
                    raise ValueError(f"Content-Range header for {self} did not include a total file size")
                return size
            elif resp.status_code == requests.codes.not_found:
                raise FileNotFoundError(
                    f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
                )
            else:
                raise ValueError(
                    f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
                    f"{resp.reason}"
                )

        # The remote is a webDAV server: send a PROPFIND request to retrieve
        # the size of the resource. Sizes are only meaningful for files.
        resp = self._propfind()
        if resp.status_code == requests.codes.multi_status:  # 207
            prop = _parse_propfind_response_body(resp.text)[0]
            if prop.is_file:
                return prop.size
            elif prop.is_directory:
                raise IsADirectoryError(
                    f"Resource {self} is reported by server as a directory but has a file path"
                )
            else:
                raise FileNotFoundError(f"Resource {self} does not exist")
        else:  # 404 Not Found
            raise FileNotFoundError(
                f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
            )
870 def _head_non_webdav_url(self) -> requests.Response:
871 """Return a response from a HTTP HEAD request for a non-WebDAV HTTP
872 URL.
874 Emulates HEAD using a 0-byte GET for presigned S3 URLs.
875 """
876 if self._looks_like_presigned_s3_url():
877 # Presigned S3 URLs are signed for a single method only, so you
878 # can't call HEAD on a URL signed for GET. However, S3 does
879 # support Range requests, so you can ask for a 0-byte range with
880 # GET for a similar effect to HEAD.
881 #
882 # Note that some headers differ between a true HEAD request and the
883 # response returned by this GET, e.g. Content-Length will always be
884 # 0, and the status code is 206 instead of 200.
885 return self.metadata_session.get(
886 self.geturl(),
887 timeout=self._config.timeout,
888 allow_redirects=True,
889 stream=False,
890 headers={"Range": "bytes=0-0"},
891 )
892 else:
893 return self.metadata_session.head(
894 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
895 )
897 def _is_successful_non_webdav_head_request(self, resp: requests.Response) -> bool:
898 """Return `True` if the status code in the response indicates a
899 successful HEAD or GET request.
900 """
901 return resp.status_code in (
902 requests.codes.ok, # 200, from a normal HEAD or GET request
903 requests.codes.partial_content, # 206, returned from a GET request with a Range header.
904 )
906 def _looks_like_presigned_s3_url(self) -> bool:
907 """Return `True` if this ResourcePath's URL is likely to be a presigned
908 S3 URL.
909 """
910 query_params = parse_qs(self._uri.query)
911 return "Signature" in query_params and "Expires" in query_params
    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Ancestor directories are created as needed, similarly to
        ``mkdir -p``.

        Raises
        ------
        NotImplementedError
            Raised if the remote endpoint is not a webDAV server.
        NotADirectoryError
            Raised if this URI is file-like, or if a file already exists
            at this path.
        """
        # Creating directories is only available on WebDAV back ends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError(
                f"Creation of directory {self} is not implemented by plain HTTP servers"
            )

        if not self.dirLike:
            raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

        # Check if the target directory already exists.
        resp = self._propfind()
        if resp.status_code == requests.codes.multi_status:  # 207
            prop = _parse_propfind_response_body(resp.text)[0]
            if prop.exists:
                if prop.is_directory:
                    # Nothing to do: the directory is already there.
                    return
                else:
                    # A file exists at this path
                    raise NotADirectoryError(
                        f"Can not create a directory for {self} because a file already exists at that path"
                    )

        # Target directory does not exist. Create it and its ancestors as
        # needed. We need to test if parent URL is different from self URL,
        # otherwise we could be stuck in a recursive loop
        # where self == parent.
        if self.geturl() != self.parent().geturl():
            self.parent().mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        self._mkcol()
947 def remove(self) -> None:
948 """Remove the resource."""
949 self._delete()
951 def read(self, size: int = -1) -> bytes:
952 """Open the resource and return the contents in bytes.
954 Parameters
955 ----------
956 size : `int`, optional
957 The number of bytes to read. Negative or omitted indicates
958 that all data should be read.
959 """
960 # Use the data session as a context manager to ensure that the
961 # network connections to both the front end and back end servers are
962 # closed after downloading the data.
963 log.debug("Reading from remote resource: %s", self.geturl())
964 stream = size > 0
965 with self.data_session as session:
966 with time_this(log, msg="GET %s", args=(self,)):
967 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout)
969 if resp.status_code != requests.codes.ok: # 200
970 raise FileNotFoundError(
971 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
972 )
973 if not stream:
974 return resp.content
975 else:
976 return next(resp.iter_content(chunk_size=size))
978 def write(self, data: bytes, overwrite: bool = True) -> None:
979 """Write the supplied bytes to the new resource.
981 Parameters
982 ----------
983 data : `bytes`
984 The bytes to write to the resource. The entire contents of the
985 resource will be replaced.
986 overwrite : `bool`, optional
987 If `True` the resource will be overwritten if it exists. Otherwise
988 the write will fail.
989 """
990 log.debug("Writing to remote resource: %s", self.geturl())
991 if not overwrite and self.exists():
992 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
994 # Ensure the parent directory exists.
995 # This is only meaningful and appropriate for WebDAV, not the general
996 # HTTP case. e.g. for S3 HTTP URLs, the underlying service has no
997 # concept of 'directories' at all.
998 if self.is_webdav_endpoint:
999 self.parent().mkdir()
1001 # Upload the data.
1002 log.debug("Writing data to remote resource: %s", self.geturl())
1003 self._put(data=data)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Whether overwriting the remote resource is allowed or not.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported by this scheme.
        FileExistsError
            Raised if the destination exists and `overwrite` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not directory-like.
        NotImplementedError
            Raised if the remote endpoint is not a webDAV server.
        """
        if not self.dirLike:
            raise ValueError("Can not walk a non-directory URI")

        # Walking directories is only available on WebDAV back ends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        # A PROPFIND request with depth "1" describes this directory and its
        # immediate children.
        resp = self._propfind(depth="1")
        if resp.status_code == requests.codes.multi_status:  # 207
            files: list[str] = []
            dirs: list[str] = []

            for prop in _parse_propfind_response_body(resp.text):
                if prop.is_file:
                    files.append(prop.name)
                elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
                    # Only include the names of sub-directories not the name of
                    # the directory being walked.
                    dirs.append(prop.name)

            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]

            if not dirs and not files:
                return
            else:
                yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

            # Recurse into each sub-directory, depth first.
            for dir in dirs:
                new_uri = self.join(dir, forceDirectory=True)
                yield from new_uri.walk(file_filter)
    def _as_local(self) -> tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the resource could not be downloaded.
        ValueError
            Raised if the size of the downloaded file does not match the
            Content-Length header sent by the server.
        """
        # Use the session as a context manager to ensure that connections
        # to both the front end and back end servers are closed after the
        # download operation is finished.
        with self.data_session as session:
            resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
            if resp.status_code != requests.codes.ok:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
                )

            tmpdir, buffering = _get_temp_dir()
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                # -1 signals that the server did not send a Content-Length
                # header.
                expected_length = int(resp.headers.get("Content-Length", "-1"))
                with time_this(
                    log,
                    msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                    args=(self, expected_length, tmpFile.name, buffering),
                    mem_usage=self._config.collect_memory_usage,
                    mem_unit=u.mebibyte,
                ):
                    # Stream the body in chunks, counting the bytes actually
                    # received.
                    content_length = 0
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
                        content_length += len(chunk)

            # Check that the expected and actual content lengths match. Perform
            # this check only when the contents of the file was not encoded by
            # the server.
            if (
                "Content-Encoding" not in resp.headers
                and expected_length >= 0
                and expected_length != content_length
            ):
                raise ValueError(
                    f"Size of downloaded file does not match value in Content-Length header for {self}: "
                    f"expecting {expected_length} and got {content_length} bytes"
                )

            return tmpFile.name, True
    def _send_webdav_request(
        self,
        method: str,
        url: str | None = None,
        headers: dict[str, str] | None = None,
        body: str | None = None,
        session: requests.Session | None = None,
        timeout: tuple[float, float] | None = None,
    ) -> requests.Response:
        """Send a webDAV request and correctly handle redirects.

        Parameters
        ----------
        method : `str`
            The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
        url : `str`, optional
            URL to send the request to. Defaults to this resource's URL.
        headers : `dict`, optional
            A dictionary of key-value pairs (both strings) to include as
            headers in the request.
        body : `str`, optional
            The body of the request.
        session : `requests.Session`, optional
            Session used to send the request. Defaults to the metadata
            session.
        timeout : `tuple` of `float`, optional
            Connect and read timeouts. Defaults to the configured timeouts.

        Returns
        -------
        response : `requests.Response`
            The first non-redirect response received.

        Raises
        ------
        ValueError
            Raised if the maximum number of redirections is exceeded.

        Notes
        -----
        This way of sending webDAV requests is necessary for handling
        redirection ourselves, since the 'requests' package changes the method
        of the redirected request when the server responds with status 302 and
        the method of the original request is not HEAD (which is the case for
        webDAV requests).

        That means that when the webDAV server we interact with responds with
        a redirection to a PROPFIND or MKCOL request, the request gets
        converted to a GET request when sent to the redirected location.

        See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

        This behavior of the 'requests' package is meant to be compatible with
        what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

        For our purposes, we do need to follow the redirection and send a new
        request using the same HTTP verb.
        """
        if url is None:
            url = self.geturl()

        if headers is None:
            headers = {}

        if session is None:
            session = self.metadata_session

        if timeout is None:
            timeout = self._config.timeout

        with time_this(
            log,
            msg="%s %s",
            args=(
                method,
                url,
            ),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            # Follow redirections manually so the original HTTP verb is
            # preserved, up to a bounded number of hops.
            for _ in range(max_redirects := 5):
                resp = session.request(
                    method,
                    url,
                    data=body,
                    headers=headers,
                    stream=False,
                    timeout=timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]
                else:
                    return resp

            # We reached the maximum allowed number of redirects.
            # Stop trying.
            raise ValueError(
                f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
            )
    def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response:
        """Send a PROPFIND webDAV request and return the response.

        Parameters
        ----------
        body : `str`, optional
            The body of the PROPFIND request to send to the server. If
            provided, it is expected to be a XML document.
        depth : `str`, optional
            The value of the 'Depth' header to include in the request.

        Returns
        -------
        response : `requests.Response`
            Response to the PROPFIND request.

        Raises
        ------
        ValueError
            Raised if the status code of the PROPFIND response is different
            from "207 Multistatus" or "404 Not Found".
        """
        if body is None:
            # Request only the DAV live properties we are explicitly interested
            # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
            # and 'displayname'.
            body = (
                """<?xml version="1.0" encoding="utf-8" ?>"""
                """<D:propfind xmlns:D="DAV:"><D:prop>"""
                """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
                """</D:prop></D:propfind>"""
            )
        headers = {
            "Depth": depth,
            "Content-Type": 'application/xml; charset="utf-8"',
            "Content-Length": str(len(body)),
        }
        resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
        if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
            return resp
        else:
            raise ValueError(
                f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
                f"{resp.reason}"
            )
1310 def _options(self) -> requests.Response:
1311 """Send a OPTIONS webDAV request for this resource."""
1312 resp = self._send_webdav_request("OPTIONS")
1313 if resp.status_code in (requests.codes.ok, requests.codes.created):
1314 return resp
1316 raise ValueError(
1317 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}"
1318 )
1320 def _head(self) -> requests.Response:
1321 """Send a HEAD webDAV request for this resource."""
1322 return self._send_webdav_request("HEAD")
1324 def _mkcol(self) -> None:
1325 """Send a MKCOL webDAV request to create a collection. The collection
1326 may already exist.
1327 """
1328 resp = self._send_webdav_request("MKCOL")
1329 if resp.status_code == requests.codes.created: # 201
1330 return
1332 if resp.status_code == requests.codes.method_not_allowed: # 405
1333 # The remote directory already exists
1334 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
1335 else:
1336 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")
    def _delete(self) -> None:
        """Send a DELETE webDAV request for this resource.

        Raises
        ------
        NotImplementedError
            Raised when attempting to delete a directory served by a plain
            HTTP (non-webDAV) server.
        ValueError
            Raised if the server response indicates the deletion failed.
        """
        log.debug("Deleting %s ...", self.geturl())

        # If this is a directory, ensure the remote is a webDAV server because
        # plain HTTP servers don't support DELETE requests on non-file
        # paths.
        if self.dirLike and not self.is_webdav_endpoint:
            raise NotImplementedError(
                f"Deletion of directory {self} is not implemented by plain HTTP servers"
            )

        # Deleting non-empty directories may take some time, so increase
        # the timeout for getting a response from the server.
        timeout = self._config.timeout
        if self.dirLike:
            # Keep the connect timeout, scale up only the read timeout.
            timeout = (timeout[0], timeout[1] * 100)
        resp = self._send_webdav_request("DELETE", timeout=timeout)
        if resp.status_code in (
            requests.codes.ok,
            requests.codes.accepted,
            requests.codes.no_content,
            requests.codes.not_found,
        ):
            # We can get a "404 Not Found" error when the file or directory
            # does not exist or when the DELETE request was retried several
            # times and a previous attempt actually deleted the resource.
            # Therefore we consider that a "Not Found" response is not an
            # error since we reached the state desired by the user.
            return
        else:
            # TODO: the response to a DELETE request against a webDAV server
            # may be multistatus. If so, we need to parse the response body to
            # determine more precisely the reason of the failure (e.g. a lock)
            # and provide a more helpful error message.
            raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")
1375 def _copy_via_local(self, src: ResourcePath) -> None:
1376 """Replace the contents of this resource with the contents of a remote
1377 resource by using a local temporary file.
1379 Parameters
1380 ----------
1381 src : `HttpResourcePath`
1382 The source of the contents to copy to `self`.
1383 """
1384 with src.as_local() as local_uri:
1385 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
1386 with open(local_uri.ospath, "rb") as f:
1387 self._put(data=f)
    def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
        """Send a COPY or MOVE webDAV request to copy or replace the contents
        of this resource with the contents of another resource located in the
        same server.

        Parameters
        ----------
        method : `str`
            The method to perform. Valid values are "COPY" or "MOVE" (in
            uppercase).
        src : `HttpResourcePath`
            The source of the contents to move to `self`.

        Raises
        ------
        ValueError
            Raised if the server response indicates the operation failed.
        """
        # The webDAV COPY/MOVE request is sent to the *source* URL, with the
        # target URL carried in the 'Destination' header.
        headers = {"Destination": self.geturl()}
        resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
        if resp.status_code in (requests.codes.created, requests.codes.no_content):
            return

        if resp.status_code == requests.codes.multi_status:
            # A 207 Multi-Status response embeds per-resource status in an
            # XML body; extract it to build a more helpful error message.
            tree = eTree.fromstring(resp.content)
            status_element = tree.find("./{DAV:}response/{DAV:}status")
            status = status_element.text if status_element is not None else "unknown"
            error = tree.find("./{DAV:}response/{DAV:}error")
            raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
        else:
            raise ValueError(
                f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
            )
1418 def _copy(self, src: HttpResourcePath) -> None:
1419 """Send a COPY webDAV request to replace the contents of this resource
1420 (if any) with the contents of another resource located in the same
1421 server.
1423 Parameters
1424 ----------
1425 src : `HttpResourcePath`
1426 The source of the contents to copy to `self`.
1427 """
1428 # Neither dCache nor XrootD currently implement the COPY
1429 # webDAV method as documented in
1430 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
1431 # (See issues DM-37603 and DM-37651 for details)
1432 #
1433 # For the time being, we use a temporary local file to
1434 # perform the copy client side.
1435 # TODO: when those 2 issues above are solved remove the 3 lines below.
1436 must_use_local = True
1437 if must_use_local:
1438 return self._copy_via_local(src)
1440 return self._copy_or_move("COPY", src)
1442 def _move(self, src: HttpResourcePath) -> None:
1443 """Send a MOVE webDAV request to replace the contents of this resource
1444 with the contents of another resource located in the same server.
1446 Parameters
1447 ----------
1448 src : `HttpResourcePath`
1449 The source of the contents to move to `self`.
1450 """
1451 return self._copy_or_move("MOVE", src)
    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the upload request does not succeed.
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            # Ask the server to reply with "100 Continue" before the body is
            # sent, so any redirection happens before data transfer.
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")
    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[ResourceHandleProtocol]:
        """Open the resource and yield a file-like handle.

        Parameters
        ----------
        mode : `str`, optional
            Mode in which to open the resource. Only "r" and "rb" can be
            served by the HTTP range-request handle.
        encoding : `str`, optional
            Text encoding, used only when `mode` is "r".

        Yields
        ------
        handle : `ResourceHandleProtocol`
            A handle backed by HTTP range requests when the server
            advertises 'Accept-Ranges: bytes' for this resource; otherwise
            whatever the base-class implementation provides.
        """
        resp = self._head()
        # Range-based reads are only possible when the resource exists and
        # the server accepts byte ranges for it.
        accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
        handle: ResourceHandleProtocol
        if mode in ("rb", "r") and accepts_range:
            handle = HttpReadResourceHandle(
                mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
            )
            if mode == "r":
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
            else:
                yield handle
        else:
            # Fall back to the default implementation.
            with super()._openImpl(mode, encoding=encoding) as http_handle:
                yield http_handle
def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    if request.method == "PUT":
        # Avoid dumping potentially large upload payloads.
        log.debug(" body=<data>")
    elif request.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", request.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        # Truncate non-text bodies.
        log.debug(" body=%r", resp.content[:80])
1599def _is_protected(filepath: str) -> bool:
1600 """Return true if the permissions of file at filepath only allow for access
1601 by its owner.
1603 Parameters
1604 ----------
1605 filepath : `str`
1606 Path of a local file.
1607 """
1608 if not os.path.isfile(filepath):
1609 return False
1610 mode = stat.S_IMODE(os.stat(filepath).st_mode)
1611 owner_accessible = bool(mode & stat.S_IRWXU)
1612 group_accessible = bool(mode & stat.S_IRWXG)
1613 other_accessible = bool(mode & stat.S_IRWXO)
1614 return owner_accessible and not group_accessible and not other_accessible
def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'response' element found in `body`.

    Raises
    ------
    ValueError
        Raised if `body` contains no 'response' element.

    Notes
    -----
    A PROPFIND response body is a 'multistatus' XML document (namespace
    "DAV:") containing one 'response' element per resource; each
    'response' carries an 'href' and one or more 'propstat' elements
    holding the requested properties ('resourcetype', 'getlastmodified',
    'getcontentlength', ...) together with a per-group status such as
    "HTTP/1.1 200 OK".
    """
    # Scan every 'response' element and extract the relevant properties.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]

    if not responses:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")

    return responses
class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.

    Parameters
    ----------
    response : `eTree.Element` or `None`
        The XML response defining the DAV property.
    """

    # Matches the 'status' line of a 'propstat' element when the enclosed
    # properties were retrieved successfully.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: eTree.Element | None):
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    def _parse(self, response: eTree.Element) -> None:
        # Extract 'href', which is mandatory.
        href_element = response.find("./{DAV:}href")
        if href_element is None:
            raise ValueError(
                "Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )
        # str() around Element.text keeps mypy happy (text is Optional).
        self._href = str(href_element.text).strip()

        for propstat in response.findall("./{DAV:}propstat"):
            # Only consider property groups whose status is OK.
            status = propstat.find("./{DAV:}status")
            if status is None or self._status_ok_rex.match(str(status.text)) is None:
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # Parse "collection".
                if prop.find("./{DAV:}resourcetype/{DAV:}collection") is not None:
                    self._collection = True

                # Parse "getlastmodified".
                modified = prop.find("./{DAV:}getlastmodified")
                if modified is not None:
                    self._getlastmodified = str(modified.text)

                # Parse "getcontentlength".
                length = prop.find("./{DAV:}getcontentlength")
                if length is not None:
                    self._getcontentlength = int(str(length.text))

                # Parse "displayname".
                display = prop.find("./{DAV:}displayname")
                if display is not None:
                    self._displayname = str(display.text)

        # Some webDAV servers don't include the 'displayname' property in the
        # response, so fall back to the last component of 'href' (which may
        # or may not end with '/', depending on the server).
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Force a size of 0 for collections.
        if self._collection:
            self._getcontentlength = 0

    @property
    def exists(self) -> bool:
        # It is either a directory or a file with length of at least zero
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        return self._collection

    @property
    def is_file(self) -> bool:
        return not self._collection

    @property
    def size(self) -> int:
        return self._getcontentlength

    @property
    def name(self) -> str:
        return self._displayname

    @property
    def href(self) -> str:
        return self._href