Coverage for python/lsst/resources/http.py: 23%
571 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-09 11:30 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-09 11:30 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import math
21import os
22import os.path
23import random
24import re
25import stat
26import tempfile
27from collections.abc import Iterator
28from typing import TYPE_CHECKING, BinaryIO, cast
30try:
31 # Prefer 'defusedxml' (not part of standard library) if available, since
32 # 'xml' is vulnerable to XML bombs.
33 import defusedxml.ElementTree as eTree
34except ImportError:
35 import xml.etree.ElementTree as eTree
37import requests
38from astropy import units as u
39from lsst.utils.timer import time_this
40from requests.adapters import HTTPAdapter
41from requests.auth import AuthBase
42from urllib3.util.retry import Retry
44from ._resourceHandles import ResourceHandleProtocol
45from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
46from ._resourcePath import ResourcePath
48if TYPE_CHECKING:
49 from .utils import TransactionProtocol
51log = logging.getLogger(__name__)
def _timeout_from_environment(env_var: str, default_value: float) -> float:
    """Convert and return a timeout from the value of an environment variable
    or a default value if the environment variable is not initialized. The
    value of `env_var` must be a valid `float` otherwise this function raises.

    Parameters
    ----------
    env_var : `str`
        Environment variable to look for.
    default_value : `float`
        Value to return if `env_var` is not defined in the environment.

    Returns
    -------
    _timeout_from_environment : `float`
        Converted value.

    Raises
    ------
    ValueError
        If the environment variable is set to a value which cannot be
        converted to a float, or converts to NaN.
    """
    raw_value = os.environ.get(env_var, default_value)
    try:
        timeout = float(raw_value)
    except ValueError:
        raise ValueError(
            f"Expecting valid timeout value in environment variable {env_var} but found "
            f"{os.environ.get(env_var)}"
        ) from None

    # NaN compares unequal to everything and would silently break timeout
    # comparisons downstream, so reject it explicitly.
    if math.isnan(timeout):
        raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}")

    return timeout
85class HttpResourcePathConfig:
86 """Configuration class to encapsulate the configurable items used by class
87 HttpResourcePath.
88 """
90 # Default timeouts for all HTTP requests (seconds).
91 DEFAULT_TIMEOUT_CONNECT = 30.0
92 DEFAULT_TIMEOUT_READ = 1_500.0
94 # Default lower and upper bounds for the backoff interval (seconds).
95 # A value in this interval is randomly selected as the backoff factor when
96 # requests need to be retried.
97 DEFAULT_BACKOFF_MIN = 1.0
98 DEFAULT_BACKOFF_MAX = 3.0
100 # Default number of connections to persist with both the front end and
101 # back end servers.
102 DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
103 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1
105 # Accepted digest algorithms
106 ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")
108 _front_end_connections: int | None = None
109 _back_end_connections: int | None = None
110 _digest_algorithm: str | None = None
111 _send_expect_on_put: bool | None = None
112 _timeout: tuple[float, float] | None = None
113 _collect_memory_usage: bool | None = None
114 _backoff_min: float | None = None
115 _backoff_max: float | None = None
117 @property
118 def front_end_connections(self) -> int:
119 """Number of persistent connections to the front end server."""
120 if self._front_end_connections is not None: 120 ↛ 121line 120 didn't jump to line 121, because the condition on line 120 was never true
121 return self._front_end_connections
123 try:
124 self._front_end_connections = int(
125 os.environ.get(
126 "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
127 )
128 )
129 except ValueError:
130 self._front_end_connections = self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
132 return self._front_end_connections
134 @property
135 def back_end_connections(self) -> int:
136 """Number of persistent connections to the back end servers."""
137 if self._back_end_connections is not None: 137 ↛ 138line 137 didn't jump to line 138, because the condition on line 137 was never true
138 return self._back_end_connections
140 try:
141 self._back_end_connections = int(
142 os.environ.get(
143 "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
144 )
145 )
146 except ValueError:
147 self._back_end_connections = self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
149 return self._back_end_connections
151 @property
152 def digest_algorithm(self) -> str:
153 """Algorithm to ask the server to use for computing and recording
154 digests of each file contents in PUT requests.
156 Returns
157 -------
158 digest_algorithm: `str`
159 The name of a digest algorithm or the empty string if no algotihm
160 is configured.
161 """
162 if self._digest_algorithm is not None:
163 return self._digest_algorithm
165 digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
166 if digest not in self.ACCEPTED_DIGESTS:
167 digest = ""
169 self._digest_algorithm = digest
170 return self._digest_algorithm
172 @property
173 def send_expect_on_put(self) -> bool:
174 """Return True if a "Expect: 100-continue" header is to be sent to
175 the server on each PUT request.
177 Some servers (e.g. dCache) uses this information as an indication that
178 the client knows how to handle redirects to the specific server that
179 will actually receive the data for PUT requests.
180 """
181 if self._send_expect_on_put is not None:
182 return self._send_expect_on_put
184 self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
185 return self._send_expect_on_put
187 @property
188 def timeout(self) -> tuple[float, float]:
189 """Return a tuple with the values of timeouts for connecting to the
190 server and reading its response, respectively. Both values are in
191 seconds.
192 """
193 if self._timeout is not None:
194 return self._timeout
196 self._timeout = (
197 _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT),
198 _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ),
199 )
200 return self._timeout
202 @property
203 def collect_memory_usage(self) -> bool:
204 """Return true if we want to collect memory usage when timing
205 operations against the remote server via the `lsst.utils.time_this`
206 context manager.
207 """
208 if self._collect_memory_usage is not None:
209 return self._collect_memory_usage
211 self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
212 return self._collect_memory_usage
214 @property
215 def backoff_min(self) -> float:
216 """Lower bound of the interval from which a backoff factor is randomly
217 selected when retrying requests (seconds).
218 """
219 if self._backoff_min is not None:
220 return self._backoff_min
222 self._backoff_min = self.DEFAULT_BACKOFF_MIN
223 try:
224 backoff_min = float(os.environ.get("LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN))
225 if not math.isnan(backoff_min): 225 ↛ 230line 225 didn't jump to line 230, because the condition on line 225 was never false
226 self._backoff_min = backoff_min
227 except ValueError:
228 pass
230 return self._backoff_min
232 @property
233 def backoff_max(self) -> float:
234 """Upper bound of the interval from which a backoff factor is randomly
235 selected when retrying requests (seconds).
236 """
237 if self._backoff_max is not None:
238 return self._backoff_max
240 self._backoff_max = self.DEFAULT_BACKOFF_MAX
241 try:
242 backoff_max = float(os.environ.get("LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX))
243 if not math.isnan(backoff_max): 243 ↛ 248line 243 didn't jump to line 248, because the condition on line 243 was never false
244 self._backoff_max = backoff_max
245 except ValueError:
246 pass
248 return self._backoff_max
@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        If the server certificate cannot be verified against the trusted
        certificate authorities.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Create a session for configuring retries
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(
            [
                "OPTIONS",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )
            if resp.status_code not in (requests.codes.ok, requests.codes.created):
                return False

            # Check that "1" is part of the value of the "DAV" header. We don't
            # use locks, so a server complying to class 1 is enough for our
            # purposes. All webDAV servers must advertise at least compliance
            # class "1".
            #
            # Compliance classes are documented in
            # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
            #
            # Examples of values for header DAV are:
            #    DAV: 1, 2
            #    DAV: 1, <http://apache.org/dav/propset/fs/1>
            if "DAV" not in resp.headers:
                return False
            else:
                # Convert to str to keep mypy happy
                compliance_class = str(resp.headers.get("DAV"))
                return "1" in compliance_class.replace(" ", "").split(",")
    except requests.exceptions.SSLError as e:
        # Fixed typo in the user-facing message ("tha path" -> "the path").
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        raise e
351# Tuple (path, block_size) pointing to the location of a local directory
352# to save temporary files and the block size of the underlying file system.
353_TMPDIR: tuple[str, int] | None = None
356def _get_temp_dir() -> tuple[str, int]:
357 """Return the temporary directory path and block size.
359 This function caches its results in _TMPDIR.
360 """
361 global _TMPDIR
362 if _TMPDIR:
363 return _TMPDIR
365 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
366 # 'TMPDIR', if defined. Otherwise use current working directory.
367 tmpdir = os.getcwd()
368 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
369 if dir and os.path.isdir(dir):
370 tmpdir = dir
371 break
373 # Compute the block size as 256 blocks of typical size
374 # (i.e. 4096 bytes) or 10 times the file system block size,
375 # whichever is higher. This is a reasonable compromise between
376 # using memory for buffering and the number of system calls
377 # issued to read from or write to temporary files.
378 fsstats = os.statvfs(tmpdir)
379 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096)))
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token: str | None = None
        self._path: str | None = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            # The argument is a path to a token file: record its absolute
            # path and require owner-only access permissions before reading.
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            # File unchanged since the last read: keep the cached token.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Only add a bearer token to a request when using secure HTTP.
        url = req.url
        if self._token and url and url.lower().startswith("https://"):
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep: there is one pool per remote
        host.
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Minimum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    backoff_max : `float`, optional
        Maximum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds). If not strictly
        greater than ``backoff_min``, the value ``backoff_min + 1.0`` is
        used instead.
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # Number of distinct connection pools kept by each adapter (one pool
        # per remote host). See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of connections kept alive in each pool.
        # See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the exponential
        # backoff factor when retrying requests (seconds). Push the upper
        # bound above the lower bound when needed so the interval used for
        # random jitter is never empty.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Returns
        -------
        get : `requests.Session`
            The session cached for the endpoint of ``rpath``, created on
            first use.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        # The cache key is the root URI (scheme + host + port), so every
        # path under the same endpoint shares one session.
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        # NOTE: a bearer token takes precedence over a client certificate,
        # as documented in get().
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    In order to configure the behavior of instances of this class, the
    environment variables below are inspected:

    - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
      "Expect: 100-Continue" header will be added to all HTTP PUT requests.
      This header is required by some servers to detect if the client
      knows how to handle redirections. In case of redirection, the body
      of the PUT request is sent to the redirected location and not to
      the front end server.

    - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a
      numeric value, they are interpreted as the number of seconds to wait
      for establishing a connection with the server and for reading its
      response, respectively.

    - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and
      LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number
      of connections to attempt to persist with both the front end servers
      and the back end servers.
      Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and
      DEFAULT_BACKEND_PERSISTENT_CONNECTIONS.

    - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to
      ask the server to compute for every file's content sent to the server
      via a PUT request. No digest is requested if this variable is not set
      or is set to an invalid value.
      Valid values are those in ACCEPTED_DIGESTS.
    """

    # Cached result of the WebDAV-capability probe for this URI; None means
    # the probe has not run yet (see is_webdav_endpoint).
    _is_webdav: bool | None = None

    # Configuration items for this class instances.
    _config = HttpResourcePathConfig()

    # The session for metadata requests is used for interacting with
    # the front end servers for requests such as PROPFIND, HEAD, etc. Those
    # interactions are typically served by the front end servers. We want to
    # keep the connection to the front end servers open, to reduce the cost
    # associated to TCP and TLS handshaking for each new request.
    _metadata_session_store = SessionStore(
        num_pools=5,
        max_persistent_connections=_config.front_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # The data session is used for interaction with the front end servers which
    # typically redirect to the back end servers for serving our PUT and GET
    # requests. We attempt to keep a single connection open with the front end
    # server, if possible. This depends on how the server behaves and the
    # kind of request. Some servers close the connection when redirecting
    # the client to a back end server, for instance when serving a PUT
    # request.
    _data_session_store = SessionStore(
        num_pools=25,
        max_persistent_connections=_config.back_end_connections,
        backoff_min=_config.backoff_min,
        backoff_max=_config.backoff_max,
    )

    # Process ID which created the session stores above. We need to store this
    # to replace sessions created by a parent process and inherited by a
    # child process after a fork, to avoid confusing the SSL layer.
    _pid: int = -1
726 @property
727 def metadata_session(self) -> requests.Session:
728 """Client session to send requests which do not require upload or
729 download of data, i.e. mostly metadata requests.
730 """
731 if hasattr(self, "_metadata_session"):
732 if HttpResourcePath._pid == os.getpid():
733 return self._metadata_session
734 else:
735 # The metadata session we have in cache was likely created by
736 # a parent process. Discard all the sessions in that store.
737 self._metadata_session_store.clear()
739 # Retrieve a new metadata session.
740 HttpResourcePath._pid = os.getpid()
741 self._metadata_session: requests.Session = self._metadata_session_store.get(self)
742 return self._metadata_session
744 @property
745 def data_session(self) -> requests.Session:
746 """Client session for uploading and downloading data."""
747 if hasattr(self, "_data_session"):
748 if HttpResourcePath._pid == os.getpid():
749 return self._data_session
750 else:
751 # The data session we have in cache was likely created by
752 # a parent process. Discard all the sessions in that store.
753 self._data_session_store.clear()
755 # Retrieve a new data session.
756 HttpResourcePath._pid = os.getpid()
757 self._data_session: requests.Session = self._data_session_store.get(self)
758 return self._data_session
760 def _clear_sessions(self) -> None:
761 """Close the socket connections that are still open.
763 Used only in test suites to avoid warnings.
764 """
765 self._metadata_session_store.clear()
766 self._data_session_store.clear()
768 if hasattr(self, "_metadata_session"):
769 delattr(self, "_metadata_session")
771 if hasattr(self, "_data_session"):
772 delattr(self, "_data_session")
774 @property
775 def is_webdav_endpoint(self) -> bool:
776 """Check if the current endpoint implements WebDAV features.
778 This is stored per URI but cached by root so there is
779 only one check per hostname.
780 """
781 if self._is_webdav is not None:
782 return self._is_webdav
784 self._is_webdav = _is_webdav_endpoint(self.root_uri())
785 return self._is_webdav
787 def exists(self) -> bool:
788 """Check that a remote HTTP resource exists."""
789 log.debug("Checking if resource exists: %s", self.geturl())
790 if not self.is_webdav_endpoint:
791 # The remote is a plain HTTP server. Let's attempt a HEAD
792 # request, even if the behavior for such a request against a
793 # directory is not specified, so it depends on the server
794 # implementation.
795 resp = self.metadata_session.head(
796 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
797 )
798 return resp.status_code == requests.codes.ok # 200
800 # The remote endpoint is a webDAV server: send a PROPFIND request
801 # to determine if it exists.
802 resp = self._propfind()
803 if resp.status_code == requests.codes.multi_status: # 207
804 prop = _parse_propfind_response_body(resp.text)[0]
805 return prop.exists
806 else: # 404 Not Found
807 return False
809 def size(self) -> int:
810 """Return the size of the remote resource in bytes."""
811 if self.dirLike:
812 return 0
814 if not self.is_webdav_endpoint:
815 # The remote is a plain HTTP server. Send a HEAD request to
816 # retrieve the size of the resource.
817 resp = self.metadata_session.head(
818 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
819 )
820 if resp.status_code == requests.codes.ok: # 200
821 if "Content-Length" in resp.headers:
822 return int(resp.headers["Content-Length"])
823 else:
824 raise ValueError(
825 f"Response to HEAD request to {self} does not contain 'Content-Length' header"
826 )
827 elif resp.status_code == requests.codes.not_found:
828 raise FileNotFoundError(
829 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
830 )
831 else:
832 raise ValueError(
833 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
834 f"{resp.reason}"
835 )
837 # The remote is a webDAV server: send a PROPFIND request to retrieve
838 # the size of the resource. Sizes are only meaningful for files.
839 resp = self._propfind()
840 if resp.status_code == requests.codes.multi_status: # 207
841 prop = _parse_propfind_response_body(resp.text)[0]
842 if prop.is_file:
843 return prop.size
844 elif prop.is_directory:
845 raise IsADirectoryError(
846 f"Resource {self} is reported by server as a directory but has a file path"
847 )
848 else:
849 raise FileNotFoundError(f"Resource {self} does not exist")
850 else: # 404 Not Found
851 raise FileNotFoundError(
852 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
853 )
855 def mkdir(self) -> None:
856 """Create the directory resource if it does not already exist."""
857 # Creating directories is only available on WebDAV back ends.
858 if not self.is_webdav_endpoint:
859 raise NotImplementedError(
860 f"Creation of directory {self} is not implemented by plain HTTP servers"
861 )
863 if not self.dirLike:
864 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")
866 # Check if the target directory already exists.
867 resp = self._propfind()
868 if resp.status_code == requests.codes.multi_status: # 207
869 prop = _parse_propfind_response_body(resp.text)[0]
870 if prop.exists:
871 if prop.is_directory:
872 return
873 else:
874 # A file exists at this path
875 raise NotADirectoryError(
876 f"Can not create a directory for {self} because a file already exists at that path"
877 )
879 # Target directory does not exist. Create it and its ancestors as
880 # needed. We need to test if parent URL is different from self URL,
881 # otherwise we could be stuck in a recursive loop
882 # where self == parent.
883 if self.geturl() != self.parent().geturl():
884 self.parent().mkdir()
886 log.debug("Creating new directory: %s", self.geturl())
887 self._mkcol()
    def remove(self) -> None:
        """Remove the resource.

        Deletion is delegated to the internal `_delete()` helper.
        """
        self._delete()
893 def read(self, size: int = -1) -> bytes:
894 """Open the resource and return the contents in bytes.
896 Parameters
897 ----------
898 size : `int`, optional
899 The number of bytes to read. Negative or omitted indicates
900 that all data should be read.
901 """
902 # Use the data session as a context manager to ensure that the
903 # network connections to both the front end and back end servers are
904 # closed after downloading the data.
905 log.debug("Reading from remote resource: %s", self.geturl())
906 stream = size > 0
907 with self.data_session as session:
908 with time_this(log, msg="GET %s", args=(self,)):
909 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout)
911 if resp.status_code != requests.codes.ok: # 200
912 raise FileNotFoundError(
913 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
914 )
915 if not stream:
916 return resp.content
917 else:
918 return next(resp.iter_content(chunk_size=size))
920 def write(self, data: bytes, overwrite: bool = True) -> None:
921 """Write the supplied bytes to the new resource.
923 Parameters
924 ----------
925 data : `bytes`
926 The bytes to write to the resource. The entire contents of the
927 resource will be replaced.
928 overwrite : `bool`, optional
929 If `True` the resource will be overwritten if it exists. Otherwise
930 the write will fail.
931 """
932 log.debug("Writing to remote resource: %s", self.geturl())
933 if not overwrite and self.exists():
934 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
936 # Ensure the parent directory exists.
937 # This is only meaningful and appropriate for WebDAV, not the general
938 # HTTP case. e.g. for S3 HTTP URLs, the underlying service has no
939 # concept of 'directories' at all.
940 if self.is_webdav_endpoint:
941 self.parent().mkdir()
943 # Upload the data.
944 log.debug("Writing data to remote resource: %s", self.geturl())
945 self._put(data=data)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Whether overwriting the remote resource is allowed or not.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            If `transfer` is not one of the supported transfer modes.
        FileExistsError
            If the destination already exists and `overwrite` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        # Resolve "auto" to this scheme's default transfer mode.
        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()
1014 def walk(
1015 self, file_filter: str | re.Pattern | None = None
1016 ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
1017 """Walk the directory tree returning matching files and directories.
1019 Parameters
1020 ----------
1021 file_filter : `str` or `re.Pattern`, optional
1022 Regex to filter out files from the list before it is returned.
1024 Yields
1025 ------
1026 dirpath : `ResourcePath`
1027 Current directory being examined.
1028 dirnames : `list` of `str`
1029 Names of subdirectories within dirpath.
1030 filenames : `list` of `str`
1031 Names of all the files within dirpath.
1032 """
1033 if not self.dirLike:
1034 raise ValueError("Can not walk a non-directory URI")
1036 # Walking directories is only available on WebDAV back ends.
1037 if not self.is_webdav_endpoint:
1038 raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")
1040 if isinstance(file_filter, str):
1041 file_filter = re.compile(file_filter)
1043 resp = self._propfind(depth="1")
1044 if resp.status_code == requests.codes.multi_status: # 207
1045 files: list[str] = []
1046 dirs: list[str] = []
1048 for prop in _parse_propfind_response_body(resp.text):
1049 if prop.is_file:
1050 files.append(prop.name)
1051 elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
1052 # Only include the names of sub-directories not the name of
1053 # the directory being walked.
1054 dirs.append(prop.name)
1056 if file_filter is not None:
1057 files = [f for f in files if file_filter.search(f)]
1059 if not dirs and not files:
1060 return
1061 else:
1062 yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files
1064 for dir in dirs:
1065 new_uri = self.join(dir, forceDirectory=True)
1066 yield from new_uri.walk(file_filter)
1068 def _as_local(self) -> tuple[str, bool]:
1069 """Download object over HTTP and place in temporary directory.
1071 Returns
1072 -------
1073 path : `str`
1074 Path to local temporary file.
1075 temporary : `bool`
1076 Always returns `True`. This is always a temporary file.
1077 """
1078 # Use the session as a context manager to ensure that connections
1079 # to both the front end and back end servers are closed after the
1080 # download operation is finished.
1081 with self.data_session as session:
1082 resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
1083 if resp.status_code != requests.codes.ok:
1084 raise FileNotFoundError(
1085 f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
1086 )
1088 tmpdir, buffering = _get_temp_dir()
1089 with tempfile.NamedTemporaryFile(
1090 suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
1091 ) as tmpFile:
1092 expected_length = int(resp.headers.get("Content-Length", "-1"))
1093 with time_this(
1094 log,
1095 msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
1096 args=(self, expected_length, tmpFile.name, buffering),
1097 mem_usage=self._config.collect_memory_usage,
1098 mem_unit=u.mebibyte,
1099 ):
1100 content_length = 0
1101 for chunk in resp.iter_content(chunk_size=buffering):
1102 tmpFile.write(chunk)
1103 content_length += len(chunk)
1105 # Check that the expected and actual content lengths match. Perform
1106 # this check only when the contents of the file was not encoded by
1107 # the server.
1108 if (
1109 "Content-Encoding" not in resp.headers
1110 and expected_length >= 0
1111 and expected_length != content_length
1112 ):
1113 raise ValueError(
1114 f"Size of downloaded file does not match value in Content-Length header for {self}: "
1115 f"expecting {expected_length} and got {content_length} bytes"
1116 )
1118 return tmpFile.name, True
1120 def _send_webdav_request(
1121 self,
1122 method: str,
1123 url: str | None = None,
1124 headers: dict[str, str] | None = None,
1125 body: str | None = None,
1126 session: requests.Session | None = None,
1127 timeout: tuple[float, float] | None = None,
1128 ) -> requests.Response:
1129 """Send a webDAV request and correctly handle redirects.
1131 Parameters
1132 ----------
1133 method : `str`
1134 The mthod of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
1135 headers : `dict`, optional
1136 A dictionary of key-value pairs (both strings) to include as
1137 headers in the request.
1138 body : `str`, optional
1139 The body of the request.
1141 Notes
1142 -----
1143 This way of sending webDAV requests is necessary for handling
1144 redirection ourselves, since the 'requests' package changes the method
1145 of the redirected request when the server responds with status 302 and
1146 the method of the original request is not HEAD (which is the case for
1147 webDAV requests).
1149 That means that when the webDAV server we interact with responds with
1150 a redirection to a PROPFIND or MKCOL request, the request gets
1151 converted to a GET request when sent to the redirected location.
1153 See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
1154 https://github.com/psf/requests/blob/main/requests/sessions.py
1156 This behavior of the 'requests' package is meant to be compatible with
1157 what is specified in RFC 9110:
1159 https://www.rfc-editor.org/rfc/rfc9110#name-302-found
1161 For our purposes, we do need to follow the redirection and send a new
1162 request using the same HTTP verb.
1163 """
1164 if url is None:
1165 url = self.geturl()
1167 if headers is None:
1168 headers = {}
1170 if session is None:
1171 session = self.metadata_session
1173 if timeout is None:
1174 timeout = self._config.timeout
1176 with time_this(
1177 log,
1178 msg="%s %s",
1179 args=(
1180 method,
1181 url,
1182 ),
1183 mem_usage=self._config.collect_memory_usage,
1184 mem_unit=u.mebibyte,
1185 ):
1186 for _ in range(max_redirects := 5):
1187 resp = session.request(
1188 method,
1189 url,
1190 data=body,
1191 headers=headers,
1192 stream=False,
1193 timeout=timeout,
1194 allow_redirects=False,
1195 )
1196 if resp.is_redirect:
1197 url = resp.headers["Location"]
1198 else:
1199 return resp
1201 # We reached the maximum allowed number of redirects.
1202 # Stop trying.
1203 raise ValueError(
1204 f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
1205 )
1207 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response:
1208 """Send a PROPFIND webDAV request and return the response.
1210 Parameters
1211 ----------
1212 body : `str`, optional
1213 The body of the PROPFIND request to send to the server. If
1214 provided, it is expected to be a XML document.
1215 depth : `str`, optional
1216 The value of the 'Depth' header to include in the request.
1218 Returns
1219 -------
1220 response : `requests.Response`
1221 Response to the PROPFIND request.
1223 Notes
1224 -----
1225 It raises `ValueError` if the status code of the PROPFIND request
1226 is different from "207 Multistatus" or "404 Not Found".
1227 """
1228 if body is None:
1229 # Request only the DAV live properties we are explicitly interested
1230 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
1231 # and 'displayname'.
1232 body = (
1233 """<?xml version="1.0" encoding="utf-8" ?>"""
1234 """<D:propfind xmlns:D="DAV:"><D:prop>"""
1235 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
1236 """</D:prop></D:propfind>"""
1237 )
1238 headers = {
1239 "Depth": depth,
1240 "Content-Type": 'application/xml; charset="utf-8"',
1241 "Content-Length": str(len(body)),
1242 }
1243 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
1244 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
1245 return resp
1246 else:
1247 raise ValueError(
1248 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
1249 f"{resp.reason}"
1250 )
1252 def _options(self) -> requests.Response:
1253 """Send a OPTIONS webDAV request for this resource."""
1254 resp = self._send_webdav_request("OPTIONS")
1255 if resp.status_code in (requests.codes.ok, requests.codes.created):
1256 return resp
1258 raise ValueError(
1259 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}"
1260 )
    def _head(self) -> requests.Response:
        """Send a HEAD webDAV request for this resource.

        Returns
        -------
        response : `requests.Response`
            The raw response; no status-code check is performed here, that is
            left to the caller.
        """
        return self._send_webdav_request("HEAD")
1266 def _mkcol(self) -> None:
1267 """Send a MKCOL webDAV request to create a collection. The collection
1268 may already exist.
1269 """
1270 resp = self._send_webdav_request("MKCOL")
1271 if resp.status_code == requests.codes.created: # 201
1272 return
1274 if resp.status_code == requests.codes.method_not_allowed: # 405
1275 # The remote directory already exists
1276 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
1277 else:
1278 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")
1280 def _delete(self) -> None:
1281 """Send a DELETE webDAV request for this resource."""
1282 log.debug("Deleting %s ...", self.geturl())
1284 # If this is a directory, ensure the remote is a webDAV server because
1285 # plain HTTP servers don't support DELETE requests on non-file
1286 # paths.
1287 if self.dirLike and not self.is_webdav_endpoint:
1288 raise NotImplementedError(
1289 f"Deletion of directory {self} is not implemented by plain HTTP servers"
1290 )
1292 # Deleting non-empty directories may take some time, so increase
1293 # the timeout for getting a response from the server.
1294 timeout = self._config.timeout
1295 if self.dirLike:
1296 timeout = (timeout[0], timeout[1] * 100)
1297 resp = self._send_webdav_request("DELETE", timeout=timeout)
1298 if resp.status_code in (
1299 requests.codes.ok,
1300 requests.codes.accepted,
1301 requests.codes.no_content,
1302 requests.codes.not_found,
1303 ):
1304 # We can get a "404 Not Found" error when the file or directory
1305 # does not exist or when the DELETE request was retried several
1306 # times and a previous attempt actually deleted the resource.
1307 # Therefore we consider that a "Not Found" response is not an
1308 # error since we reached the state desired by the user.
1309 return
1310 else:
1311 # TODO: the response to a DELETE request against a webDAV server
1312 # may be multistatus. If so, we need to parse the reponse body to
1313 # determine more precisely the reason of the failure (e.g. a lock)
1314 # and provide a more helpful error message.
1315 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")
1317 def _copy_via_local(self, src: ResourcePath) -> None:
1318 """Replace the contents of this resource with the contents of a remote
1319 resource by using a local temporary file.
1321 Parameters
1322 ----------
1323 src : `HttpResourcePath`
1324 The source of the contents to copy to `self`.
1325 """
1326 with src.as_local() as local_uri:
1327 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
1328 with open(local_uri.ospath, "rb") as f:
1329 self._put(data=f)
1331 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
1332 """Send a COPY or MOVE webDAV request to copy or replace the contents
1333 of this resource with the contents of another resource located in the
1334 same server.
1336 Parameters
1337 ----------
1338 method : `str`
1339 The method to perform. Valid values are "COPY" or "MOVE" (in
1340 uppercase).
1341 src : `HttpResourcePath`
1342 The source of the contents to move to `self`.
1343 """
1344 headers = {"Destination": self.geturl()}
1345 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
1346 if resp.status_code in (requests.codes.created, requests.codes.no_content):
1347 return
1349 if resp.status_code == requests.codes.multi_status:
1350 tree = eTree.fromstring(resp.content)
1351 status_element = tree.find("./{DAV:}response/{DAV:}status")
1352 status = status_element.text if status_element is not None else "unknown"
1353 error = tree.find("./{DAV:}response/{DAV:}error")
1354 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
1355 else:
1356 raise ValueError(
1357 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
1358 )
1360 def _copy(self, src: HttpResourcePath) -> None:
1361 """Send a COPY webDAV request to replace the contents of this resource
1362 (if any) with the contents of another resource located in the same
1363 server.
1365 Parameters
1366 ----------
1367 src : `HttpResourcePath`
1368 The source of the contents to copy to `self`.
1369 """
1370 # Neither dCache nor XrootD currently implement the COPY
1371 # webDAV method as documented in
1372 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
1373 # (See issues DM-37603 and DM-37651 for details)
1374 #
1375 # For the time being, we use a temporary local file to
1376 # perform the copy client side.
1377 # TODO: when those 2 issues above are solved remove the 3 lines below.
1378 must_use_local = True
1379 if must_use_local:
1380 return self._copy_via_local(src)
1382 return self._copy_or_move("COPY", src)
    def _move(self, src: HttpResourcePath) -> None:
        """Send a MOVE webDAV request to replace the contents of this resource
        with the contents of another resource located in the same server.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to move to `self`.
        """
        return self._copy_or_move("MOVE", src)
    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            If the final PUT request does not complete with status 200, 201
            or 204.
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            # Ask the server to answer (e.g. with a redirect) before any
            # payload would be sent.
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    # Upload directly to the redirected (back end) location.
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")
1481 @contextlib.contextmanager
1482 def _openImpl(
1483 self,
1484 mode: str = "r",
1485 *,
1486 encoding: str | None = None,
1487 ) -> Iterator[ResourceHandleProtocol]:
1488 resp = self._head()
1489 accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
1490 handle: ResourceHandleProtocol
1491 if mode in ("rb", "r") and accepts_range:
1492 handle = HttpReadResourceHandle(
1493 mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
1494 )
1495 if mode == "r":
1496 # cast because the protocol is compatible, but does not have
1497 # BytesIO in the inheritance tree
1498 yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
1499 else:
1500 yield handle
1501 else:
1502 with super()._openImpl(mode, encoding=encoding) as http_handle:
1503 yield http_handle
def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    req = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", req.method)
    log.debug(" URL=%s", req.url)
    log.debug(" headers=%s", req.headers)
    if req.method == "PUT":
        # Avoid dumping potentially huge upload payloads.
        log.debug(" body=<data>")
    elif req.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", req.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        log.debug(" body=%r", resp.content[:80])
1541def _is_protected(filepath: str) -> bool:
1542 """Return true if the permissions of file at filepath only allow for access
1543 by its owner.
1545 Parameters
1546 ----------
1547 filepath : `str`
1548 Path of a local file.
1549 """
1550 if not os.path.isfile(filepath):
1551 return False
1552 mode = stat.S_IMODE(os.stat(filepath).st_mode)
1553 owner_accessible = bool(mode & stat.S_IRWXU)
1554 group_accessible = bool(mode & stat.S_IRWXG)
1555 other_accessible = bool(mode & stat.S_IRWXO)
1556 return owner_accessible and not group_accessible and not other_accessible
def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request.

    Returns
    -------
    responses : `List[DavProperty]`
        One entry per 'response' element found in `body`.

    Raises
    ------
    ValueError
        If `body` does not contain at least one 'response' element.
    """
    # A response body to a PROPFIND request is of the form (indented for
    # readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #   <D:response>
    #     <D:href>path/to/resource</D:href>
    #     <D:propstat>
    #       <D:prop>
    #         <D:resourcetype>
    #           <D:collection xmlns:D="DAV:"/>
    #         </D:resourcetype>
    #         <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
    #         <D:getcontentlength>12345</D:getcontentlength>
    #       </D:prop>
    #       <D:status>HTTP/1.1 200 OK</D:status>
    #     </D:propstat>
    #   </D:response>
    #   <D:response>
    #     ...
    #   </D:response>
    # </D:multistatus>

    # Scan all the 'response' elements and extract the relevant properties.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(response) for response in multistatus.findall("./{DAV:}response")]
    if not responses:
        # The body held no 'response' element at all: it cannot be a valid
        # PROPFIND response.
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return responses
1622class DavProperty:
1623 """Helper class to encapsulate select live DAV properties of a single
1624 resource, as retrieved via a PROPFIND request.
1626 Parameters
1627 ----------
1628 response : `eTree.Element` or `None`
1629 The XML response defining the DAV property.
1630 """
1632 # Regular expression to compare against the 'status' element of a
1633 # PROPFIND response's 'propstat' element.
1634 _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)
1636 def __init__(self, response: eTree.Element | None):
1637 self._href: str = ""
1638 self._displayname: str = ""
1639 self._collection: bool = False
1640 self._getlastmodified: str = ""
1641 self._getcontentlength: int = -1
1643 if response is not None:
1644 self._parse(response)
1646 def _parse(self, response: eTree.Element) -> None:
1647 # Extract 'href'.
1648 if (element := response.find("./{DAV:}href")) is not None:
1649 # We need to use "str(element.text)"" instead of "element.text" to
1650 # keep mypy happy.
1651 self._href = str(element.text).strip()
1652 else:
1653 raise ValueError(
1654 "Property 'href' expected but not found in PROPFIND response: "
1655 f"{eTree.tostring(response, encoding='unicode')}"
1656 )
1658 for propstat in response.findall("./{DAV:}propstat"):
1659 # Only extract properties of interest with status OK.
1660 status = propstat.find("./{DAV:}status")
1661 if status is None or not self._status_ok_rex.match(str(status.text)):
1662 continue
1664 for prop in propstat.findall("./{DAV:}prop"):
1665 # Parse "collection".
1666 if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None:
1667 self._collection = True
1669 # Parse "getlastmodified".
1670 if (element := prop.find("./{DAV:}getlastmodified")) is not None:
1671 self._getlastmodified = str(element.text)
1673 # Parse "getcontentlength".
1674 if (element := prop.find("./{DAV:}getcontentlength")) is not None:
1675 self._getcontentlength = int(str(element.text))
1677 # Parse "displayname".
1678 if (element := prop.find("./{DAV:}displayname")) is not None:
1679 self._displayname = str(element.text)
1681 # Some webDAV servers don't include the 'displayname' property in the
1682 # response so try to infer it from the value of the 'href' property.
1683 # Depending on the server the href value may end with '/'.
1684 if not self._displayname:
1685 self._displayname = os.path.basename(self._href.rstrip("/"))
1687 # Force a size of 0 for collections.
1688 if self._collection:
1689 self._getcontentlength = 0
1691 @property
1692 def exists(self) -> bool:
1693 # It is either a directory or a file with length of at least zero
1694 return self._collection or self._getcontentlength >= 0
1696 @property
1697 def is_directory(self) -> bool:
1698 return self._collection
1700 @property
1701 def is_file(self) -> bool:
1702 return not self._collection
1704 @property
1705 def size(self) -> int:
1706 return self._getcontentlength
1708 @property
1709 def name(self) -> str:
1710 return self._displayname
1712 @property
1713 def href(self) -> str:
1714 return self._href