Coverage for python/lsst/resources/http.py: 23%
592 statements
« prev ^ index » next coverage.py v7.4.3, created at 2024-03-13 09:59 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import math
21import os
22import os.path
23import random
24import re
25import stat
26import tempfile
27from collections.abc import Iterator
28from typing import TYPE_CHECKING, BinaryIO, cast
30try:
31 # Prefer 'defusedxml' (not part of standard library) if available, since
32 # 'xml' is vulnerable to XML bombs.
33 import defusedxml.ElementTree as eTree
34except ImportError:
35 import xml.etree.ElementTree as eTree
37from urllib.parse import parse_qs
39import requests
40from astropy import units as u
41from lsst.utils.timer import time_this
42from requests.adapters import HTTPAdapter
43from requests.auth import AuthBase
44from urllib3.util.retry import Retry
46from ._resourceHandles import ResourceHandleProtocol
47from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle, parse_content_range_header
48from ._resourcePath import ResourcePath
50if TYPE_CHECKING:
51 from .utils import TransactionProtocol
53log = logging.getLogger(__name__)
56def _timeout_from_environment(env_var: str, default_value: float) -> float:
57 """Convert and return a timeout from the value of an environment variable
58 or a default value if the environment variable is not initialized. The
59 value of `env_var` must be a valid `float` otherwise this function raises.
61 Parameters
62 ----------
63 env_var : `str`
64 Environment variable to look for.
65 default_value : `float``
66 Value to return if `env_var` is not defined in the environment.
68 Returns
69 -------
70 _timeout_from_environment : `float`
71 Converted value.
72 """
73 try:
74 timeout = float(os.environ.get(env_var, default_value))
75 except ValueError:
76 raise ValueError(
77 f"Expecting valid timeout value in environment variable {env_var} but found "
78 f"{os.environ.get(env_var)}"
79 ) from None
81 if math.isnan(timeout):
82 raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}")
84 return timeout
class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    Each property lazily reads its value from the environment on first access
    and caches the result on the instance for subsequent accesses.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Per-instance caches for the lazily-computed properties below.
    _front_end_connections: int | None = None
    _back_end_connections: int | None = None
    _digest_algorithm: str | None = None
    _send_expect_on_put: bool | None = None
    _timeout: tuple[float, float] | None = None
    _collect_memory_usage: bool | None = None
    _backoff_min: float | None = None
    _backoff_max: float | None = None

    @staticmethod
    def _int_from_environment(env_var: str, default_value: int) -> int:
        """Return the integer value of `env_var`, or `default_value` when the
        variable is unset or cannot be parsed as an integer.
        """
        try:
            return int(os.environ.get(env_var, default_value))
        except ValueError:
            return default_value

    @staticmethod
    def _float_from_environment(env_var: str, default_value: float) -> float:
        """Return the float value of `env_var`, or `default_value` when the
        variable is unset, cannot be parsed as a float, or parses to NaN.
        """
        try:
            value = float(os.environ.get(env_var, default_value))
        except ValueError:
            return default_value
        return default_value if math.isnan(value) else value

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is None:
            self._front_end_connections = self._int_from_environment(
                "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
            )
        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is None:
            self._back_end_connections = self._int_from_environment(
                "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
            )
        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm: `str`
            The name of a digest algorithm or the empty string if no algorithm
            is configured or the configured one is not in ACCEPTED_DIGESTS.
        """
        if self._digest_algorithm is None:
            digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
            if digest not in self.ACCEPTED_DIGESTS:
                digest = ""
            self._digest_algorithm = digest
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) uses this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is None:
            # Presence of the variable is what matters, not its value.
            self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.

        Raises
        ------
        ValueError
            Raised (by `_timeout_from_environment`) when either environment
            variable contains an invalid value.
        """
        if self._timeout is None:
            self._timeout = (
                _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT),
                _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ),
            )
        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is None:
            # Presence of the variable is what matters, not its value.
            self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is None:
            self._backoff_min = self._float_from_environment(
                "LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN
            )
        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is None:
            self._backoff_max = self._float_from_environment(
                "LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX
            )
        return self._backoff_max
@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    The result is cached per ``path`` via `functools.lru_cache`.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        Raised when the server certificate cannot be verified; a hint about
        LSST_HTTP_CACERT_BUNDLE is logged before re-raising.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Create a session for configuring retries
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(
            [
                "OPTIONS",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )
            if resp.status_code not in (requests.codes.ok, requests.codes.created):
                return False

            # Check that "1" is part of the value of the "DAV" header. We
            # don't use locks, so a server complying to class 1 is enough for
            # our purposes. All webDAV servers must advertise at least
            # compliance class "1".
            #
            # Compliance classes are documented in
            # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
            #
            # Examples of values for header DAV are:
            #    DAV: 1, 2
            #    DAV: 1, <http://apache.org/dav/propset/fs/1>
            if "DAV" not in resp.headers:
                return False
            else:
                # Convert to str to keep mypy happy
                compliance_class = str(resp.headers.get("DAV"))
                return "1" in compliance_class.replace(" ", "").split(",")
    except requests.exceptions.SSLError as e:
        # Fixed typo in this user-facing message ("tha path" -> "the path").
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        raise e
353# Tuple (path, block_size) pointing to the location of a local directory
354# to save temporary files and the block size of the underlying file system.
355_TMPDIR: tuple[str, int] | None = None
358def _get_temp_dir() -> tuple[str, int]:
359 """Return the temporary directory path and block size.
361 This function caches its results in _TMPDIR.
362 """
363 global _TMPDIR
364 if _TMPDIR:
365 return _TMPDIR
367 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
368 # 'TMPDIR', if defined. Otherwise use current working directory.
369 tmpdir = os.getcwd()
370 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
371 if dir and os.path.isdir(dir):
372 tmpdir = dir
373 break
375 # Compute the block size as 256 blocks of typical size
376 # (i.e. 4096 bytes) or 10 times the file system block size,
377 # whichever is higher. This is a reasonable compromise between
378 # using memory for buffering and the number of system calls
379 # issued to read from or write to temporary files.
380 fsstats = os.statvfs(tmpdir)
381 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096)))
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if not os.path.isfile(token):
            return

        # The token argument is a path: remember it so the token value can
        # be re-read when the file changes, and require owner-only access.
        self._path = os.path.abspath(token)
        if not _is_protected(self._path):
            raise PermissionError(
                f"Bearer token file at {self._path} must be protected for access only by its owner"
            )
        self._refresh()

    def _refresh(self) -> None:
        """Re-read the token file (if any) when its modification time is more
        recent than the last time we read it.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime <= self._mtime:
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = mtime
        with open(self._path) as f:
            self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Only add a bearer token to a request when using secure HTTP, so the
        # token is never leaked over an unencrypted connection.
        url = req.url.lower() if req.url else ""
        if url.startswith("https://") and self._token:
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a reusable HTTP client session per endpoint.

    Sessions are keyed by the root URI of the endpoint (scheme + host + port),
    so all paths under the same endpoint share one session.

    Parameters
    ----------
    num_pools : `int`, optional
        Number of connection pools to keep: there is one pool per remote
        host.
    max_persistent_connections : `int`, optional
        Maximum number of connections per remote host to persist in each
        connection pool.
    backoff_min : `float`, optional
        Minimum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    backoff_max : `float`, optional
        Maximum value of the interval to compute the exponential
        backoff factor when retrying requests (seconds).
    """

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Dictionary to store the session associated to a given URI. The key
        # of the dictionary is a root URI and the value is the session.
        self._sessions: dict[str, requests.Session] = {}

        # See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval to compute the exponential
        # backoff factor when retrying requests (seconds). If the caller
        # supplied an inverted or degenerate interval, widen it so that
        # backoff_max is strictly greater than backoff_min.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Close all sessions and empty the store. Idle network connections
        # should be closed as a consequence. We don't have means through
        # the API exposed by Requests to actually force closing the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()

        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Returns
        -------
        get : `requests.Session`
            The cached session for the endpoint of ``rpath``, created on
            first use.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one.
            self._sessions[root_uri] = self._make_session(rpath)

        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)
        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). Compute a random jitter to prevent all the clients
            # to overwhelm the server by sending requests at the same time.
            backoff_factor=self._backoff_min + (self._backoff_max - self._backoff_min) * random.random(),
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                [
                    "COPY",
                    "DELETE",
                    "GET",
                    "HEAD",
                    "MKCOL",
                    "OPTIONS",
                    "PROPFIND",
                    "PUT",
                ]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        # Persist the specified number of connections to the front end server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle

        # Should we use bearer tokens for client authentication?
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
658class HttpResourcePath(ResourcePath):
659 """General HTTP(S) resource.
661 Notes
662 -----
663 In order to configure the behavior of instances of this class, the
664 environment variables below are inspected:
666 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
667 "Expect: 100-Continue" header will be added to all HTTP PUT requests.
668 This header is required by some servers to detect if the client
669 knows how to handle redirections. In case of redirection, the body
670 of the PUT request is sent to the redirected location and not to
671 the front end server.
673 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a
674 numeric value, they are interpreted as the number of seconds to wait
675 for establishing a connection with the server and for reading its
676 response, respectively.
678 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and
679 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number
680 of connections to attempt to persist with both the front end servers
681 and the back end servers.
682 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and
683 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS.
685 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to
686 ask the server to compute for every file's content sent to the server
687 via a PUT request. No digest is requested if this variable is not set
688 or is set to an invalid value.
689 Valid values are those in ACCEPTED_DIGESTS.
690 """
692 _is_webdav: bool | None = None
694 # Configuration items for this class instances.
695 _config = HttpResourcePathConfig()
697 # The session for metadata requests is used for interacting with
698 # the front end servers for requests such as PROPFIND, HEAD, etc. Those
699 # interactions are typically served by the front end servers. We want to
700 # keep the connection to the front end servers open, to reduce the cost
701 # associated to TCP and TLS handshaking for each new request.
702 _metadata_session_store = SessionStore(
703 num_pools=5,
704 max_persistent_connections=_config.front_end_connections,
705 backoff_min=_config.backoff_min,
706 backoff_max=_config.backoff_max,
707 )
709 # The data session is used for interaction with the front end servers which
710 # typically redirect to the back end servers for serving our PUT and GET
711 # requests. We attempt to keep a single connection open with the front end
712 # server, if possible. This depends on how the server behaves and the
713 # kind of request. Some servers close the connection when redirecting
714 # the client to a back end server, for instance when serving a PUT
715 # request.
716 _data_session_store = SessionStore(
717 num_pools=25,
718 max_persistent_connections=_config.back_end_connections,
719 backoff_min=_config.backoff_min,
720 backoff_max=_config.backoff_max,
721 )
723 # Process ID which created the session stores above. We need to store this
724 # to replace sessions created by a parent process and inherited by a
725 # child process after a fork, to avoid confusing the SSL layer.
726 _pid: int = -1
728 @property
729 def metadata_session(self) -> requests.Session:
730 """Client session to send requests which do not require upload or
731 download of data, i.e. mostly metadata requests.
732 """
733 if hasattr(self, "_metadata_session"):
734 if HttpResourcePath._pid == os.getpid():
735 return self._metadata_session
736 else:
737 # The metadata session we have in cache was likely created by
738 # a parent process. Discard all the sessions in that store.
739 self._metadata_session_store.clear()
741 # Retrieve a new metadata session.
742 HttpResourcePath._pid = os.getpid()
743 self._metadata_session: requests.Session = self._metadata_session_store.get(self)
744 return self._metadata_session
746 @property
747 def data_session(self) -> requests.Session:
748 """Client session for uploading and downloading data."""
749 if hasattr(self, "_data_session"):
750 if HttpResourcePath._pid == os.getpid():
751 return self._data_session
752 else:
753 # The data session we have in cache was likely created by
754 # a parent process. Discard all the sessions in that store.
755 self._data_session_store.clear()
757 # Retrieve a new data session.
758 HttpResourcePath._pid = os.getpid()
759 self._data_session: requests.Session = self._data_session_store.get(self)
760 return self._data_session
762 def _clear_sessions(self) -> None:
763 """Close the socket connections that are still open.
765 Used only in test suites to avoid warnings.
766 """
767 self._metadata_session_store.clear()
768 self._data_session_store.clear()
770 if hasattr(self, "_metadata_session"):
771 delattr(self, "_metadata_session")
773 if hasattr(self, "_data_session"):
774 delattr(self, "_data_session")
776 @property
777 def is_webdav_endpoint(self) -> bool:
778 """Check if the current endpoint implements WebDAV features.
780 This is stored per URI but cached by root so there is
781 only one check per hostname.
782 """
783 if self._is_webdav is not None:
784 return self._is_webdav
786 self._is_webdav = _is_webdav_endpoint(self.root_uri())
787 return self._is_webdav
789 def exists(self) -> bool:
790 """Check that a remote HTTP resource exists."""
791 log.debug("Checking if resource exists: %s", self.geturl())
792 if not self.is_webdav_endpoint:
793 # The remote is a plain HTTP server. Let's attempt a HEAD
794 # request, even if the behavior for such a request against a
795 # directory is not specified, so it depends on the server
796 # implementation.
797 resp = self._head_non_webdav_url()
798 return self._is_successful_non_webdav_head_request(resp)
800 # The remote endpoint is a webDAV server: send a PROPFIND request
801 # to determine if it exists.
802 resp = self._propfind()
803 if resp.status_code == requests.codes.multi_status: # 207
804 prop = _parse_propfind_response_body(resp.text)[0]
805 return prop.exists
806 else: # 404 Not Found
807 return False
    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Directory-like paths always report a size of 0. For plain HTTP
        servers the size is derived from a HEAD request (or its 1-byte
        ranged-GET emulation for presigned S3 URLs); for WebDAV servers it
        comes from a PROPFIND request.

        Raises
        ------
        FileNotFoundError
            Raised when the remote resource does not exist.
        IsADirectoryError
            Raised when a WebDAV server reports a directory at this
            file-like path.
        ValueError
            Raised when the server response cannot be interpreted.
        """
        if self.dirLike:
            return 0

        if not self.is_webdav_endpoint:
            # The remote is a plain HTTP server. Send a HEAD request to
            # retrieve the size of the resource.
            resp = self._head_non_webdav_url()
            if resp.status_code == requests.codes.ok:  # 200
                if "Content-Length" in resp.headers:
                    return int(resp.headers["Content-Length"])
                else:
                    raise ValueError(
                        f"Response to HEAD request to {self} does not contain 'Content-Length' header"
                    )
            elif resp.status_code == requests.codes.partial_content:
                # 206 Partial Content, returned from a GET request with a Range
                # header (used to emulate HEAD for presigned S3 URLs).
                # In this case Content-Length is the length of the Range and
                # not the full length of the file, so we have to parse
                # Content-Range instead.
                content_range_header = resp.headers.get("Content-Range")
                if content_range_header is None:
                    raise ValueError(
                        f"Response to GET request to {self} did not contain 'Content-Range' header"
                    )
                content_range = parse_content_range_header(content_range_header)
                size = content_range.total
                if size is None:
                    raise ValueError(f"Content-Range header for {self} did not include a total file size")
                return size
            elif resp.status_code == requests.codes.range_not_satisfiable:
                # 416 Range Not Satisfiable, which can occur on a GET for a 0
                # byte file since we asked for 1 byte Range which is longer
                # than the file.
                #
                # Servers are supposed to include a Content-Range header in
                # this case, but Google's S3 implementation doesn't. Any
                # non-zero file size should have been handled by the 206 and
                # 200 cases above, so assume we have a zero here.
                return 0
            elif resp.status_code == requests.codes.not_found:
                raise FileNotFoundError(
                    f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
                )
            else:
                raise ValueError(
                    f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
                    f"{resp.reason}"
                )

        # The remote is a webDAV server: send a PROPFIND request to retrieve
        # the size of the resource. Sizes are only meaningful for files.
        resp = self._propfind()
        if resp.status_code == requests.codes.multi_status:  # 207
            prop = _parse_propfind_response_body(resp.text)[0]
            if prop.is_file:
                return prop.size
            elif prop.is_directory:
                raise IsADirectoryError(
                    f"Resource {self} is reported by server as a directory but has a file path"
                )
            else:
                raise FileNotFoundError(f"Resource {self} does not exist")
        else:  # 404 Not Found
            raise FileNotFoundError(
                f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
            )
879 def _head_non_webdav_url(self) -> requests.Response:
880 """Return a response from a HTTP HEAD request for a non-WebDAV HTTP
881 URL.
883 Emulates HEAD using a 1-byte GET for presigned S3 URLs.
884 """
885 if self._looks_like_presigned_s3_url():
886 # Presigned S3 URLs are signed for a single method only, so you
887 # can't call HEAD on a URL signed for GET. However, S3 does
888 # support Range requests, so you can ask for a 1-byte range with
889 # GET for a similar effect to HEAD.
890 #
891 # Note that some headers differ between a true HEAD request and the
892 # response returned by this GET, e.g. Content-Length will always be
893 # 1, and the status code is 206 instead of 200.
894 return self.metadata_session.get(
895 self.geturl(),
896 timeout=self._config.timeout,
897 allow_redirects=True,
898 stream=False,
899 headers={"Range": "bytes=0-0"},
900 )
901 else:
902 return self.metadata_session.head(
903 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
904 )
906 def _is_successful_non_webdav_head_request(self, resp: requests.Response) -> bool:
907 """Return `True` if the status code in the response indicates a
908 successful response to ``_head_non_webdav_url``.
909 """
910 return resp.status_code in (
911 requests.codes.ok, # 200, from a normal HEAD or GET request
912 requests.codes.partial_content, # 206, returned from a GET request with a Range header.
913 # 416, returned from a GET request with a 1-byte Range header that
914 # is longer than the 0-byte file.
915 requests.codes.range_not_satisfiable,
916 )
918 def _looks_like_presigned_s3_url(self) -> bool:
919 """Return `True` if this ResourcePath's URL is likely to be a presigned
920 S3 URL.
921 """
922 query_params = parse_qs(self._uri.query)
923 return "Signature" in query_params and "Expires" in query_params
925 def mkdir(self) -> None:
926 """Create the directory resource if it does not already exist."""
927 # Creating directories is only available on WebDAV back ends.
928 if not self.is_webdav_endpoint:
929 raise NotImplementedError(
930 f"Creation of directory {self} is not implemented by plain HTTP servers"
931 )
933 if not self.dirLike:
934 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")
936 # Check if the target directory already exists.
937 resp = self._propfind()
938 if resp.status_code == requests.codes.multi_status: # 207
939 prop = _parse_propfind_response_body(resp.text)[0]
940 if prop.exists:
941 if prop.is_directory:
942 return
943 else:
944 # A file exists at this path
945 raise NotADirectoryError(
946 f"Can not create a directory for {self} because a file already exists at that path"
947 )
949 # Target directory does not exist. Create it and its ancestors as
950 # needed. We need to test if parent URL is different from self URL,
951 # otherwise we could be stuck in a recursive loop
952 # where self == parent.
953 if self.geturl() != self.parent().geturl():
954 self.parent().mkdir()
956 log.debug("Creating new directory: %s", self.geturl())
957 self._mkcol()
959 def remove(self) -> None:
960 """Remove the resource."""
961 self._delete()
963 def read(self, size: int = -1) -> bytes:
964 """Open the resource and return the contents in bytes.
966 Parameters
967 ----------
968 size : `int`, optional
969 The number of bytes to read. Negative or omitted indicates
970 that all data should be read.
971 """
972 # Use the data session as a context manager to ensure that the
973 # network connections to both the front end and back end servers are
974 # closed after downloading the data.
975 log.debug("Reading from remote resource: %s", self.geturl())
976 stream = size > 0
977 with self.data_session as session:
978 with time_this(log, msg="GET %s", args=(self,)):
979 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout)
981 if resp.status_code != requests.codes.ok: # 200
982 raise FileNotFoundError(
983 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
984 )
985 if not stream:
986 return resp.content
987 else:
988 return next(resp.iter_content(chunk_size=size))
990 def write(self, data: bytes, overwrite: bool = True) -> None:
991 """Write the supplied bytes to the new resource.
993 Parameters
994 ----------
995 data : `bytes`
996 The bytes to write to the resource. The entire contents of the
997 resource will be replaced.
998 overwrite : `bool`, optional
999 If `True` the resource will be overwritten if it exists. Otherwise
1000 the write will fail.
1001 """
1002 log.debug("Writing to remote resource: %s", self.geturl())
1003 if not overwrite and self.exists():
1004 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
1006 # Ensure the parent directory exists.
1007 # This is only meaningful and appropriate for WebDAV, not the general
1008 # HTTP case. e.g. for S3 HTTP URLs, the underlying service has no
1009 # concept of 'directories' at all.
1010 if self.is_webdav_endpoint:
1011 self.parent().mkdir()
1013 # Upload the data.
1014 log.debug("Writing data to remote resource: %s", self.geturl())
1015 self._put(data=data)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            Whether overwriting the remote resource is allowed or not.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if ``transfer`` is not one of the supported transfer
            modes.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        # Resolve the 'auto' mode to this scheme's default before deciding
        # between COPY and MOVE below.
        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()
    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not directory-like.
        NotImplementedError
            Raised if the remote server is not a webDAV endpoint, since
            plain HTTP servers cannot list directories.
        """
        if not self.dirLike:
            raise ValueError("Can not walk a non-directory URI")

        # Walking directories is only available on WebDAV back ends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        # Depth "1" asks the server for the properties of this directory
        # and of its immediate children only.
        resp = self._propfind(depth="1")
        if resp.status_code == requests.codes.multi_status:  # 207
            files: list[str] = []
            dirs: list[str] = []

            for prop in _parse_propfind_response_body(resp.text):
                if prop.is_file:
                    files.append(prop.name)
                elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
                    # Only include the names of sub-directories not the name of
                    # the directory being walked.
                    dirs.append(prop.name)

            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]

            if not dirs and not files:
                return
            else:
                yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

            # Recurse into each sub-directory, yielding its entries in turn.
            for dir in dirs:
                new_uri = self.join(dir, forceDirectory=True)
                yield from new_uri.walk(file_filter)
    def _as_local(self) -> tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the server does not respond with a 200 (OK) status.
        ValueError
            Raised if the downloaded size does not match the
            Content-Length header sent by the server.
        """
        # Use the session as a context manager to ensure that connections
        # to both the front end and back end servers are closed after the
        # download operation is finished.
        with self.data_session as session:
            resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
            if resp.status_code != requests.codes.ok:
                raise FileNotFoundError(
                    f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
                )

            tmpdir, buffering = _get_temp_dir()
            # delete=False: the file must survive this context manager; the
            # caller is responsible for removing it when done.
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                # -1 acts as a sentinel meaning "no Content-Length header".
                expected_length = int(resp.headers.get("Content-Length", "-1"))
                with time_this(
                    log,
                    msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
                    args=(self, expected_length, tmpFile.name, buffering),
                    mem_usage=self._config.collect_memory_usage,
                    mem_unit=u.mebibyte,
                ):
                    content_length = 0
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
                        content_length += len(chunk)

            # Check that the expected and actual content lengths match. Perform
            # this check only when the contents of the file was not encoded by
            # the server.
            if (
                "Content-Encoding" not in resp.headers
                and expected_length >= 0
                and expected_length != content_length
            ):
                raise ValueError(
                    f"Size of downloaded file does not match value in Content-Length header for {self}: "
                    f"expecting {expected_length} and got {content_length} bytes"
                )

            return tmpFile.name, True
    def _send_webdav_request(
        self,
        method: str,
        url: str | None = None,
        headers: dict[str, str] | None = None,
        body: str | None = None,
        session: requests.Session | None = None,
        timeout: tuple[float, float] | None = None,
    ) -> requests.Response:
        """Send a webDAV request and correctly handle redirects.

        Parameters
        ----------
        method : `str`
            The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
        url : `str`, optional
            The URL to send the request to. Defaults to this resource's URL.
        headers : `dict`, optional
            A dictionary of key-value pairs (both strings) to include as
            headers in the request.
        body : `str`, optional
            The body of the request.
        session : `requests.Session`, optional
            The session used to send the request. Defaults to the metadata
            session.
        timeout : `tuple` of `float`, optional
            The (connect, read) timeouts. Defaults to the configured
            timeouts.

        Returns
        -------
        response : `requests.Response`
            The final (non-redirect) response.

        Raises
        ------
        ValueError
            Raised if the maximum number of redirections is exceeded
            without obtaining a final response.

        Notes
        -----
        This way of sending webDAV requests is necessary for handling
        redirection ourselves, since the 'requests' package changes the method
        of the redirected request when the server responds with status 302 and
        the method of the original request is not HEAD (which is the case for
        webDAV requests).

        That means that when the webDAV server we interact with responds with
        a redirection to a PROPFIND or MKCOL request, the request gets
        converted to a GET request when sent to the redirected location.

        See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

        This behavior of the 'requests' package is meant to be compatible with
        what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

        For our purposes, we do need to follow the redirection and send a new
        request using the same HTTP verb.
        """
        if url is None:
            url = self.geturl()

        if headers is None:
            headers = {}

        if session is None:
            session = self.metadata_session

        if timeout is None:
            timeout = self._config.timeout

        with time_this(
            log,
            msg="%s %s",
            args=(
                method,
                url,
            ),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            # Re-send the same request to each redirected location, up to
            # a fixed bound on the number of hops.
            for _ in range(max_redirects := 5):
                resp = session.request(
                    method,
                    url,
                    data=body,
                    headers=headers,
                    stream=False,
                    timeout=timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    url = resp.headers["Location"]
                else:
                    return resp

            # We reached the maximum allowed number of redirects.
            # Stop trying.
            raise ValueError(
                f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
            )
1277 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response:
1278 """Send a PROPFIND webDAV request and return the response.
1280 Parameters
1281 ----------
1282 body : `str`, optional
1283 The body of the PROPFIND request to send to the server. If
1284 provided, it is expected to be a XML document.
1285 depth : `str`, optional
1286 The value of the 'Depth' header to include in the request.
1288 Returns
1289 -------
1290 response : `requests.Response`
1291 Response to the PROPFIND request.
1293 Notes
1294 -----
1295 It raises `ValueError` if the status code of the PROPFIND request
1296 is different from "207 Multistatus" or "404 Not Found".
1297 """
1298 if body is None:
1299 # Request only the DAV live properties we are explicitly interested
1300 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
1301 # and 'displayname'.
1302 body = (
1303 """<?xml version="1.0" encoding="utf-8" ?>"""
1304 """<D:propfind xmlns:D="DAV:"><D:prop>"""
1305 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
1306 """</D:prop></D:propfind>"""
1307 )
1308 headers = {
1309 "Depth": depth,
1310 "Content-Type": 'application/xml; charset="utf-8"',
1311 "Content-Length": str(len(body)),
1312 }
1313 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
1314 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
1315 return resp
1316 else:
1317 raise ValueError(
1318 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
1319 f"{resp.reason}"
1320 )
1322 def _options(self) -> requests.Response:
1323 """Send a OPTIONS webDAV request for this resource."""
1324 resp = self._send_webdav_request("OPTIONS")
1325 if resp.status_code in (requests.codes.ok, requests.codes.created):
1326 return resp
1328 raise ValueError(
1329 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}"
1330 )
1332 def _head(self) -> requests.Response:
1333 """Send a HEAD webDAV request for this resource."""
1334 return self._send_webdav_request("HEAD")
1336 def _mkcol(self) -> None:
1337 """Send a MKCOL webDAV request to create a collection. The collection
1338 may already exist.
1339 """
1340 resp = self._send_webdav_request("MKCOL")
1341 if resp.status_code == requests.codes.created: # 201
1342 return
1344 if resp.status_code == requests.codes.method_not_allowed: # 405
1345 # The remote directory already exists
1346 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
1347 else:
1348 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")
1350 def _delete(self) -> None:
1351 """Send a DELETE webDAV request for this resource."""
1352 log.debug("Deleting %s ...", self.geturl())
1354 # If this is a directory, ensure the remote is a webDAV server because
1355 # plain HTTP servers don't support DELETE requests on non-file
1356 # paths.
1357 if self.dirLike and not self.is_webdav_endpoint:
1358 raise NotImplementedError(
1359 f"Deletion of directory {self} is not implemented by plain HTTP servers"
1360 )
1362 # Deleting non-empty directories may take some time, so increase
1363 # the timeout for getting a response from the server.
1364 timeout = self._config.timeout
1365 if self.dirLike:
1366 timeout = (timeout[0], timeout[1] * 100)
1367 resp = self._send_webdav_request("DELETE", timeout=timeout)
1368 if resp.status_code in (
1369 requests.codes.ok,
1370 requests.codes.accepted,
1371 requests.codes.no_content,
1372 requests.codes.not_found,
1373 ):
1374 # We can get a "404 Not Found" error when the file or directory
1375 # does not exist or when the DELETE request was retried several
1376 # times and a previous attempt actually deleted the resource.
1377 # Therefore we consider that a "Not Found" response is not an
1378 # error since we reached the state desired by the user.
1379 return
1380 else:
1381 # TODO: the response to a DELETE request against a webDAV server
1382 # may be multistatus. If so, we need to parse the reponse body to
1383 # determine more precisely the reason of the failure (e.g. a lock)
1384 # and provide a more helpful error message.
1385 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")
1387 def _copy_via_local(self, src: ResourcePath) -> None:
1388 """Replace the contents of this resource with the contents of a remote
1389 resource by using a local temporary file.
1391 Parameters
1392 ----------
1393 src : `HttpResourcePath`
1394 The source of the contents to copy to `self`.
1395 """
1396 with src.as_local() as local_uri:
1397 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
1398 with open(local_uri.ospath, "rb") as f:
1399 self._put(data=f)
1401 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
1402 """Send a COPY or MOVE webDAV request to copy or replace the contents
1403 of this resource with the contents of another resource located in the
1404 same server.
1406 Parameters
1407 ----------
1408 method : `str`
1409 The method to perform. Valid values are "COPY" or "MOVE" (in
1410 uppercase).
1411 src : `HttpResourcePath`
1412 The source of the contents to move to `self`.
1413 """
1414 headers = {"Destination": self.geturl()}
1415 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
1416 if resp.status_code in (requests.codes.created, requests.codes.no_content):
1417 return
1419 if resp.status_code == requests.codes.multi_status:
1420 tree = eTree.fromstring(resp.content)
1421 status_element = tree.find("./{DAV:}response/{DAV:}status")
1422 status = status_element.text if status_element is not None else "unknown"
1423 error = tree.find("./{DAV:}response/{DAV:}error")
1424 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
1425 else:
1426 raise ValueError(
1427 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
1428 )
    def _copy(self, src: HttpResourcePath) -> None:
        """Send a COPY webDAV request to replace the contents of this resource
        (if any) with the contents of another resource located in the same
        server.

        Parameters
        ----------
        src : `HttpResourcePath`
            The source of the contents to copy to `self`.
        """
        # Neither dCache nor XrootD currently implement the COPY
        # webDAV method as documented in
        # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
        # (See issues DM-37603 and DM-37651 for details)
        #
        # For the time being, we use a temporary local file to
        # perform the copy client side.
        # TODO: when those 2 issues above are solved remove the 3 lines below.
        must_use_local = True
        if must_use_local:
            return self._copy_via_local(src)

        # NOTE: intentionally unreachable until the TODO above is resolved;
        # this is the server-side COPY path we want to use eventually.
        return self._copy_or_move("COPY", src)
1454 def _move(self, src: HttpResourcePath) -> None:
1455 """Send a MOVE webDAV request to replace the contents of this resource
1456 with the contents of another resource located in the same server.
1458 Parameters
1459 ----------
1460 src : `HttpResourcePath`
1461 The source of the contents to move to `self`.
1462 """
1463 return self._copy_or_move("MOVE", src)
    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the final PUT request does not succeed.
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                # If the server redirected us, upload to the redirected
                # location; otherwise keep the original URL.
                if resp.is_redirect:
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")
1551 @contextlib.contextmanager
1552 def _openImpl(
1553 self,
1554 mode: str = "r",
1555 *,
1556 encoding: str | None = None,
1557 ) -> Iterator[ResourceHandleProtocol]:
1558 resp = self._head()
1559 accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
1560 handle: ResourceHandleProtocol
1561 if mode in ("rb", "r") and accepts_range:
1562 handle = HttpReadResourceHandle(
1563 mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
1564 )
1565 if mode == "r":
1566 # cast because the protocol is compatible, but does not have
1567 # BytesIO in the inheritance tree
1568 yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
1569 else:
1570 yield handle
1571 else:
1572 with super()._openImpl(mode, encoding=encoding) as http_handle:
1573 yield http_handle
def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", resp.request.method)
    log.debug(" URL=%s", resp.request.url)
    log.debug(" headers=%s", resp.request.headers)
    if resp.request.method == "PUT":
        log.debug(" body=<data>")
    elif resp.request.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", resp.request.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type", "").startswith("text/plain"):
        # Compare only the media type: Content-Type may carry parameters
        # (e.g. "text/plain; charset=utf-8") that an exact equality test
        # would miss.
        log.debug(" body=%r", resp.content)
    else:
        log.debug(" body=%r", resp.content[:80])
1611def _is_protected(filepath: str) -> bool:
1612 """Return true if the permissions of file at filepath only allow for access
1613 by its owner.
1615 Parameters
1616 ----------
1617 filepath : `str`
1618 Path of a local file.
1619 """
1620 if not os.path.isfile(filepath):
1621 return False
1622 mode = stat.S_IMODE(os.stat(filepath).st_mode)
1623 owner_accessible = bool(mode & stat.S_IRWXU)
1624 group_accessible = bool(mode & stat.S_IRWXG)
1625 other_accessible = bool(mode & stat.S_IRWXO)
1626 return owner_accessible and not group_accessible and not other_accessible
def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`

    Notes
    -----
    It is expected that there is at least one response in `body`, otherwise
    this function raises `ValueError`.
    """
    # The body is a '{DAV:}multistatus' document in which every
    # '{DAV:}response' child describes one resource, e.g. (indented for
    # readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #   <D:response>
    #     <D:href>path/to/resource</D:href>
    #     <D:propstat>
    #       <D:prop>
    #         <D:resourcetype>
    #           <D:collection xmlns:D="DAV:"/>
    #         </D:resourcetype>
    #         <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
    #         <D:getcontentlength>12345</D:getcontentlength>
    #       </D:prop>
    #       <D:status>HTTP/1.1 200 OK</D:status>
    #     </D:propstat>
    #   </D:response>
    #   <D:response>
    #     ...
    #   </D:response>
    # </D:multistatus>

    # Build one DavProperty per 'response' element.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]
    if not responses:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return responses
class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.

    Parameters
    ----------
    response : `eTree.Element` or `None`
        The XML response defining the DAV property.
    """

    # Regular expression to compare against the 'status' element of a
    # PROPFIND response's 'propstat' element.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: eTree.Element | None):
        # Initialize with sentinel values; a content length of -1 means
        # "no length known", which `exists` interprets as a non-existent
        # file.
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    def _parse(self, response: eTree.Element) -> None:
        # Extract 'href'. It is the only property that must be present.
        if (element := response.find("./{DAV:}href")) is not None:
            # We need to use "str(element.text)" instead of "element.text" to
            # keep mypy happy.
            self._href = str(element.text).strip()
        else:
            raise ValueError(
                "Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties of interest with status OK.
            status = propstat.find("./{DAV:}status")
            if status is None or not self._status_ok_rex.match(str(status.text)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # Parse "collection": its presence marks a directory.
                if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None:
                    self._collection = True

                # Parse "getlastmodified".
                if (element := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = str(element.text)

                # Parse "getcontentlength".
                if (element := prop.find("./{DAV:}getcontentlength")) is not None:
                    self._getcontentlength = int(str(element.text))

                # Parse "displayname".
                if (element := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = str(element.text)

        # Some webDAV servers don't include the 'displayname' property in the
        # response so try to infer it from the value of the 'href' property.
        # Depending on the server the href value may end with '/'.
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Force a size of 0 for collections.
        if self._collection:
            self._getcontentlength = 0

    @property
    def exists(self) -> bool:
        """Whether the resource exists on the server."""
        # It is either a directory or a file with length of at least zero
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        """Whether the resource is a webDAV collection (directory)."""
        return self._collection

    @property
    def is_file(self) -> bool:
        """Whether the resource is a file (i.e. not a collection)."""
        return not self._collection

    @property
    def size(self) -> int:
        """Size in bytes of the resource; 0 for collections, -1 if
        unknown.
        """
        return self._getcontentlength

    @property
    def name(self) -> str:
        """Display name of the resource, inferred from 'href' when the
        server omits 'displayname'.
        """
        return self._displayname

    @property
    def href(self) -> str:
        """The 'href' value reported by the server for the resource."""
        return self._href