Coverage for python/lsst/resources/http.py: 23%
570 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-30 11:34 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-30 11:34 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import math
21import os
22import os.path
23import random
24import re
25import stat
26import tempfile
27from collections.abc import Iterator
28from typing import TYPE_CHECKING, BinaryIO, cast
30try:
31 # Prefer 'defusedxml' (not part of standard library) if available, since
32 # 'xml' is vulnerable to XML bombs.
33 import defusedxml.ElementTree as eTree
34except ImportError:
35 import xml.etree.ElementTree as eTree
37import requests
38from astropy import units as u
39from lsst.utils.timer import time_this
40from requests.adapters import HTTPAdapter
41from requests.auth import AuthBase
42from urllib3.util.retry import Retry
44from ._resourceHandles import ResourceHandleProtocol
45from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
46from ._resourcePath import ResourcePath
48if TYPE_CHECKING:
49 from .utils import TransactionProtocol
51log = logging.getLogger(__name__)
54def _timeout_from_environment(env_var: str, default_value: float) -> float:
55 """Convert and return a timeout from the value of an environment variable
56 or a default value if the environment variable is not initialized. The
57 value of `env_var` must be a valid `float` otherwise this function raises.
59 Parameters
60 ----------
61 env_var : `str`
62 Environment variable to look for.
63 default_value: `float``
64 Value to return if `env_var` is not defined in the environment.
66 Returns
67 -------
68 _timeout_from_environment : `float`
69 Converted value.
70 """
71 try:
72 timeout = float(os.environ.get(env_var, default_value))
73 except ValueError:
74 raise ValueError(
75 f"Expecting valid timeout value in environment variable {env_var} but found "
76 f"{os.environ.get(env_var)}"
77 ) from None
79 if math.isnan(timeout):
80 raise ValueError(f"Unexpected timeout value NaN found in environment variable {env_var}")
82 return timeout
class HttpResourcePathConfig:
    """Configuration class to encapsulate the configurable items used by class
    HttpResourcePath.

    All values are lazily read from the environment on first access of the
    corresponding property and then cached on the instance.
    """

    # Default timeouts for all HTTP requests (seconds).
    DEFAULT_TIMEOUT_CONNECT = 30.0
    DEFAULT_TIMEOUT_READ = 1_500.0

    # Default lower and upper bounds for the backoff interval (seconds).
    # A value in this interval is randomly selected as the backoff factor when
    # requests need to be retried.
    DEFAULT_BACKOFF_MIN = 1.0
    DEFAULT_BACKOFF_MAX = 3.0

    # Default number of connections to persist with both the front end and
    # back end servers.
    DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS = 2
    DEFAULT_BACKEND_PERSISTENT_CONNECTIONS = 1

    # Accepted digest algorithms.
    ACCEPTED_DIGESTS = ("adler32", "md5", "sha-256", "sha-512")

    # Cached values, computed lazily by the properties below.
    _front_end_connections: int | None = None
    _back_end_connections: int | None = None
    _digest_algorithm: str | None = None
    _send_expect_on_put: bool | None = None
    _timeout: tuple[float, float] | None = None
    _collect_memory_usage: bool | None = None
    _backoff_min: float | None = None
    _backoff_max: float | None = None

    @staticmethod
    def _int_from_environment(env_var: str, default_value: int) -> int:
        """Return the integer value of environment variable `env_var`, or
        `default_value` when the variable is unset or not a valid integer.
        """
        try:
            return int(os.environ.get(env_var, default_value))
        except ValueError:
            return default_value

    @staticmethod
    def _float_from_environment(env_var: str, default_value: float) -> float:
        """Return the float value of environment variable `env_var`, or
        `default_value` when the variable is unset, not a valid float, or NaN.
        """
        try:
            value = float(os.environ.get(env_var, default_value))
        except ValueError:
            return default_value
        # NaN is not a usable bound; fall back to the default.
        return default_value if math.isnan(value) else value

    @property
    def front_end_connections(self) -> int:
        """Number of persistent connections to the front end server."""
        if self._front_end_connections is None:
            self._front_end_connections = self._int_from_environment(
                "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS",
                self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS,
            )
        return self._front_end_connections

    @property
    def back_end_connections(self) -> int:
        """Number of persistent connections to the back end servers."""
        if self._back_end_connections is None:
            self._back_end_connections = self._int_from_environment(
                "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS",
                self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS,
            )
        return self._back_end_connections

    @property
    def digest_algorithm(self) -> str:
        """Algorithm to ask the server to use for computing and recording
        digests of each file contents in PUT requests.

        Returns
        -------
        digest_algorithm : `str`
            The name of a digest algorithm or the empty string if no
            algorithm is configured.
        """
        if self._digest_algorithm is None:
            digest = os.environ.get("LSST_HTTP_DIGEST", "").lower()
            # Silently ignore algorithms we do not support.
            self._digest_algorithm = digest if digest in self.ACCEPTED_DIGESTS else ""
        return self._digest_algorithm

    @property
    def send_expect_on_put(self) -> bool:
        """Return True if a "Expect: 100-continue" header is to be sent to
        the server on each PUT request.

        Some servers (e.g. dCache) uses this information as an indication that
        the client knows how to handle redirects to the specific server that
        will actually receive the data for PUT requests.
        """
        if self._send_expect_on_put is None:
            # Presence of the variable is what matters, not its value.
            self._send_expect_on_put = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
        return self._send_expect_on_put

    @property
    def timeout(self) -> tuple[float, float]:
        """Return a tuple with the values of timeouts for connecting to the
        server and reading its response, respectively. Both values are in
        seconds.
        """
        if self._timeout is None:
            self._timeout = (
                _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", self.DEFAULT_TIMEOUT_CONNECT),
                _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", self.DEFAULT_TIMEOUT_READ),
            )
        return self._timeout

    @property
    def collect_memory_usage(self) -> bool:
        """Return true if we want to collect memory usage when timing
        operations against the remote server via the `lsst.utils.time_this`
        context manager.
        """
        if self._collect_memory_usage is None:
            # Presence of the variable is what matters, not its value.
            self._collect_memory_usage = "LSST_HTTP_COLLECT_MEMORY_USAGE" in os.environ
        return self._collect_memory_usage

    @property
    def backoff_min(self) -> float:
        """Lower bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_min is None:
            self._backoff_min = self._float_from_environment(
                "LSST_HTTP_BACKOFF_MIN", self.DEFAULT_BACKOFF_MIN
            )
        return self._backoff_min

    @property
    def backoff_max(self) -> float:
        """Upper bound of the interval from which a backoff factor is randomly
        selected when retrying requests (seconds).
        """
        if self._backoff_max is None:
            self._backoff_max = self._float_from_environment(
                "LSST_HTTP_BACKOFF_MAX", self.DEFAULT_BACKOFF_MAX
            )
        return self._backoff_max
@functools.lru_cache
def _is_webdav_endpoint(path: ResourcePath | str) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        If the server certificate cannot be verified.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Send an OPTIONS request and inspect its response. An OPTIONS
    # request does not need authentication of the client, so we don't need
    # to provide a client certificate or a bearer token. We set a
    # relatively short timeout since an OPTIONS request is relatively cheap
    # for the server to compute.

    # Retry configuration for the probe session.
    retries = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=6,
        # How many connection-related errors to retry on.
        connect=3,
        # How many times to retry on read errors.
        read=3,
        # How many times to retry on bad status codes.
        status=5,
        # Set of uppercased HTTP method verbs that we should retry on.
        allowed_methods=frozenset(["OPTIONS"]),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ]
        ),
        # Whether to respect 'Retry-After' header on status codes defined
        # above.
        respect_retry_after_header=True,
    )

    try:
        session = requests.Session()
        session.mount(str(path), HTTPAdapter(max_retries=retries))
        session.verify = os.environ.get("LSST_HTTP_CACERT_BUNDLE", True)
        with session:
            resp = session.options(
                str(path),
                stream=False,
                timeout=(
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_CONNECT", 30.0),
                    _timeout_from_environment("LSST_HTTP_TIMEOUT_READ", 60.0),
                ),
            )
            if resp.status_code not in (requests.codes.ok, requests.codes.created):
                return False

            # Check that "1" is part of the value of the "DAV" header. We
            # don't use locks, so a server complying to class 1 is enough for
            # our purposes. All webDAV servers must advertise at least
            # compliance class "1".
            #
            # Compliance classes are documented in
            # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
            #
            # Examples of values for header DAV are:
            #    DAV: 1, 2
            #    DAV: 1, <http://apache.org/dav/propset/fs/1>
            if "DAV" not in resp.headers:
                return False

            # Convert to str to keep mypy happy.
            compliance_class = str(resp.headers.get("DAV"))
            return "1" in compliance_class.replace(" ", "").split(",")
    except requests.exceptions.SSLError:
        # Fix: the original message read "specify tha path".
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify the path to a bundle of certificate authorities you trust "
            "which are not included in the default set of trusted authorities "
            "of this system."
        )
        # Re-raise without rebinding to preserve the original traceback.
        raise
351# Tuple (path, block_size) pointing to the location of a local directory
352# to save temporary files and the block size of the underlying file system.
353_TMPDIR: tuple[str, int] | None = None
356def _get_temp_dir() -> tuple[str, int]:
357 """Return the temporary directory path and block size.
359 This function caches its results in _TMPDIR.
360 """
361 global _TMPDIR
362 if _TMPDIR:
363 return _TMPDIR
365 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
366 # 'TMPDIR', if defined. Otherwise use current working directory.
367 tmpdir = os.getcwd()
368 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
369 if dir and os.path.isdir(dir):
370 tmpdir = dir
371 break
373 # Compute the block size as 256 blocks of typical size
374 # (i.e. 4096 bytes) or 10 times the file system block size,
375 # whichever is higher. This is a reasonable compromise between
376 # using memory for buffering and the number of system calls
377 # issued to read from or write to temporary files.
378 fsstats = os.statvfs(tmpdir)
379 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096)))
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.
    """

    def __init__(self, token: str):
        # _token holds the token value; _path the token file path, if any.
        self._token = self._path = None
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            # Refuse token files readable by anyone but their owner.
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Reload the token from its file when the file has been modified
        since the previous read. No-op when the token was given literally.
        """
        if not self._path:
            return

        mtime = os.stat(self._path).st_mtime
        if mtime > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            self._mtime = mtime
            with open(self._path) as f:
                self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Only add a bearer token to a request when using secure HTTP.
        if self._token and req.url and req.url.lower().startswith("https://"):
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a reusable HTTP client session per endpoint."""

    def __init__(
        self,
        num_pools: int = 10,
        max_persistent_connections: int = 1,
        backoff_min: float = 1.0,
        backoff_max: float = 3.0,
    ) -> None:
        # Map from a root URI to the session serving that endpoint.
        self._sessions: dict[str, requests.Session] = {}

        # Number of connection pools to keep: there is one pool per remote
        # host. See documentation of urllib3 PoolManager class:
        # https://urllib3.readthedocs.io
        self._num_pools: int = num_pools

        # Maximum number of connections per remote host to persist in each
        # connection pool. See urllib3 Advanced Usage documentation:
        # https://urllib3.readthedocs.io/en/stable/advanced-usage.html
        self._max_persistent_connections: int = max_persistent_connections

        # Minimum and maximum values of the interval used to compute the
        # exponential backoff factor when retrying requests (seconds).
        # Ensure the upper bound is strictly greater than the lower bound.
        self._backoff_min: float = backoff_min
        self._backoff_max: float = backoff_max if backoff_max > backoff_min else backoff_min + 1.0

    def clear(self) -> None:
        """Destroy all previously created sessions and attempt to close
        underlying idle network connections.
        """
        # Closing a session should close its idle network connections as a
        # consequence. The Requests API exposes no way to force-close the
        # underlying open sockets.
        for session in self._sessions.values():
            session.close()
        self._sessions.clear()

    def get(self, rpath: ResourcePath) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that
        same endpoint. For instance, a single session will be cached and
        shared for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        key = str(rpath.root_uri())
        session = self._sessions.get(key)
        if session is None:
            # First request for this endpoint: build and cache a session.
            session = self._make_session(rpath)
            self._sessions[key] = session
        return session

    def _make_session(self, rpath: ResourcePath) -> requests.Session:
        """Make a new session configured from values from the environment."""
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s ...", root_uri)

        # Compute a random jitter for the backoff factor to prevent all the
        # clients from overwhelming the server by sending requests at the
        # same time.
        jittered_backoff = self._backoff_min + (self._backoff_max - self._backoff_min) * random.random()

        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=6,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds).
            backoff_factor=jittered_backoff,
            # How many times to retry on bad status codes.
            status=5,
            # Set of uppercased HTTP method verbs that we should retry on.
            # We only automatically retry idempotent requests.
            allowed_methods=frozenset(
                ["COPY", "DELETE", "GET", "HEAD", "MKCOL", "OPTIONS", "PROPFIND", "PUT"]
            ),
            # HTTP status codes that we should force a retry on.
            status_forcelist=frozenset(
                [
                    requests.codes.too_many_requests,  # 429
                    requests.codes.internal_server_error,  # 500
                    requests.codes.bad_gateway,  # 502
                    requests.codes.service_unavailable,  # 503
                    requests.codes.gateway_timeout,  # 504
                ]
            ),
            # Whether to respect Retry-After header on status codes defined
            # above.
            respect_retry_after_header=True,
        )

        session = requests.Session()

        # Persist the specified number of connections to the front end
        # server.
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=self._max_persistent_connections,
                pool_block=False,
                max_retries=retries,
            ),
        )

        # Do not persist the connections to back end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are
        # thousands of simultaneous clients.
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(
                pool_connections=self._num_pools,
                pool_maxsize=0,
                pool_block=False,
                max_retries=retries,
            ),
        )

        if rpath.scheme != "https":
            # Without secure HTTP we neither include bearer tokens in the
            # requests nor need to authenticate the remote server.
            return session

        # Use a specific CA cert bundle for authenticating the server, when
        # configured; otherwise rely on the system's trusted authorities.
        session.verify = os.getenv("LSST_HTTP_CACERT_BUNDLE") or True

        # Bearer token authentication takes precedence over client
        # certificates.
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Client certificate authentication requires both the certificate
        # and the private key.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
642class HttpResourcePath(ResourcePath):
643 """General HTTP(S) resource.
645 Notes
646 -----
647 In order to configure the behavior of instances of this class, the
648 environment variables below are inspected:
650 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
651 "Expect: 100-Continue" header will be added to all HTTP PUT requests.
652 This header is required by some servers to detect if the client
653 knows how to handle redirections. In case of redirection, the body
654 of the PUT request is sent to the redirected location and not to
655 the front end server.
657 - LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ: if set to a
658 numeric value, they are interpreted as the number of seconds to wait
659 for establishing a connection with the server and for reading its
660 response, respectively.
662 - LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS and
663 LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS: contain the maximum number
664 of connections to attempt to persist with both the front end servers
665 and the back end servers.
666 Default values: DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS and
667 DEFAULT_BACKEND_PERSISTENT_CONNECTIONS.
669 - LSST_HTTP_DIGEST: case-insensitive name of the digest algorithm to
670 ask the server to compute for every file's content sent to the server
671 via a PUT request. No digest is requested if this variable is not set
672 or is set to an invalid value.
673 Valid values are those in ACCEPTED_DIGESTS.
674 """
676 _is_webdav: bool | None = None
678 # Configuration items for this class instances.
679 _config = HttpResourcePathConfig()
681 # The session for metadata requests is used for interacting with
682 # the front end servers for requests such as PROPFIND, HEAD, etc. Those
683 # interactions are typically served by the front end servers. We want to
684 # keep the connection to the front end servers open, to reduce the cost
685 # associated to TCP and TLS handshaking for each new request.
686 _metadata_session_store = SessionStore(
687 num_pools=5,
688 max_persistent_connections=_config.front_end_connections,
689 backoff_min=_config.backoff_min,
690 backoff_max=_config.backoff_max,
691 )
693 # The data session is used for interaction with the front end servers which
694 # typically redirect to the back end servers for serving our PUT and GET
695 # requests. We attempt to keep a single connection open with the front end
696 # server, if possible. This depends on how the server behaves and the
697 # kind of request. Some servers close the connection when redirecting
698 # the client to a back end server, for instance when serving a PUT
699 # request.
700 _data_session_store = SessionStore(
701 num_pools=25,
702 max_persistent_connections=_config.back_end_connections,
703 backoff_min=_config.backoff_min,
704 backoff_max=_config.backoff_max,
705 )
707 # Process ID which created the session stores above. We need to store this
708 # to replace sessions created by a parent process and inherited by a
709 # child process after a fork, to avoid confusing the SSL layer.
710 _pid: int = -1
712 @property
713 def metadata_session(self) -> requests.Session:
714 """Client session to send requests which do not require upload or
715 download of data, i.e. mostly metadata requests.
716 """
717 if hasattr(self, "_metadata_session"):
718 if HttpResourcePath._pid == os.getpid():
719 return self._metadata_session
720 else:
721 # The metadata session we have in cache was likely created by
722 # a parent process. Discard all the sessions in that store.
723 self._metadata_session_store.clear()
725 # Retrieve a new metadata session.
726 HttpResourcePath._pid = os.getpid()
727 self._metadata_session: requests.Session = self._metadata_session_store.get(self)
728 return self._metadata_session
730 @property
731 def data_session(self) -> requests.Session:
732 """Client session for uploading and downloading data."""
733 if hasattr(self, "_data_session"):
734 if HttpResourcePath._pid == os.getpid():
735 return self._data_session
736 else:
737 # The data session we have in cache was likely created by
738 # a parent process. Discard all the sessions in that store.
739 self._data_session_store.clear()
741 # Retrieve a new data session.
742 HttpResourcePath._pid = os.getpid()
743 self._data_session: requests.Session = self._data_session_store.get(self)
744 return self._data_session
746 def _clear_sessions(self) -> None:
747 """Close the socket connections that are still open.
749 Used only in test suites to avoid warnings.
750 """
751 self._metadata_session_store.clear()
752 self._data_session_store.clear()
754 if hasattr(self, "_metadata_session"):
755 delattr(self, "_metadata_session")
757 if hasattr(self, "_data_session"):
758 delattr(self, "_data_session")
760 @property
761 def is_webdav_endpoint(self) -> bool:
762 """Check if the current endpoint implements WebDAV features.
764 This is stored per URI but cached by root so there is
765 only one check per hostname.
766 """
767 if self._is_webdav is not None:
768 return self._is_webdav
770 self._is_webdav = _is_webdav_endpoint(self.root_uri())
771 return self._is_webdav
773 def exists(self) -> bool:
774 """Check that a remote HTTP resource exists."""
775 log.debug("Checking if resource exists: %s", self.geturl())
776 if not self.is_webdav_endpoint:
777 # The remote is a plain HTTP server. Let's attempt a HEAD
778 # request, even if the behavior for such a request against a
779 # directory is not specified, so it depends on the server
780 # implementation.
781 resp = self.metadata_session.head(
782 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
783 )
784 return resp.status_code == requests.codes.ok # 200
786 # The remote endpoint is a webDAV server: send a PROPFIND request
787 # to determine if it exists.
788 resp = self._propfind()
789 if resp.status_code == requests.codes.multi_status: # 207
790 prop = _parse_propfind_response_body(resp.text)[0]
791 return prop.exists
792 else: # 404 Not Found
793 return False
795 def size(self) -> int:
796 """Return the size of the remote resource in bytes."""
797 if self.dirLike:
798 return 0
800 if not self.is_webdav_endpoint:
801 # The remote is a plain HTTP server. Send a HEAD request to
802 # retrieve the size of the resource.
803 resp = self.metadata_session.head(
804 self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
805 )
806 if resp.status_code == requests.codes.ok: # 200
807 if "Content-Length" in resp.headers:
808 return int(resp.headers["Content-Length"])
809 else:
810 raise ValueError(
811 f"Response to HEAD request to {self} does not contain 'Content-Length' header"
812 )
813 elif resp.status_code == requests.codes.not_found:
814 raise FileNotFoundError(
815 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
816 )
817 else:
818 raise ValueError(
819 f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
820 f"{resp.reason}"
821 )
823 # The remote is a webDAV server: send a PROPFIND request to retrieve
824 # the size of the resource. Sizes are only meaningful for files.
825 resp = self._propfind()
826 if resp.status_code == requests.codes.multi_status: # 207
827 prop = _parse_propfind_response_body(resp.text)[0]
828 if prop.is_file:
829 return prop.size
830 elif prop.is_directory:
831 raise IsADirectoryError(
832 f"Resource {self} is reported by server as a directory but has a file path"
833 )
834 else:
835 raise FileNotFoundError(f"Resource {self} does not exist")
836 else: # 404 Not Found
837 raise FileNotFoundError(
838 f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
839 )
841 def mkdir(self) -> None:
842 """Create the directory resource if it does not already exist."""
843 # Creating directories is only available on WebDAV back ends.
844 if not self.is_webdav_endpoint:
845 raise NotImplementedError(
846 f"Creation of directory {self} is not implemented by plain HTTP servers"
847 )
849 if not self.dirLike:
850 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")
852 # Check if the target directory already exists.
853 resp = self._propfind()
854 if resp.status_code == requests.codes.multi_status: # 207
855 prop = _parse_propfind_response_body(resp.text)[0]
856 if prop.exists:
857 if prop.is_directory:
858 return
859 else:
860 # A file exists at this path
861 raise NotADirectoryError(
862 f"Can not create a directory for {self} because a file already exists at that path"
863 )
865 # Target directory does not exist. Create it and its ancestors as
866 # needed. We need to test if parent URL is different from self URL,
867 # otherwise we could be stuck in a recursive loop
868 # where self == parent.
869 if self.geturl() != self.parent().geturl():
870 self.parent().mkdir()
872 log.debug("Creating new directory: %s", self.geturl())
873 self._mkcol()
875 def remove(self) -> None:
876 """Remove the resource."""
877 self._delete()
879 def read(self, size: int = -1) -> bytes:
880 """Open the resource and return the contents in bytes.
882 Parameters
883 ----------
884 size : `int`, optional
885 The number of bytes to read. Negative or omitted indicates
886 that all data should be read.
887 """
888 # Use the data session as a context manager to ensure that the
889 # network connections to both the front end and back end servers are
890 # closed after downloading the data.
891 log.debug("Reading from remote resource: %s", self.geturl())
892 stream = size > 0
893 with self.data_session as session:
894 with time_this(log, msg="GET %s", args=(self,)):
895 resp = session.get(self.geturl(), stream=stream, timeout=self._config.timeout)
897 if resp.status_code != requests.codes.ok: # 200
898 raise FileNotFoundError(
899 f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
900 )
901 if not stream:
902 return resp.content
903 else:
904 return next(resp.iter_content(chunk_size=size))
906 def write(self, data: bytes, overwrite: bool = True) -> None:
907 """Write the supplied bytes to the new resource.
909 Parameters
910 ----------
911 data : `bytes`
912 The bytes to write to the resource. The entire contents of the
913 resource will be replaced.
914 overwrite : `bool`, optional
915 If `True` the resource will be overwritten if it exists. Otherwise
916 the write will fail.
917 """
918 log.debug("Writing to remote resource: %s", self.geturl())
919 if not overwrite and self.exists():
920 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
922 # Ensure the parent directory exists.
923 self.parent().mkdir()
925 # Upload the data.
926 log.debug("Writing data to remote resource: %s", self.geturl())
927 self._put(data=data)
    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            If `True`, an existing destination is overwritten. Otherwise a
            pre-existing destination raises `FileExistsError`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            If the transfer mode is not supported by this URI scheme.
        FileExistsError
            If the destination exists and `overwrite` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested.
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit immediately if the URIs are identical.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # We can use webDAV 'COPY' or 'MOVE' if both the current and source
        # resources are located in the same server.
        if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
            log.debug("Transfer from %s to %s directly", src, self)
            return self._move(src) if transfer == "move" else self._copy(src)

        # For resources of different classes or for plain HTTP resources we can
        # perform the copy or move operation by downloading to a local file
        # and uploading to the destination.
        self._copy_via_local(src)

        # This was an explicit move, try to remove the source.
        if transfer == "move":
            src.remove()
994 def walk(
995 self, file_filter: str | re.Pattern | None = None
996 ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
997 """Walk the directory tree returning matching files and directories.
999 Parameters
1000 ----------
1001 file_filter : `str` or `re.Pattern`, optional
1002 Regex to filter out files from the list before it is returned.
1004 Yields
1005 ------
1006 dirpath : `ResourcePath`
1007 Current directory being examined.
1008 dirnames : `list` of `str`
1009 Names of subdirectories within dirpath.
1010 filenames : `list` of `str`
1011 Names of all the files within dirpath.
1012 """
1013 if not self.dirLike:
1014 raise ValueError("Can not walk a non-directory URI")
1016 # Walking directories is only available on WebDAV back ends.
1017 if not self.is_webdav_endpoint:
1018 raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")
1020 if isinstance(file_filter, str):
1021 file_filter = re.compile(file_filter)
1023 resp = self._propfind(depth="1")
1024 if resp.status_code == requests.codes.multi_status: # 207
1025 files: list[str] = []
1026 dirs: list[str] = []
1028 for prop in _parse_propfind_response_body(resp.text):
1029 if prop.is_file:
1030 files.append(prop.name)
1031 elif not prop.href.rstrip("/").endswith(self.path.rstrip("/")):
1032 # Only include the names of sub-directories not the name of
1033 # the directory being walked.
1034 dirs.append(prop.name)
1036 if file_filter is not None:
1037 files = [f for f in files if file_filter.search(f)]
1039 if not dirs and not files:
1040 return
1041 else:
1042 yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files
1044 for dir in dirs:
1045 new_uri = self.join(dir, forceDirectory=True)
1046 yield from new_uri.walk(file_filter)
1048 def _as_local(self) -> tuple[str, bool]:
1049 """Download object over HTTP and place in temporary directory.
1051 Returns
1052 -------
1053 path : `str`
1054 Path to local temporary file.
1055 temporary : `bool`
1056 Always returns `True`. This is always a temporary file.
1057 """
1058 # Use the session as a context manager to ensure that connections
1059 # to both the front end and back end servers are closed after the
1060 # download operation is finished.
1061 with self.data_session as session:
1062 resp = session.get(self.geturl(), stream=True, timeout=self._config.timeout)
1063 if resp.status_code != requests.codes.ok:
1064 raise FileNotFoundError(
1065 f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
1066 )
1068 tmpdir, buffering = _get_temp_dir()
1069 with tempfile.NamedTemporaryFile(
1070 suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
1071 ) as tmpFile:
1072 expected_length = int(resp.headers.get("Content-Length", "-1"))
1073 with time_this(
1074 log,
1075 msg="GET %s [length=%d] to local file %s [chunk_size=%d]",
1076 args=(self, expected_length, tmpFile.name, buffering),
1077 mem_usage=self._config.collect_memory_usage,
1078 mem_unit=u.mebibyte,
1079 ):
1080 content_length = 0
1081 for chunk in resp.iter_content(chunk_size=buffering):
1082 tmpFile.write(chunk)
1083 content_length += len(chunk)
1085 # Check that the expected and actual content lengths match. Perform
1086 # this check only when the contents of the file was not encoded by
1087 # the server.
1088 if (
1089 "Content-Encoding" not in resp.headers
1090 and expected_length >= 0
1091 and expected_length != content_length
1092 ):
1093 raise ValueError(
1094 f"Size of downloaded file does not match value in Content-Length header for {self}: "
1095 f"expecting {expected_length} and got {content_length} bytes"
1096 )
1098 return tmpFile.name, True
    def _send_webdav_request(
        self,
        method: str,
        url: str | None = None,
        headers: dict[str, str] | None = None,
        body: str | None = None,
        session: requests.Session | None = None,
        timeout: tuple[float, float] | None = None,
    ) -> requests.Response:
        """Send a webDAV request and correctly handle redirects.

        Parameters
        ----------
        method : `str`
            The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
        url : `str`, optional
            The URL to send the request to. If not provided, this resource's
            own URL is used.
        headers : `dict`, optional
            A dictionary of key-value pairs (both strings) to include as
            headers in the request.
        body : `str`, optional
            The body of the request.
        session : `requests.Session`, optional
            The session used to send the request. If not provided, the
            metadata session of this resource is used.
        timeout : `tuple` of `float`, optional
            The (connect, read) timeouts for the request. If not provided,
            this resource's configured timeouts are used.

        Returns
        -------
        response : `requests.Response`
            The first non-redirect response received from the server.

        Raises
        ------
        ValueError
            If no non-redirect response was received after the maximum
            allowed number of redirections.

        Notes
        -----
        This way of sending webDAV requests is necessary for handling
        redirection ourselves, since the 'requests' package changes the method
        of the redirected request when the server responds with status 302 and
        the method of the original request is not HEAD (which is the case for
        webDAV requests).

        That means that when the webDAV server we interact with responds with
        a redirection to a PROPFIND or MKCOL request, the request gets
        converted to a GET request when sent to the redirected location.

        See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

        This behavior of the 'requests' package is meant to be compatible with
        what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

        For our purposes, we do need to follow the redirection and send a new
        request using the same HTTP verb.
        """
        if url is None:
            url = self.geturl()

        if headers is None:
            headers = {}

        if session is None:
            session = self.metadata_session

        if timeout is None:
            timeout = self._config.timeout

        with time_this(
            log,
            msg="%s %s",
            args=(
                method,
                url,
            ),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            for _ in range(max_redirects := 5):
                resp = session.request(
                    method,
                    url,
                    data=body,
                    headers=headers,
                    stream=False,
                    timeout=timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    # Follow the redirection ourselves, re-sending the same
                    # HTTP verb to the new location (see Notes above).
                    url = resp.headers["Location"]
                else:
                    return resp

            # We reached the maximum allowed number of redirects.
            # Stop trying.
            raise ValueError(
                f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
            )
1187 def _propfind(self, body: str | None = None, depth: str = "0") -> requests.Response:
1188 """Send a PROPFIND webDAV request and return the response.
1190 Parameters
1191 ----------
1192 body : `str`, optional
1193 The body of the PROPFIND request to send to the server. If
1194 provided, it is expected to be a XML document.
1195 depth : `str`, optional
1196 The value of the 'Depth' header to include in the request.
1198 Returns
1199 -------
1200 response : `requests.Response`
1201 Response to the PROPFIND request.
1203 Notes
1204 -----
1205 It raises `ValueError` if the status code of the PROPFIND request
1206 is different from "207 Multistatus" or "404 Not Found".
1207 """
1208 if body is None:
1209 # Request only the DAV live properties we are explicitly interested
1210 # in namely 'resourcetype', 'getcontentlength', 'getlastmodified'
1211 # and 'displayname'.
1212 body = (
1213 """<?xml version="1.0" encoding="utf-8" ?>"""
1214 """<D:propfind xmlns:D="DAV:"><D:prop>"""
1215 """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
1216 """</D:prop></D:propfind>"""
1217 )
1218 headers = {
1219 "Depth": depth,
1220 "Content-Type": 'application/xml; charset="utf-8"',
1221 "Content-Length": str(len(body)),
1222 }
1223 resp = self._send_webdav_request("PROPFIND", headers=headers, body=body)
1224 if resp.status_code in (requests.codes.multi_status, requests.codes.not_found):
1225 return resp
1226 else:
1227 raise ValueError(
1228 f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
1229 f"{resp.reason}"
1230 )
1232 def _options(self) -> requests.Response:
1233 """Send a OPTIONS webDAV request for this resource."""
1234 resp = self._send_webdav_request("OPTIONS")
1235 if resp.status_code in (requests.codes.ok, requests.codes.created):
1236 return resp
1238 raise ValueError(
1239 f"Unexpected response to OPTIONS request for {self}, status: {resp.status_code} {resp.reason}"
1240 )
1242 def _head(self) -> requests.Response:
1243 """Send a HEAD webDAV request for this resource."""
1244 return self._send_webdav_request("HEAD")
1246 def _mkcol(self) -> None:
1247 """Send a MKCOL webDAV request to create a collection. The collection
1248 may already exist.
1249 """
1250 resp = self._send_webdav_request("MKCOL")
1251 if resp.status_code == requests.codes.created: # 201
1252 return
1254 if resp.status_code == requests.codes.method_not_allowed: # 405
1255 # The remote directory already exists
1256 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
1257 else:
1258 raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")
1260 def _delete(self) -> None:
1261 """Send a DELETE webDAV request for this resource."""
1262 log.debug("Deleting %s ...", self.geturl())
1264 # If this is a directory, ensure the remote is a webDAV server because
1265 # plain HTTP servers don't support DELETE requests on non-file
1266 # paths.
1267 if self.dirLike and not self.is_webdav_endpoint:
1268 raise NotImplementedError(
1269 f"Deletion of directory {self} is not implemented by plain HTTP servers"
1270 )
1272 # Deleting non-empty directories may take some time, so increase
1273 # the timeout for getting a response from the server.
1274 timeout = self._config.timeout
1275 if self.dirLike:
1276 timeout = (timeout[0], timeout[1] * 100)
1277 resp = self._send_webdav_request("DELETE", timeout=timeout)
1278 if resp.status_code in (
1279 requests.codes.ok,
1280 requests.codes.accepted,
1281 requests.codes.no_content,
1282 requests.codes.not_found,
1283 ):
1284 # We can get a "404 Not Found" error when the file or directory
1285 # does not exist or when the DELETE request was retried several
1286 # times and a previous attempt actually deleted the resource.
1287 # Therefore we consider that a "Not Found" response is not an
1288 # error since we reached the state desired by the user.
1289 return
1290 else:
1291 # TODO: the response to a DELETE request against a webDAV server
1292 # may be multistatus. If so, we need to parse the reponse body to
1293 # determine more precisely the reason of the failure (e.g. a lock)
1294 # and provide a more helpful error message.
1295 raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")
1297 def _copy_via_local(self, src: ResourcePath) -> None:
1298 """Replace the contents of this resource with the contents of a remote
1299 resource by using a local temporary file.
1301 Parameters
1302 ----------
1303 src : `HttpResourcePath`
1304 The source of the contents to copy to `self`.
1305 """
1306 with src.as_local() as local_uri:
1307 log.debug("Transfer from %s to %s via local file %s", src, self, local_uri)
1308 with open(local_uri.ospath, "rb") as f:
1309 self._put(data=f)
1311 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
1312 """Send a COPY or MOVE webDAV request to copy or replace the contents
1313 of this resource with the contents of another resource located in the
1314 same server.
1316 Parameters
1317 ----------
1318 method : `str`
1319 The method to perform. Valid values are "COPY" or "MOVE" (in
1320 uppercase).
1322 src : `HttpResourcePath`
1323 The source of the contents to move to `self`.
1324 """
1325 headers = {"Destination": self.geturl()}
1326 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers, session=self.data_session)
1327 if resp.status_code in (requests.codes.created, requests.codes.no_content):
1328 return
1330 if resp.status_code == requests.codes.multi_status:
1331 tree = eTree.fromstring(resp.content)
1332 status_element = tree.find("./{DAV:}response/{DAV:}status")
1333 status = status_element.text if status_element is not None else "unknown"
1334 error = tree.find("./{DAV:}response/{DAV:}error")
1335 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
1336 else:
1337 raise ValueError(
1338 f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
1339 )
1341 def _copy(self, src: HttpResourcePath) -> None:
1342 """Send a COPY webDAV request to replace the contents of this resource
1343 (if any) with the contents of another resource located in the same
1344 server.
1346 Parameters
1347 ----------
1348 src : `HttpResourcePath`
1349 The source of the contents to copy to `self`.
1350 """
1351 # Neither dCache nor XrootD currently implement the COPY
1352 # webDAV method as documented in
1353 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
1354 # (See issues DM-37603 and DM-37651 for details)
1355 #
1356 # For the time being, we use a temporary local file to
1357 # perform the copy client side.
1358 # TODO: when those 2 issues above are solved remove the 3 lines below.
1359 must_use_local = True
1360 if must_use_local:
1361 return self._copy_via_local(src)
1363 return self._copy_or_move("COPY", src)
1365 def _move(self, src: HttpResourcePath) -> None:
1366 """Send a MOVE webDAV request to replace the contents of this resource
1367 with the contents of another resource located in the same server.
1369 Parameters
1370 ----------
1371 src : `HttpResourcePath`
1372 The source of the contents to move to `self`.
1373 """
1374 return self._copy_or_move("MOVE", src)
    def _put(self, data: BinaryIO | bytes) -> None:
        """Perform an HTTP PUT request and handle redirection.

        Parameters
        ----------
        data : `Union[BinaryIO, bytes]`
            The data to be included in the body of the PUT request.

        Raises
        ------
        ValueError
            If the final PUT request did not succeed.
        """
        # Retrieve the final URL for this upload by sending a PUT request with
        # no content. Follow a single server redirection to retrieve the
        # final URL.
        headers = {"Content-Length": "0"}
        if self._config.send_expect_on_put:
            # Ask the server to answer (e.g. with a redirection) before we
            # send any payload.
            headers["Expect"] = "100-continue"

        url = self.geturl()

        # Use the session as a context manager to ensure the underlying
        # connections are closed after finishing uploading the data.
        with self.data_session as session:
            # Send an empty PUT request to get redirected to the final
            # destination.
            log.debug("Sending empty PUT request to %s", url)
            with time_this(
                log,
                msg="PUT (no data) %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=None,
                    headers=headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.is_redirect:
                    # Upload directly to the redirected location rather than
                    # the original front end URL.
                    url = resp.headers["Location"]

            # Upload the data to the final destination.
            log.debug("Uploading data to %s", url)

            # Ask the server to compute and record a checksum of the uploaded
            # file contents, for later integrity checks. Since we don't compute
            # the digest ourselves while uploading the data, we cannot control
            # after the request is complete that the data we uploaded is
            # identical to the data recorded by the server, but at least the
            # server has recorded a digest of the data it stored.
            #
            # See RFC-3230 for details and
            # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
            # for the list of supported digest algorithms.
            # In addition, note that not all servers implement this RFC so
            # the checksum may not be computed by the server.
            put_headers: dict[str, str] | None = None
            if digest := self._config.digest_algorithm:
                put_headers = {"Want-Digest": digest}

            with time_this(
                log,
                msg="PUT %s",
                args=(url,),
                mem_usage=self._config.collect_memory_usage,
                mem_unit=u.mebibyte,
            ):
                resp = session.request(
                    "PUT",
                    url,
                    data=data,
                    headers=put_headers,
                    stream=False,
                    timeout=self._config.timeout,
                    allow_redirects=False,
                )
                if resp.status_code in (
                    requests.codes.ok,
                    requests.codes.created,
                    requests.codes.no_content,
                ):
                    return
                else:
                    raise ValueError(f"Can not write file {self}, status: {resp.status_code} {resp.reason}")
1462 @contextlib.contextmanager
1463 def _openImpl(
1464 self,
1465 mode: str = "r",
1466 *,
1467 encoding: str | None = None,
1468 ) -> Iterator[ResourceHandleProtocol]:
1469 resp = self._head()
1470 accepts_range = resp.status_code == requests.codes.ok and resp.headers.get("Accept-Ranges") == "bytes"
1471 handle: ResourceHandleProtocol
1472 if mode in ("rb", "r") and accepts_range:
1473 handle = HttpReadResourceHandle(
1474 mode, log, url=self.geturl(), session=self.data_session, timeout=self._config.timeout
1475 )
1476 if mode == "r":
1477 # cast because the protocol is compatible, but does not have
1478 # BytesIO in the inheritance tree
1479 yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
1480 else:
1481 yield handle
1482 else:
1483 with super()._openImpl(mode, encoding=encoding) as http_handle:
1484 yield http_handle
def _dump_response(resp: requests.Response) -> None:
    """Log the contents of a HTTP or webDAV request and its response.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log.

    Notes
    -----
    Intended for development purposes only.
    """
    req = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", req.method)
    log.debug(" URL=%s", req.url)
    log.debug(" headers=%s", req.headers)
    if req.method == "PUT":
        log.debug(" body=<data>")
    elif req.body is None:
        log.debug(" body=<empty>")
    else:
        log.debug(" body=%r", req.body[:120])

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    if not resp.content:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        log.debug(" body=%r", resp.content)
    else:
        log.debug(" body=%r", resp.content[:80])
1522def _is_protected(filepath: str) -> bool:
1523 """Return true if the permissions of file at filepath only allow for access
1524 by its owner.
1526 Parameters
1527 ----------
1528 filepath : `str`
1529 Path of a local file.
1530 """
1531 if not os.path.isfile(filepath):
1532 return False
1533 mode = stat.S_IMODE(os.stat(filepath).st_mode)
1534 owner_accessible = bool(mode & stat.S_IRWXU)
1535 group_accessible = bool(mode & stat.S_IRWXG)
1536 other_accessible = bool(mode & stat.S_IRWXO)
1537 return owner_accessible and not group_accessible and not other_accessible
def _parse_propfind_response_body(body: str) -> list[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`

    Raises
    ------
    ValueError
        If no 'response' element could be extracted from `body`.

    Notes
    -----
    It is expected that there is at least one response in `body`, otherwise
    this function raises. A PROPFIND response body has the form (indented
    for readability)::

        <?xml version="1.0" encoding="UTF-8"?>
        <D:multistatus xmlns:D="DAV:">
          <D:response>
            <D:href>path/to/resource</D:href>
            <D:propstat>
              <D:prop>
                <D:resourcetype>
                  <D:collection xmlns:D="DAV:"/>
                </D:resourcetype>
                <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
                <D:getcontentlength>12345</D:getcontentlength>
              </D:prop>
              <D:status>HTTP/1.1 200 OK</D:status>
            </D:propstat>
          </D:response>
          <D:response>
            ...
          </D:response>
        </D:multistatus>
    """
    # Build one DavProperty per 'response' element found in the body.
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(element) for element in multistatus.findall("./{DAV:}response")]
    if not responses:
        # Could not parse the body
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")
    return responses
1603class DavProperty:
1604 """Helper class to encapsulate select live DAV properties of a single
1605 resource, as retrieved via a PROPFIND request.
1606 """
1608 # Regular expression to compare against the 'status' element of a
1609 # PROPFIND response's 'propstat' element.
1610 _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)
1612 def __init__(self, response: eTree.Element | None):
1613 self._href: str = ""
1614 self._displayname: str = ""
1615 self._collection: bool = False
1616 self._getlastmodified: str = ""
1617 self._getcontentlength: int = -1
1619 if response is not None:
1620 self._parse(response)
1622 def _parse(self, response: eTree.Element) -> None:
1623 # Extract 'href'.
1624 if (element := response.find("./{DAV:}href")) is not None:
1625 # We need to use "str(element.text)"" instead of "element.text" to
1626 # keep mypy happy.
1627 self._href = str(element.text).strip()
1628 else:
1629 raise ValueError(
1630 "Property 'href' expected but not found in PROPFIND response: "
1631 f"{eTree.tostring(response, encoding='unicode')}"
1632 )
1634 for propstat in response.findall("./{DAV:}propstat"):
1635 # Only extract properties of interest with status OK.
1636 status = propstat.find("./{DAV:}status")
1637 if status is None or not self._status_ok_rex.match(str(status.text)):
1638 continue
1640 for prop in propstat.findall("./{DAV:}prop"):
1641 # Parse "collection".
1642 if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None:
1643 self._collection = True
1645 # Parse "getlastmodified".
1646 if (element := prop.find("./{DAV:}getlastmodified")) is not None:
1647 self._getlastmodified = str(element.text)
1649 # Parse "getcontentlength".
1650 if (element := prop.find("./{DAV:}getcontentlength")) is not None:
1651 self._getcontentlength = int(str(element.text))
1653 # Parse "displayname".
1654 if (element := prop.find("./{DAV:}displayname")) is not None:
1655 self._displayname = str(element.text)
1657 # Some webDAV servers don't include the 'displayname' property in the
1658 # response so try to infer it from the value of the 'href' property.
1659 # Depending on the server the href value may end with '/'.
1660 if not self._displayname:
1661 self._displayname = os.path.basename(self._href.rstrip("/"))
1663 # Force a size of 0 for collections.
1664 if self._collection:
1665 self._getcontentlength = 0
1667 @property
1668 def exists(self) -> bool:
1669 # It is either a directory or a file with length of at least zero
1670 return self._collection or self._getcontentlength >= 0
1672 @property
1673 def is_directory(self) -> bool:
1674 return self._collection
1676 @property
1677 def is_file(self) -> bool:
1678 return not self._collection
1680 @property
1681 def size(self) -> int:
1682 return self._getcontentlength
1684 @property
1685 def name(self) -> str:
1686 return self._displayname
1688 @property
1689 def href(self) -> str:
1690 return self._href