Coverage for python/lsst/resources/http.py: 15%
428 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-03-09 03:06 -0800
« prev ^ index » next coverage.py v6.5.0, created at 2023-03-09 03:06 -0800
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import os
21import os.path
22import random
23import re
24import stat
25import tempfile
26import xml.etree.ElementTree as eTree
27from typing import TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast
29import requests
30from astropy import units as u
31from lsst.utils.timer import time_this
32from requests.adapters import HTTPAdapter
33from requests.auth import AuthBase
34from urllib3.util.retry import Retry
36from ._resourceHandles import ResourceHandleProtocol
37from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
38from ._resourcePath import ResourcePath
40if TYPE_CHECKING: 40 ↛ 41line 40 didn't jump to line 41, because the condition on line 40 was never true
41 from .utils import TransactionProtocol
log = logging.getLogger(__name__)


# Fallback connect and read timeouts, in seconds, used for HTTP requests
# when the environment does not override them.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300

# (connect, read) timeout pair. Each member can be overridden via the
# environment variables LSST_HTTP_TIMEOUT_CONNECT and LSST_HTTP_TIMEOUT_READ,
# whose values must be parseable as integers.
TIMEOUT = (
    int(os.getenv("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)),
    int(os.getenv("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)),
)

# Whether an "Expect: 100-continue" header is sent with PUT requests.
# Some servers (e.g. dCache) interpret that header as an indication that the
# client knows how to handle a redirect to the specific server that will
# actually receive the data of the PUT request. The mere presence of the
# environment variable enables the feature; its value is not inspected.
_SEND_EXPECT_HEADER_ON_PUT = os.environ.get("LSST_HTTP_PUT_SEND_EXPECT_HEADER") is not None
@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        Re-raised, after logging a hint about ``LSST_HTTP_CACERT_BUNDLE``,
        if the server certificate can not be verified.

    Notes
    -----
    The result is memoized per ``path`` value by `functools.lru_cache`.
    The request is bounded by the module-level ``TIMEOUT`` so that an
    unresponsive server can not block the caller indefinitely.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True
    try:
        resp = requests.options(str(path), verify=verify, stream=True, timeout=TIMEOUT)
    except requests.exceptions.SSLError:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify a bundle of certificate authorities you trust which are "
            "not included in the default set of trusted authorities of your "
            "system."
        )
        raise

    try:
        # Check that "1" is part of the value of the "DAV" header. We don't
        # use locks, so a server complying to class 1 is enough for our
        # purposes. All webDAV servers must advertise at least compliance
        # class "1".
        #
        # Compliance classes are documented in
        #    http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #   DAV: 1, 2
        #   DAV: 1, <http://apache.org/dav/propset/fs/1>
        dav_header = resp.headers.get("DAV")
        if dav_header is None:
            return False

        # Convert to str to keep mypy happy
        compliance_classes = str(dav_header).replace(" ", "").split(",")
        return "1" in compliance_classes
    finally:
        # The response was requested with stream=True and its body is never
        # read: close it explicitly so the underlying connection is released.
        resp.close()
113# Tuple (path, block_size) pointing to the location of a local directory
114# to save temporary files and the block size of the underlying file system.
115_TMPDIR: Optional[Tuple[str, int]] = None
118def _get_temp_dir() -> Tuple[str, int]:
119 """Return the temporary directory path and block size.
121 This function caches its results in _TMPDIR.
122 """
123 global _TMPDIR
124 if _TMPDIR:
125 return _TMPDIR
127 # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
128 # 'TMPDIR', if defined. Otherwise use current working directory.
129 tmpdir = os.getcwd()
130 for dir in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
131 if dir and os.path.isdir(dir):
132 tmpdir = dir
133 break
135 # Compute the block size as 256 blocks of typical size
136 # (i.e. 4096 bytes) or 10 times the file system block size,
137 # whichever is higher. This is a reasonable compromise between
138 # using memory for buffering and the number of system calls
139 # issued to read from or write to temporary files.
140 fsstats = os.statvfs(tmpdir)
141 return (_TMPDIR := (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096)))
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself. If empty, no header is ever
        attached.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file for which
        ``_is_protected`` returns a false value.
    """

    def __init__(self, token: str):
        self._token = None
        self._path = None
        # Modification time of the token file when it was last read.
        self._mtime: float = -1.0

        if token:
            self._token = token
            if os.path.isfile(token):
                # The argument designates a file: the actual token is the
                # content of that file.
                self._path = os.path.abspath(token)
                if not _is_protected(self._path):
                    raise PermissionError(
                        f"Bearer token file at {self._path} must be protected for access only by its owner"
                    )
                self._refresh()

    def _refresh(self) -> None:
        """Reload the token from its file, if there is one and it has been
        modified since it was last read.
        """
        if not self._path:
            return

        current_mtime = os.stat(self._path).st_mtime
        if current_mtime <= self._mtime:
            # The file has not changed since we last read it.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = current_mtime
        with open(self._path) as token_file:
            self._token = token_file.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        # Without a token the request is returned untouched.
        if not self._token:
            return req

        self._refresh()
        req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # Maps the root URI of an endpoint to the session used to reach it.
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ResourcePath, persist: bool = True) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            If `True`, make the network connection with the front end server
            of the endpoint persistent. The adapter mounted for every other
            host is configured with ``pool_maxsize=0``, with the intent of
            not persisting connections to back end servers. This argument
            is only taken into account when the session for the endpoint is
            first created.

        Returns
        -------
        session : `requests.Session`
            The session cached for the root URI of ``rpath``.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        key = str(rpath.root_uri())
        session = self._sessions.get(key)
        if session is None:
            # No session yet for this endpoint: create one and remember it.
            session = self._sessions[key] = self._make_session(rpath, persist)
        return session

    def _make_session(self, rpath: ResourcePath, persist: bool) -> requests.Session:
        """Make a new session configured from values from the environment.

        Raises
        ------
        PermissionError
            Raised if the client private key file is not protected.
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is set.
        """
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s (persist connection=%s)...", root_uri, persist)

        retries = Retry(
            # Total number of retries to allow. Takes precedence over other
            # counts.
            total=3,
            # How many connection-related errors to retry on.
            connect=3,
            # How many times to retry on read errors.
            read=3,
            # Backoff factor to apply between attempts after the second try
            # (seconds). The random component differs for each session.
            backoff_factor=5.0 + random.random(),
            # How many times to retry on bad status codes
            status=3,
            # HTTP status codes that we should force a retry on
            status_forcelist=[
                requests.codes.too_many_requests,  # 429
                requests.codes.internal_server_error,  # 500
                requests.codes.bad_gateway,  # 502
                requests.codes.service_unavailable,  # 503
                requests.codes.gateway_timeout,  # 504
            ],
        )

        # Persist a single connection to the front end server, if required
        front_end_adapter = HTTPAdapter(
            pool_connections=1,
            pool_maxsize=1 if persist else 0,
            pool_block=False,
            max_retries=retries,
        )
        session.mount(root_uri, front_end_adapter)

        # Prevent persisting connections to back-end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients
        back_end_adapter = HTTPAdapter(
            pool_connections=1, pool_maxsize=0, pool_block=False, max_retries=retries
        )
        session.mount(f"{rpath.scheme}://", back_end_adapter)

        # If the remote endpoint does not use secure HTTP we neither include
        # bearer tokens in the requests nor authenticate the remote server.
        if rpath.scheme != "https":
            return session

        # Verify the server certificate, against a specific CA bundle if one
        # is configured.
        session.verify = os.getenv("LSST_HTTP_CACERT_BUNDLE") or True

        # Bearer tokens take precedence over client certificates.
        token = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if token:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Client certificate authentication requires both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")

        if client_cert and not client_key:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key and not client_cert:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
346class HttpResourcePath(ResourcePath):
347 """General HTTP(S) resource.
349 Notes
350 -----
351 In order to configure the behavior of the object, one environment variable
352 is inspected:
354 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
355 "Expect: 100-Continue" header will be added to all HTTP PUT requests.
356 This header is required by some servers to detect if the client
357 knows how to handle redirections. In case of redirection, the body
358 of the PUT request is sent to the redirected location and not to
359 the front end server.
360 """
362 _is_webdav: Optional[bool] = None
363 _sessions_store = SessionStore()
364 _put_sessions_store = SessionStore()
366 # Use a session exclusively for PUT requests and another session for
367 # all other requests. PUT requests may be redirected and in that case
368 # the server may close the persisted connection. If that is the case
369 # only the connection persisted for PUT requests will be closed and
370 # the other persisted connection will be kept alive and reused for
371 # other requests.
@property
def session(self) -> requests.Session:
    """Client session to address remote resource for all HTTP methods but
    PUT.

    The session is obtained from the class-level ``_sessions_store`` on
    first access and then remembered on the instance as ``_session``.
    """
    cached = getattr(self, "_session", None)
    if cached is None:
        cached = self._session = self._sessions_store.get(self)
    return cached
@property
def put_session(self) -> requests.Session:
    """Client session for uploading data to the remote resource.

    The session is obtained from the class-level ``_put_sessions_store`` on
    first access and then remembered on the instance as ``_put_session``.
    """
    cached = getattr(self, "_put_session", None)
    if cached is None:
        cached = self._put_session = self._put_sessions_store.get(self)
    return cached
@property
def is_webdav_endpoint(self) -> bool:
    """Check if the current endpoint implements WebDAV features.

    The answer is stored on this object once known. The underlying check
    is performed against the root URI, so it is shared by all the
    resources under the same root.
    """
    if self._is_webdav is None:
        self._is_webdav = _is_webdav_endpoint(self.root_uri())
    return self._is_webdav
def exists(self) -> bool:
    """Check that a remote HTTP resource exists.

    Returns
    -------
    exists : `bool`
        For a plain HTTP server, `True` if a HEAD request (following
        redirects) is answered with status 200. For a webDAV server, the
        ``exists`` flag of the first entry of the PROPFIND response, or
        `False` if the response status is not 207.
    """
    url = self.geturl()
    log.debug("Checking if resource exists: %s", url)

    if self.is_webdav_endpoint:
        # The remote endpoint is a webDAV server: send a PROPFIND request
        # to determine if the resource exists.
        resp = self._propfind()
        if resp.status_code != requests.codes.multi_status:
            # 404 Not Found
            return False
        return _parse_propfind_response_body(resp.text)[0].exists

    # The remote is a plain HTTP server. Let's attempt a HEAD request, even
    # if the behavior for such a request against a directory is not
    # specified, so it depends on the server implementation.
    resp = self.session.head(url, timeout=TIMEOUT, allow_redirects=True, stream=True)
    return resp.status_code == requests.codes.ok  # 200
def size(self) -> int:
    """Return the size of the remote resource in bytes.

    Returns
    -------
    size : `int`
        Always 0 for a directory-like path. Otherwise the value of the
        'Content-Length' header of a HEAD response (plain HTTP server) or
        the size reported by a PROPFIND response (webDAV server).

    Raises
    ------
    FileNotFoundError
        Raised if the server reports that the resource does not exist.
    IsADirectoryError
        Raised if a webDAV server reports the resource as a directory.
    ValueError
        Raised if the response to the HEAD request has an unexpected status
        or lacks a 'Content-Length' header.
    """
    if self.dirLike:
        return 0

    if self.is_webdav_endpoint:
        # The remote is a webDAV server: send a PROPFIND request to
        # retrieve the size of the resource. Sizes are only meaningful for
        # files.
        resp = self._propfind()
        if resp.status_code != requests.codes.multi_status:
            # 404 Not Found
            raise FileNotFoundError(
                f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
            )

        prop = _parse_propfind_response_body(resp.text)[0]
        if prop.is_file:
            return prop.size
        if prop.is_directory:
            raise IsADirectoryError(
                f"Resource {self} is reported by server as a directory but has a file path"
            )
        raise FileNotFoundError(f"Resource {self} does not exist")

    # The remote is a plain HTTP server. Send a HEAD request to retrieve
    # the size of the resource.
    resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True, stream=True)
    if resp.status_code == requests.codes.not_found:
        raise FileNotFoundError(f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}")

    if resp.status_code != requests.codes.ok:  # anything but 200
        raise ValueError(
            f"Unexpected response for HEAD request for {self}, status: {resp.status_code} "
            f"{resp.reason}"
        )

    if "Content-Length" not in resp.headers:
        raise ValueError(f"Response to HEAD request to {self} does not contain 'Content-Length' header")

    return int(resp.headers["Content-Length"])
def mkdir(self) -> None:
    """Create the directory resource if it does not already exist.

    Missing ancestors are created first, by recursing on the parent.

    Raises
    ------
    NotImplementedError
        Raised if the remote endpoint is not a webDAV server.
    NotADirectoryError
        Raised if this path is not directory-like or if a file already
        exists at this path.
    """
    # Creating directories is only available on WebDAV backends.
    if not self.is_webdav_endpoint:
        raise NotImplementedError(f"Creation of directory {self} is not implemented by plain HTTP servers")

    if not self.dirLike:
        raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")

    # Nothing to do if the target directory already exists.
    resp = self._propfind()
    if resp.status_code == requests.codes.multi_status:  # 207
        prop = _parse_propfind_response_body(resp.text)[0]
        if prop.exists and prop.is_directory:
            return
        if prop.exists:
            # A file exists at this path
            raise NotADirectoryError(
                f"Can not create a directory for {self} because a file already exists at that path"
            )

    # Target directory does not exist. Create it and its ancestors as
    # needed. Only recurse when the parent URL differs from our own URL,
    # otherwise we would recurse forever where self == parent.
    parent = self.parent()
    if self.geturl() != parent.geturl():
        parent.mkdir()

    log.debug("Creating new directory: %s", self.geturl())
    self._mkcol()
def remove(self) -> None:
    """Remove the resource.

    This delegates to ``_delete``, which issues the DELETE request and
    raises if the request does not succeed.
    """
    self._delete()
def read(self, size: int = -1) -> bytes:
    """Open the resource and return the contents in bytes.

    Parameters
    ----------
    size : `int`, optional
        The number of bytes to read. Negative or omitted indicates
        that all data should be read.

    Returns
    -------
    data : `bytes`
        The whole content of the resource when ``size`` is not positive,
        otherwise the first chunk of at most ``size`` bytes. Empty if the
        resource has no content.

    Raises
    ------
    FileNotFoundError
        Raised if the response status is different from 200.
    """
    log.debug("Reading from remote resource: %s", self.geturl())

    # Only stream the response when a partial read was requested, so that
    # we don't download more than necessary.
    stream = size > 0
    with time_this(log, msg="GET %s", args=(self,)):
        resp = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)

    try:
        if resp.status_code != requests.codes.ok:  # 200
            raise FileNotFoundError(
                f"Unable to read resource {self}; status: {resp.status_code} {resp.reason}"
            )
        if not stream:
            return resp.content

        # An empty body yields no chunk at all: return empty bytes instead
        # of letting StopIteration escape.
        return next(resp.iter_content(chunk_size=size), b"")
    finally:
        # A streamed response which is not fully consumed keeps its
        # connection checked out until it is explicitly closed.
        resp.close()
def write(self, data: bytes, overwrite: bool = True) -> None:
    """Write the supplied bytes to the new resource.

    Parameters
    ----------
    data : `bytes`
        The bytes to write to the resource. The entire contents of the
        resource will be replaced.
    overwrite : `bool`, optional
        If `True` the resource will be overwritten if it exists. Otherwise
        the write will fail.

    Raises
    ------
    FileExistsError
        Raised if the resource exists and ``overwrite`` is `False`.
    """
    url = self.geturl()
    log.debug("Writing to remote resource: %s", url)

    # The existence check is only performed when overwriting is disabled.
    if not overwrite and self.exists():
        raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")

    # Make sure there is a parent directory to upload into.
    self.parent().mkdir()

    # Upload the data.
    log.debug("Writing data to remote resource: %s", url)
    self._put(data=data)
def transfer_from(
    self,
    src: ResourcePath,
    transfer: str = "copy",
    overwrite: bool = False,
    transaction: Optional[TransactionProtocol] = None,
) -> None:
    """Transfer the contents of another resource to this resource.

    Parameters
    ----------
    src : `ResourcePath`
        Source URI.
    transfer : `str`
        Mode to use for transferring the resource. It must be one of
        ``self.transferModes``. "auto" is replaced by
        ``self.transferDefault`` and "move" additionally removes the
        source once transferred.
    overwrite : `bool`, optional
        If `False`, fail when this resource already exists.
    transaction : `~lsst.resources.utils.TransactionProtocol`, optional
        Currently unused.

    Raises
    ------
    ValueError
        Raised if the transfer mode is not supported.
    FileExistsError
        Raised if this resource exists and ``overwrite`` is `False`.
    """
    # Fail early to prevent delays if remote resources are requested.
    if transfer not in self.transferModes:
        raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

    # Existence checks cost time so only perform them for the benefit of
    # the log message when debugging is enabled.
    if log.isEnabledFor(logging.DEBUG):
        log.debug(
            "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
            src,
            src.exists(),
            self,
            self.exists(),
            transfer,
        )

    # Nothing to do if source and destination are the same URI.
    if self == src:
        log.debug(
            "Target and destination URIs are identical: %s, returning immediately."
            " No further action required.",
            self,
        )
        return

    if not overwrite and self.exists():
        raise FileExistsError(f"Destination path {self} already exists.")

    if transfer == "auto":
        transfer = self.transferDefault

    # We can use webDAV 'COPY' or 'MOVE' if both the current and source
    # resources are located in the same server.
    same_webdav_server = (
        isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint
    )
    if same_webdav_server:
        log.debug("Transfer from %s to %s directly", src, self)
        if transfer == "move":
            return self._move(src)
        return self._copy(src)

    # For resources of different classes or for plain HTTP resources we can
    # perform the copy or move operation by downloading to a local file
    # and uploading to the destination.
    self._copy_via_local(src)

    # This was an explicit move, try to remove the source.
    if transfer == "move":
        src.remove()
def walk(
    self, file_filter: Optional[Union[str, re.Pattern]] = None
) -> Iterator[Union[List, Tuple[ResourcePath, List[str], List[str]]]]:
    """Walk the directory tree returning matching files and directories.

    Parameters
    ----------
    file_filter : `str` or `re.Pattern`, optional
        Regex to filter out files from the list before it is returned.

    Yields
    ------
    dirpath : `ResourcePath`
        Current directory being examined.
    dirnames : `list` of `str`
        Names of subdirectories within dirpath.
    filenames : `list` of `str`
        Names of all the files within dirpath.

    Raises
    ------
    ValueError
        Raised if this path is not directory-like.
    NotImplementedError
        Raised if the remote endpoint is not a webDAV server.
    """
    if not self.dirLike:
        raise ValueError("Can not walk a non-directory URI")

    # Walking directories is only available on WebDAV backends.
    if not self.is_webdav_endpoint:
        raise NotImplementedError(f"Walking directory {self} is not implemented by plain HTTP servers")

    if isinstance(file_filter, str):
        file_filter = re.compile(file_filter)

    resp = self._propfind(depth="1")
    if resp.status_code != requests.codes.multi_status:  # anything but 207
        return

    files: List[str] = []
    dirs: List[str] = []
    for prop in _parse_propfind_response_body(resp.text):
        if prop.is_file:
            files.append(prop.name)
        elif not self.path.endswith(prop.href):
            # Only include the names of sub-directories not the
            # directory being walked.
            dirs.append(prop.name)

    if file_filter is not None:
        files = [name for name in files if file_filter.search(name)]

    # Nothing to report for this directory nor below it.
    if not (dirs or files):
        return

    yield type(self)(self, forceAbsolute=False, forceDirectory=True), dirs, files

    # Descend into each of the sub-directories.
    for subdir in dirs:
        yield from self.join(subdir, forceDirectory=True).walk(file_filter)
def _as_local(self) -> Tuple[str, bool]:
    """Download object over HTTP and place in temporary directory.

    Returns
    -------
    path : `str`
        Path to local temporary file.
    temporary : `bool`
        Always returns `True`. This is always a temporary file.

    Raises
    ------
    FileNotFoundError
        Raised if the response status is different from 200.

    Notes
    -----
    The temporary file is not deleted by this method when the download
    succeeds: the caller is responsible for removing it. If the download
    fails, the partially written file is removed before the exception is
    propagated.
    """
    resp = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
    try:
        if resp.status_code != requests.codes.ok:
            raise FileNotFoundError(
                f"Unable to download resource {self}; status: {resp.status_code} {resp.reason}"
            )

        tmpdir, buffering = _get_temp_dir()
        tmp_name: Optional[str] = None
        try:
            with tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            ) as tmpFile:
                tmp_name = tmpFile.name
                with time_this(
                    log,
                    msg="GET %s [length=%s] to local file %s [chunk_size=%d]",
                    args=(self, resp.headers.get("Content-Length"), tmpFile.name, buffering),
                    mem_usage=True,
                    mem_unit=u.mebibyte,
                ):
                    for chunk in resp.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
        except Exception:
            # Don't leave a partial download behind: the file was created
            # with delete=False so nobody else would clean it up.
            if tmp_name is not None:
                with contextlib.suppress(OSError):
                    os.remove(tmp_name)
            raise

        return tmp_name, True
    finally:
        # The response is streamed: close it so that its connection is
        # released even if the body was not (entirely) consumed.
        resp.close()
def _send_webdav_request(
    self,
    method: str,
    url: Optional[str] = None,
    headers: Optional[dict[str, str]] = None,
    body: Optional[str] = None,
) -> requests.Response:
    """Send a webDAV request and correctly handle redirects.

    Parameters
    ----------
    method : `str`
        The method of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
    url : `str`, optional
        The URL to send the request to. Defaults to the URL of this
        resource.
    headers : `dict`, optional
        A dictionary of key-value pairs (both strings) to include as
        headers in the request. Defaults to no extra headers.
    body : `str`, optional
        The body of the request.

    Returns
    -------
    response : `requests.Response`
        The first response which is not a redirection.

    Raises
    ------
    ValueError
        Raised if no final response is obtained after following the
        maximum number of redirections.

    Notes
    -----
    This way of sending webDAV requests is necessary for handling
    redirection ourselves, since the 'requests' package changes the method
    of the redirected request when the server responds with status 302 and
    the method of the original request is not HEAD (which is the case for
    webDAV requests).

    That means that when the webDAV server we interact with responds with
    a redirection to a PROPFIND or MKCOL request, the request gets
    converted to a GET request when sent to the redirected location.

    See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
        https://github.com/psf/requests/blob/main/requests/sessions.py

    This behavior of the 'requests' package is meant to be compatible with
    what is specified in RFC 9110:

        https://www.rfc-editor.org/rfc/rfc9110#name-302-found

    For our purposes, we do need to follow the redirection and send a new
    request using the same HTTP verb.
    """
    if url is None:
        url = self.geturl()
    if headers is None:
        # Use a fresh dictionary per call rather than a shared mutable
        # default argument.
        headers = {}

    max_redirects = 5
    with time_this(
        log,
        msg="%s %s",
        args=(
            method,
            url,
        ),
        mem_usage=True,
        mem_unit=u.mebibyte,
    ):
        for _ in range(max_redirects):
            resp = self.session.request(
                method,
                url,
                data=body,
                headers=headers,
                stream=True,
                timeout=TIMEOUT,
                allow_redirects=False,
            )
            if not resp.is_redirect:
                return resp

            # Follow the redirection with the same method. The redirect
            # response is streamed and we have no use for its body: close
            # it so that its connection is released.
            url = resp.headers["Location"]
            resp.close()

    # We reached the maximum allowed number of redirects. Stop trying.
    raise ValueError(
        f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
    )
def _propfind(self, body: Optional[str] = None, depth: str = "0") -> requests.Response:
    """Send a PROPFIND webDAV request and return the response.

    Parameters
    ----------
    body : `str`, optional
        The body of the PROPFIND request to send to the server. If
        provided, it is expected to be a XML document. By default, the
        properties 'resourcetype', 'getcontentlength', 'getlastmodified'
        and 'displayname' are requested.
    depth : `str`, optional
        The value of the 'Depth' header to include in the request.

    Returns
    -------
    response : `requests.Response`
        Response to the PROPFIND request.

    Raises
    ------
    ValueError
        Raised if the status code of the response is different from
        "207 Multistatus" and "404 Not Found".
    """
    if body is None:
        # Request only the DAV live properties we are explicitly interested
        # in.
        body = (
            '<?xml version="1.0" encoding="utf-8" ?>'
            '<D:propfind xmlns:D="DAV:"><D:prop>'
            "<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"
            "</D:prop></D:propfind>"
        )

    request_headers = {
        "Depth": depth,
        "Content-Type": 'application/xml; charset="utf-8"',
        "Content-Length": str(len(body)),
    }
    resp = self._send_webdav_request("PROPFIND", headers=request_headers, body=body)

    acceptable = (requests.codes.multi_status, requests.codes.not_found)
    if resp.status_code not in acceptable:
        raise ValueError(
            f"Unexpected response for PROPFIND request for {self}, status: {resp.status_code} "
            f"{resp.reason}"
        )
    return resp
820 def _options(self) -> requests.Response:
821 """Send a OPTIONS webDAV request for this resource."""
823 return self._send_webdav_request("OPTIONS")
825 def _head(self) -> requests.Response:
826 """Send a HEAD webDAV request for this resource."""
828 return self._send_webdav_request("HEAD")
def _mkcol(self) -> None:
    """Send a MKCOL webDAV request to create a collection. The collection
    may already exist.

    Raises
    ------
    ValueError
        Raised if the response status is neither "201 Created" nor
        "405 Method Not Allowed".
    """
    resp = self._send_webdav_request("MKCOL")
    if resp.status_code == requests.codes.created:  # 201
        return

    if resp.status_code != requests.codes.method_not_allowed:  # not 405
        raise ValueError(f"Can not create directory {self}, status: {resp.status_code} {resp.reason}")

    # Status 405 is taken to mean that the remote directory already exists.
    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
def _delete(self) -> None:
    """Send a DELETE webDAV request for this resource.

    Raises
    ------
    NotImplementedError
        Raised if this is a directory-like path and the remote endpoint is
        not a webDAV server.
    FileNotFoundError
        Raised if the response status is "404 Not Found".
    ValueError
        Raised if the response status is any other status than 200, 202
        or 204.
    """
    log.debug("Deleting %s ...", self.geturl())

    # If this is a directory, ensure the remote is a webDAV server because
    # plain HTTP servers don't support DELETE requests on non-file
    # paths.
    if self.dirLike and not self.is_webdav_endpoint:
        raise NotImplementedError(f"Deletion of directory {self} is not implemented by plain HTTP servers")

    resp = self._send_webdav_request("DELETE")
    success_codes = (requests.codes.ok, requests.codes.accepted, requests.codes.no_content)
    if resp.status_code in success_codes:
        return

    if resp.status_code == requests.codes.not_found:
        raise FileNotFoundError(f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}")

    # TODO: the response to a DELETE request against a webDAV server
    # may be multistatus. If so, we need to parse the response body to
    # determine more precisely the reason of the failure (e.g. a lock)
    # and provide a more helpful error message.
    raise ValueError(f"Unable to delete resource {self}; status: {resp.status_code} {resp.reason}")
def _copy_via_local(self, src: ResourcePath) -> None:
    """Replace the contents of this resource with the contents of another
    resource by going through a local file.

    Parameters
    ----------
    src : `ResourcePath`
        The source of the contents to copy to `self`.
    """
    # Obtain a local view of the source and upload it from an open binary
    # file handle.
    with src.as_local() as local_copy:
        log.debug("Transfer from %s to %s via local file %s", src, self, local_copy)
        with open(local_copy.ospath, "rb") as local_file:
            self._put(data=local_file)
def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
    """Send a COPY or MOVE webDAV request to copy or replace the contents
    of this resource with the contents of another resource located in the
    same server.

    Parameters
    ----------
    method : `str`
        The method to perform. Valid values are "COPY" or "MOVE" (in
        uppercase).

    src : `HttpResourcePath`
        The source of the contents to move to `self`.

    Raises
    ------
    ValueError
        Raised if the response status is neither "201 Created" nor
        "204 No Content".
    """
    # The request is addressed to the source; this resource is designated
    # as the destination via the 'Destination' header.
    headers = {"Destination": self.geturl()}
    resp = self._send_webdav_request(method, url=src.geturl(), headers=headers)
    if resp.status_code in (requests.codes.created, requests.codes.no_content):
        return

    if resp.status_code != requests.codes.multi_status:
        raise ValueError(
            f"{method} operation from {src} to {self} failed, status: {resp.status_code} {resp.reason}"
        )

    # Multistatus response: extract the status and error reported for the
    # first response element to build the error message.
    tree = eTree.fromstring(resp.content)
    status_element = tree.find("./{DAV:}response/{DAV:}status")
    status = status_element.text if status_element is not None else "unknown"
    error_element = tree.find("./{DAV:}response/{DAV:}error")
    if error_element is not None:
        # Serialize the element: interpolating the Element object itself
        # would only show its repr, not what the server reported.
        error = eTree.tostring(error_element, encoding="unicode").strip()
    else:
        error = "unknown"
    raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
def _copy(self, src: HttpResourcePath) -> None:
    """Replace the contents of this resource (if any) with the contents of
    another resource located in the same server.

    Parameters
    ----------
    src : `HttpResourcePath`
        The source of the contents to copy to `self`.
    """
    # The webDAV COPY method, as documented in
    # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
    # is currently implemented by neither dCache nor XrootD
    # (see issues DM-37603 and DM-37651 for details).
    #
    # Until then the copy is performed client side, through a temporary
    # local file.
    # TODO: when those 2 issues are solved, remove the flag below and the
    # client-side branch so that only the COPY request remains.
    server_side_copy_unavailable = True
    if not server_side_copy_unavailable:
        return self._copy_or_move("COPY", src)

    return self._copy_via_local(src)
def _move(self, src: HttpResourcePath) -> None:
    """Replace the contents of this resource with the contents of another
    resource located in the same server, by sending a MOVE webDAV request.

    Parameters
    ----------
    src : `HttpResourcePath`
        The source of the contents to move to `self`.
    """
    webdav_method = "MOVE"
    return self._copy_or_move(webdav_method, src)
def _put(self, data: Union[BinaryIO, bytes]) -> None:
    """Upload data to this resource via HTTP PUT, following at most one
    server redirection.

    Parameters
    ----------
    data : `Union[BinaryIO, bytes]`
        The data to be included in the body of the PUT request.

    Raises
    ------
    ValueError
        Raised if the response status of the upload is not one of
        200 (OK), 201 (Created) or 204 (No Content).
    """
    # First send a PUT request without content to learn the final URL for
    # this upload. Redirections are not followed automatically: if the
    # server redirects, the target is read from the "Location" header.
    probe_headers = {"Content-Length": "0"}
    if _SEND_EXPECT_HEADER_ON_PUT:
        probe_headers["Expect"] = "100-continue"

    url = self.geturl()

    log.debug("Sending empty PUT request to %s", url)
    with time_this(log, msg="PUT (no data) %s", args=(url,), mem_usage=True, mem_unit=u.mebibyte):
        probe_resp = self.session.request(
            "PUT",
            url,
            data=None,
            headers=probe_headers,
            stream=True,
            timeout=TIMEOUT,
            allow_redirects=False,
        )
        if probe_resp.is_redirect:
            url = probe_resp.headers["Location"]

    # Then send the actual data to the final destination, using the
    # session dedicated to PUT requests.
    accepted_codes = (requests.codes.ok, requests.codes.created, requests.codes.no_content)
    log.debug("Uploading data to %s", url)
    with time_this(log, msg="PUT %s", args=(url,), mem_usage=True, mem_unit=u.mebibyte):
        upload_resp = self.put_session.put(
            url, data=data, stream=True, timeout=TIMEOUT, allow_redirects=False
        )
        if upload_resp.status_code in accepted_codes:
            return
        raise ValueError(
            f"Can not write file {self}, status: {upload_resp.status_code} {upload_resp.reason}"
        )
@contextlib.contextmanager
def _openImpl(
    self,
    mode: str = "r",
    *,
    encoding: Optional[str] = None,
) -> Iterator[ResourceHandleProtocol]:
    """Open this resource and yield a handle to it.

    Parameters
    ----------
    mode : `str`, optional
        Open mode. Modes "r" and "rb" are served by a
        `HttpReadResourceHandle` when the server accepts byte-range
        requests; anything else is delegated to the parent class.
    encoding : `str`, optional
        Text encoding, used for text modes.

    Yields
    ------
    handle : `ResourceHandleProtocol`
        The handle to the open resource.
    """
    # Ask the server whether it supports byte-range requests for this
    # resource.
    head_resp = self._head()
    range_capable = (
        head_resp.status_code == requests.codes.ok and head_resp.headers.get("Accept-Ranges") == "bytes"
    )

    if not range_capable or mode not in ("rb", "r"):
        # Rely on the implementation of the parent class.
        with super()._openImpl(mode, encoding=encoding) as http_handle:
            yield http_handle
        return

    reader: ResourceHandleProtocol = HttpReadResourceHandle(
        mode, log, url=self.geturl(), session=self.session, timeout=TIMEOUT
    )
    if mode == "rb":
        yield reader
    else:
        # Text mode: decode on top of the binary handle. The cast is needed
        # because the protocol is compatible, but does not have BytesIO in
        # the inheritance tree.
        yield io.TextIOWrapper(cast(io.BytesIO, reader), encoding=encoding)
def _dump_response(resp: requests.Response) -> None:
    """Write to the debug log a HTTP or webDAV request and the response
    received for it.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log. The request is retrieved from it.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    if request.method == "PUT":
        # The body of an upload is never logged.
        log.debug(" body=<data>")
    elif request.body is not None:
        # Only the beginning of the request body is logged.
        log.debug(" body=%r", request.body[:120])
    else:
        log.debug(" body=<empty>")

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    payload = resp.content
    if not payload:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        # Plain text bodies are logged in full.
        log.debug(" body=%r", payload)
    else:
        log.debug(" body=%r", payload[:80])
def _is_protected(filepath: str) -> bool:
    """Tell whether a local file can be accessed by its owner and by
    nobody else.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.

    Returns
    -------
    protected : `bool`
        `True` if ``filepath`` is a regular file for which at least one
        owner permission bit is set and no group or other permission bit
        is set. `False` otherwise, including when ``filepath`` is not a
        regular file.
    """
    if not os.path.isfile(filepath):
        return False

    permissions = stat.S_IMODE(os.stat(filepath).st_mode)

    # Any permission granted to the group or to others disqualifies the file.
    if permissions & (stat.S_IRWXG | stat.S_IRWXO):
        return False

    return (permissions & stat.S_IRWXU) != 0
def _parse_propfind_response_body(body: str) -> List[DavProperty]:
    """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
    request.

    Parameters
    ----------
    body : `str`
        XML-encoded response body to a PROPFIND request

    Returns
    -------
    responses : `List[DavProperty]`
        One `DavProperty` for each 'response' element found in ``body``,
        in document order.

    Raises
    ------
    ValueError
        Raised if ``body`` contains no 'response' element.

    Notes
    -----
    It is expected that there is at least one response in `body`, otherwise
    this function raises.
    """
    # A response body to a PROPFIND request is of the form (indented for
    # readability):
    #
    # <?xml version="1.0" encoding="UTF-8"?>
    # <D:multistatus xmlns:D="DAV:">
    #     <D:response>
    #         <D:href>path/to/resource</D:href>
    #         <D:propstat>
    #             <D:prop>
    #                 <D:resourcetype>
    #                     <D:collection xmlns:D="DAV:"/>
    #                 </D:resourcetype>
    #                 <D:getlastmodified>
    #                     Fri, 27 Jan 2023 13:59:01 GMT
    #                 </D:getlastmodified>
    #                 <D:getcontentlength>
    #                     12345
    #                 </D:getcontentlength>
    #             </D:prop>
    #             <D:status>
    #                 HTTP/1.1 200 OK
    #             </D:status>
    #         </D:propstat>
    #     </D:response>
    #     <D:response>
    #         ...
    #     </D:response>
    #     <D:response>
    #         ...
    #     </D:response>
    # </D:multistatus>

    # Scan all the 'response' elements and extract the relevant properties
    multistatus = eTree.fromstring(body.strip())
    responses = [DavProperty(response) for response in multistatus.findall("./{DAV:}response")]
    if not responses:
        # Report the body itself: there is no 'response' element to show.
        raise ValueError(f"Unable to parse response for PROPFIND request: {body}")

    return responses
class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.

    Parameters
    ----------
    response : `xml.etree.ElementTree.Element` or `None`
        The 'response' element to extract the properties from. If `None`,
        all the properties keep their default values, for which `exists`
        is `False`.
    """

    # Regular expression to compare against the 'status' element of a
    # PROPFIND response's 'propstat' element.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: Optional[eTree.Element]):
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        # A negative length means that no content length is known.
        self._getcontentlength: int = -1

        if response is not None:
            self._parse(response)

    @staticmethod
    def _text_of(parent: eTree.Element, path: str) -> Optional[str]:
        """Return the text of the first element matching ``path`` under
        ``parent``, or `None` if there is no such element or it is empty.
        """
        element = parent.find(path)
        return None if element is None else element.text

    def _parse(self, response: eTree.Element) -> None:
        """Populate the properties from a 'response' element.

        Elements which are missing or empty leave the corresponding
        property unchanged.
        """
        # Extract 'href'
        if (href := self._text_of(response, "./{DAV:}href")) is not None:
            self._href = href.strip()

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties of interest with status OK.
            status = self._text_of(propstat, "./{DAV:}status")
            if status is None or not self._status_ok_rex.match(status):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # Parse "collection".
                if prop.find("./{DAV:}resourcetype/{DAV:}collection") is not None:
                    self._collection = True

                # Parse "getlastmodified".
                if (text := self._text_of(prop, "./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = text

                # Parse "getcontentlength". An empty or blank element is
                # handled as if the element was missing.
                text = self._text_of(prop, "./{DAV:}getcontentlength")
                if text is not None and text.strip():
                    self._getcontentlength = int(text)

                # Parse "displayname".
                if (text := self._text_of(prop, "./{DAV:}displayname")) is not None:
                    self._displayname = text

    @property
    def exists(self) -> bool:
        # It is either a directory or a file with length of at least zero
        return self._collection or self._getcontentlength >= 0

    @property
    def is_directory(self) -> bool:
        return self._collection

    @property
    def is_file(self) -> bool:
        return self._getcontentlength >= 0

    @property
    def size(self) -> int:
        # Only valid if is_file is True
        return self._getcontentlength

    @property
    def name(self) -> str:
        return self._displayname

    @property
    def href(self) -> str:
        return self._href