Coverage for python/lsst/resources/http.py: 14%
381 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-22 11:09 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-22 11:09 +0000
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import os
21import os.path
22import random
23import re
24import stat
25import tempfile
26import xml.etree.ElementTree as eTree
27from typing import TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast
29import requests
30from lsst.utils.timer import time_this
31from requests.adapters import HTTPAdapter
32from requests.auth import AuthBase
33from urllib3.util.retry import Retry
35from ._resourceHandles import ResourceHandleProtocol
36from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
37from ._resourcePath import ResourcePath
39if TYPE_CHECKING: 39 ↛ 40line 39 didn't jump to line 40, because the condition on line 39 was never true
40 from .utils import TransactionProtocol
log = logging.getLogger(__name__)


# Default timeouts for all HTTP requests, in seconds.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300

# Pair of (connect, read) timeouts handed to every request issued by this
# module. Each value can be overridden through the environment.
TIMEOUT = (
    int(os.getenv("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)),
    int(os.getenv("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)),
)

# Should we send a "Expect: 100-continue" header on PUT requests?
# Some servers (e.g. dCache) take that header as an indication that the
# client knows how to handle redirects to the specific server that will
# actually receive the data for PUT requests. The mere presence of the
# variable in the environment enables it; its value is not inspected.
_SEND_EXPECT_HEADER_ON_PUT = os.getenv("LSST_HTTP_PUT_SEND_EXPECT_HEADER") is not None
@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.

    Raises
    ------
    requests.exceptions.SSLError
        Raised, after logging a hint about LSST_HTTP_CACERT_BUNDLE, if the
        TLS exchange with the server fails.

    Notes
    -----
    Results are memoized per ``path`` by `functools.lru_cache`; a call that
    raises is not cached.
    """
    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True
    try:
        # Always bound the probe with the module-wide timeouts: without
        # them an unresponsive server would block the caller forever.
        resp = requests.options(str(path), verify=verify, stream=True, timeout=TIMEOUT)
    except requests.exceptions.SSLError:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE can be used to "
            "specify a bundle of certificate authorities you trust which are "
            "not included in the default set of trusted authorities of your "
            "system."
        )
        raise

    # Check that "1" is part of the value of the "DAV" header. We don't
    # use locks, so a server complying to class 1 is enough for our
    # purposes. All webDAV servers must advertise at least compliance
    # class "1".
    #
    # Compliance classes are documented in
    #    http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
    #
    # Examples of values for header DAV are:
    #   DAV: 1, 2
    #   DAV: 1, <http://apache.org/dav/propset/fs/1>
    if "DAV" not in resp.headers:
        return False

    # Convert to str to keep mypy happy
    compliance_class = str(resp.headers.get("DAV"))
    return "1" in compliance_class.replace(" ", "").split(",")
# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system.
# Populated lazily by _get_temp_dir().
_TMPDIR: Optional[Tuple[str, int]] = None


def _get_temp_dir() -> Tuple[str, int]:
    """Return the temporary directory path and block size.

    Returns
    -------
    path : `str`
        Directory in which temporary files are created: the first of the
        environment variables 'LSST_RESOURCES_TMPDIR' and 'TMPDIR' that is
        set and names an existing directory, otherwise the current working
        directory.
    block_size : `int`
        Buffer size, in bytes, to use when reading from or writing to files
        in that directory.

    Notes
    -----
    This function caches its results in _TMPDIR, so the environment is only
    inspected on the first call.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
    # 'TMPDIR', if defined. Otherwise use current working directory.
    tmpdir = os.getcwd()
    for variable in ("LSST_RESOURCES_TMPDIR", "TMPDIR"):
        candidate = os.getenv(variable)
        if candidate and os.path.isdir(candidate):
            tmpdir = candidate
            break

    # Compute the block size as 256 blocks of typical size
    # (i.e. 4096 bytes) or 10 times the file system block size,
    # whichever is higher. This is a reasonable compromise between
    # using memory for buffering and the number of system calls
    # issued to read from or write to temporary files.
    fsstats = os.statvfs(tmpdir)
    block_size = max(10 * fsstats.f_bsize, 256 * 4096)

    _TMPDIR = (tmpdir, block_size)
    return _TMPDIR
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself. An empty value disables the
        header altogether.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file for which
        ``_is_protected`` returns `False`.
    """

    def __init__(self, token: str):
        # Literal token value (or, until the first refresh, the file path).
        self._token = token if token else None
        # Absolute path of the token file; stays None for literal tokens.
        self._path = None
        # Modification time of the token file when it was last read.
        self._mtime: float = -1.0

        if self._token is None or not os.path.isfile(token):
            return

        token_path = os.path.abspath(token)
        self._path = token_path
        if not _is_protected(token_path):
            raise PermissionError(
                f"Bearer token file at {self._path} must be protected for access only by its owner"
            )
        self._refresh()

    def _refresh(self) -> None:
        """Reload the token from its file when the file has been modified
        since the last time it was read. Does nothing for literal tokens.
        """
        if not self._path:
            return

        modified = os.stat(self._path).st_mtime
        if modified <= self._mtime:
            # The copy we hold is still current.
            return

        log.debug("Reading bearer token file at %s", self._path)
        self._mtime = modified
        with open(self._path) as token_file:
            contents = token_file.read()
        self._token = contents.rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to ``req`` if a token is set."""
        if not self._token:
            return req

        # Pick up a token file that may have been renewed in the meantime.
        self._refresh()
        req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # Maps the root URI of an endpoint to the session created for it.
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ResourcePath, persist: bool = True) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            If `True`, the session is configured to keep one connection to
            the front end server of the endpoint. The adapter used for any
            other server of the same scheme (e.g. back-end servers we get
            redirected to) is configured with ``pool_maxsize=0`` regardless
            of this value. Only honoured when the session is first created.

        Returns
        -------
        session : `requests.Session`
            The session cached for the root URI of ``rpath``.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        key = str(rpath.root_uri())
        session = self._sessions.get(key)
        if session is None:
            # We don't have yet a session for this endpoint: create a new one
            session = self._make_session(rpath, persist)
            self._sessions[key] = session
        return session

    def _make_session(self, rpath: ResourcePath, persist: bool) -> requests.Session:
        """Make a new session configured from values from the environment.

        Raises
        ------
        PermissionError
            Raised if the bearer token file or the client private key file
            is not protected for access only by its owner.
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is set.
        """
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s (persist connection=%s)...", root_uri, persist)

        # HTTP status codes that we should force a retry on
        retry_on_status = [
            requests.codes.too_many_requests,  # 429
            requests.codes.internal_server_error,  # 500
            requests.codes.bad_gateway,  # 502
            requests.codes.service_unavailable,  # 503
            requests.codes.gateway_timeout,  # 504
        ]
        # 'total' takes precedence over the per-category counts. The
        # backoff factor (in seconds) is randomized a little and applies
        # between attempts after the second try.
        retries = Retry(
            total=3,
            connect=3,
            read=3,
            backoff_factor=5.0 + random.random(),
            status=3,
            status_forcelist=retry_on_status,
        )

        # Persist a single connection to the front end server, if required
        front_end_adapter = HTTPAdapter(
            pool_connections=1,
            pool_maxsize=1 if persist else 0,
            pool_block=False,
            max_retries=retries,
        )
        session.mount(root_uri, front_end_adapter)

        # Prevent persisting connections to back-end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients
        back_end_adapter = HTTPAdapter(
            pool_connections=1, pool_maxsize=0, pool_block=False, max_retries=retries
        )
        session.mount(f"{rpath.scheme}://", back_end_adapter)

        # If the remote endpoint doesn't use secure HTTP we don't include
        # bearer tokens in the requests nor need to authenticate the remote
        # server.
        if rpath.scheme != "https":
            return session

        # Use a specific CA cert bundle for authenticating the server when
        # one is configured, the default set of trusted authorities otherwise.
        session.verify = os.getenv("LSST_HTTP_CACERT_BUNDLE") or True

        # Should we use bearer tokens for client authentication?
        token = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if token:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")

        if client_cert and not client_key:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key and not client_cert:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        if not client_cert:
            # Neither of them was provided.
            log.debug(
                "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
                "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
            )
            return session

        # Both the certificate and the private key were provided.
        if not _is_protected(client_key):
            raise PermissionError(
                f"Private key file at {client_key} must be protected for access only by its owner"
            )
        log.debug("... using client certificate authentication.")
        session.cert = (client_cert, client_key)
        return session
345class HttpResourcePath(ResourcePath):
346 """General HTTP(S) resource.
348 Notes
349 -----
350 In order to configure the behavior of the object, one environment variable
351 is inspected:
353 - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
354 "Expect: 100-Continue" header will be added to all HTTP PUT requests.
355 This header is required by some servers to detect if the client
356 knows how to handle redirections. In case of redirection, the body
357 of the PUT request is sent to the redirected location and not to
358 the front end server.
359 """
361 _is_webdav: Optional[bool] = None
362 _sessions_store = SessionStore()
363 _put_sessions_store = SessionStore()
365 # Use a session exclusively for PUT requests and another session for
366 # all other requests. PUT requests may be redirected and in that case
367 # the server may close the persisted connection. If that is the case
368 # only the connection persisted for PUT requests will be closed and
369 # the other persisted connection will be kept alive and reused for
370 # other requests.
372 @property
373 def session(self) -> requests.Session:
374 """Client session to address remote resource for all HTTP methods but
375 PUT.
376 """
377 if hasattr(self, "_session"):
378 return self._session
380 self._session: requests.Session = self._sessions_store.get(self)
381 return self._session
383 @property
384 def put_session(self) -> requests.Session:
385 """Client session for uploading data to the remote resource."""
386 if hasattr(self, "_put_session"):
387 return self._put_session
389 self._put_session: requests.Session = self._put_sessions_store.get(self)
390 return self._put_session
392 @property
393 def is_webdav_endpoint(self) -> bool:
394 """Check if the current endpoint implements WebDAV features.
396 This is stored per URI but cached by root so there is
397 only one check per hostname.
398 """
399 if self._is_webdav is not None:
400 return self._is_webdav
402 self._is_webdav = _is_webdav_endpoint(self.root_uri())
403 return self._is_webdav
405 def exists(self) -> bool:
406 """Check that a remote HTTP resource exists."""
407 log.debug("Checking if resource exists: %s", self.geturl())
408 if not self.is_webdav_endpoint:
409 # The remote is a plain HTTP server. Let's attempt a HEAD
410 # request, even if the behavior for such a request against a
411 # directory is not specified, so it depends on the server
412 # implementation.
413 resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True, stream=True)
414 return resp.status_code == requests.codes.ok # 200
416 # The remote endpoint is a webDAV server: send a PROPFIND request
417 # requesting only the 'getlastmodified' property.
418 request_body = (
419 """<?xml version="1.0" encoding="utf-8" ?>"""
420 """<D:propfind xmlns:D="DAV:"><D:prop><D:getlastmodified/></D:prop></D:propfind>"""
421 )
422 resp = self._propfind(request_body)
423 if resp.status_code == requests.codes.multi_status: # 207
424 # Retrieve the status of the first and only element in the response
425 propfind_resp = _parse_propfind_response_body(resp.text)[0]
426 return propfind_resp.status_code == requests.codes.ok
427 elif resp.status_code == requests.codes.not_found: # 404
428 return False
429 else:
430 raise ValueError(
431 f"Unexpected status received for PROPFIND request for {self}: {resp.status_code}"
432 )
434 def size(self) -> int:
435 """Return the size of the remote resource in bytes."""
436 if self.dirLike:
437 return 0
439 if not self.is_webdav_endpoint:
440 # The remote is a plain HTTP server. Send a HEAD request to
441 # retrieve the size of the resource.
442 resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True, stream=True)
443 if resp.status_code == requests.codes.ok: # 200
444 if "Content-Length" in resp.headers:
445 return int(resp.headers["Content-Length"])
446 else:
447 raise ValueError(
448 f"Response to HEAD request to {self} does not contain 'Content-Length' header"
449 )
450 elif resp.status_code == requests.codes.not_found:
451 raise FileNotFoundError(f"Resource {self} does not exist, status code: {resp.status_code}")
452 else:
453 raise ValueError(
454 f"Unexpected response for HEAD request for {self}, status code: {resp.status_code}"
455 )
457 # The remote is a webDAV server: send a PROPFIND request to retrieve
458 # the 'getcontentlength' property of the resource.
459 request_body = (
460 """<?xml version="1.0" encoding="utf-8" ?>"""
461 """<D:propfind xmlns:D="DAV:"><D:prop><D:getcontentlength/></D:prop></D:propfind>"""
462 )
463 resp = self._propfind(body=request_body)
464 if resp.status_code == requests.codes.multi_status: # 207
465 # Parse the response body and retrieve the 'getcontentlength'
466 # property
467 propfind_resp = _parse_propfind_response_body(resp.text)[0]
468 if propfind_resp.status_code == requests.codes.ok: # 200
469 return propfind_resp.getcontentlength
470 else:
471 raise FileNotFoundError(f"Resource {self} does not exist")
472 elif resp.status_code == requests.codes.not_found:
473 raise FileNotFoundError(f"Resource {self} does not exist, status code: {resp.status_code}")
474 else:
475 raise ValueError(
476 f"Unexpected response for PROPFIND request for {self}, status code: {resp.status_code}"
477 )
479 def mkdir(self) -> None:
480 """Create the directory resource if it does not already exist."""
481 # Creating directories is only available on WebDAV backends.
482 if not self.is_webdav_endpoint:
483 raise NotImplementedError(
484 f"Creation of directory {self} is not implemented by plain HTTP servers"
485 )
487 if not self.dirLike:
488 raise NotADirectoryError(f"Can not create a 'directory' for file-like URI {self}")
490 if not self.exists():
491 # We need to test the absence of the parent directory,
492 # but also if parent URL is different from self URL,
493 # otherwise we could be stuck in a recursive loop
494 # where self == parent.
495 if not self.parent().exists() and self.parent().geturl() != self.geturl():
496 self.parent().mkdir()
498 log.debug("Creating new directory: %s", self.geturl())
499 self._mkcol()
501 def remove(self) -> None:
502 """Remove the resource."""
503 self._delete()
505 def read(self, size: int = -1) -> bytes:
506 """Open the resource and return the contents in bytes.
508 Parameters
509 ----------
510 size : `int`, optional
511 The number of bytes to read. Negative or omitted indicates
512 that all data should be read.
513 """
514 log.debug("Reading from remote resource: %s", self.geturl())
515 stream = True if size > 0 else False
516 with time_this(log, msg="Read from remote resource %s", args=(self,)):
517 resp = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
519 if resp.status_code != requests.codes.ok: # 200
520 raise FileNotFoundError(f"Unable to read resource {self}; status code: {resp.status_code}")
521 if not stream:
522 return resp.content
523 else:
524 return next(resp.iter_content(chunk_size=size))
526 def write(self, data: bytes, overwrite: bool = True) -> None:
527 """Write the supplied bytes to the new resource.
529 Parameters
530 ----------
531 data : `bytes`
532 The bytes to write to the resource. The entire contents of the
533 resource will be replaced.
534 overwrite : `bool`, optional
535 If `True` the resource will be overwritten if it exists. Otherwise
536 the write will fail.
537 """
538 log.debug("Writing to remote resource: %s", self.geturl())
539 if not overwrite:
540 if self.exists():
541 raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
543 # Ensure the parent directory exists
544 self.parent().mkdir()
546 # Upload the data
547 with time_this(log, msg="Write to remote %s (%d bytes)", args=(self, len(data))):
548 self._put(data=data)
550 def transfer_from(
551 self,
552 src: ResourcePath,
553 transfer: str = "copy",
554 overwrite: bool = False,
555 transaction: Optional[TransactionProtocol] = None,
556 ) -> None:
557 """Transfer the current resource to a Webdav repository.
559 Parameters
560 ----------
561 src : `ResourcePath`
562 Source URI.
563 transfer : `str`
564 Mode to use for transferring the resource. Supports the following
565 options: copy.
566 transaction : `~lsst.resources.utils.TransactionProtocol`, optional
567 Currently unused.
568 """
569 # Fail early to prevent delays if remote resources are requested
570 if transfer not in self.transferModes:
571 raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")
573 # Existence checks cost time so do not call this unless we know
574 # that debugging is enabled.
575 if log.isEnabledFor(logging.DEBUG):
576 log.debug(
577 "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
578 src,
579 src.exists(),
580 self,
581 self.exists(),
582 transfer,
583 )
585 # Short circuit immediately if the URIs are identical.
586 if self == src:
587 log.debug(
588 "Target and destination URIs are identical: %s, returning immediately."
589 " No further action required.",
590 self,
591 )
592 return
594 if self.exists() and not overwrite:
595 raise FileExistsError(f"Destination path {self} already exists.")
597 if transfer == "auto":
598 transfer = self.transferDefault
600 # We can use webDAV 'COPY' or 'MOVE' if both the current and source
601 # resources are located in the same server.
602 if isinstance(src, type(self)) and self.root_uri() == src.root_uri() and self.is_webdav_endpoint:
603 with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
604 return self._move(src) if transfer == "move" else self._copy(src)
606 # For resources of different classes or for plain HTTP resources we can
607 # perform the copy or move operation by downloading to a local file
608 # and uploading to the destination.
609 with time_this(log, msg="Transfer from %s to %s via local copy", args=(src, self)):
610 self._copy_via_local(src)
612 # This was an explicit move, try to remove the source.
613 if transfer == "move":
614 src.remove()
616 def _as_local(self) -> Tuple[str, bool]:
617 """Download object over HTTP and place in temporary directory.
619 Returns
620 -------
621 path : `str`
622 Path to local temporary file.
623 temporary : `bool`
624 Always returns `True`. This is always a temporary file.
625 """
626 resp = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
627 if resp.status_code != requests.codes.ok:
628 raise FileNotFoundError(f"Unable to download resource {self}; status code: {resp.status_code}")
630 tmpdir, buffering = _get_temp_dir()
631 with tempfile.NamedTemporaryFile(
632 suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
633 ) as tmpFile:
634 with time_this(
635 log,
636 msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
637 args=(self, resp.headers.get("Content-Length"), tmpFile.name, buffering),
638 ):
639 for chunk in resp.iter_content(chunk_size=buffering):
640 tmpFile.write(chunk)
642 return tmpFile.name, True
644 def _send_webdav_request(
645 self, method: str, url: Optional[str] = None, headers: dict[str, str] = {}, body: Optional[str] = None
646 ) -> requests.Response:
647 """Send a webDAV request and correctly handle redirects.
649 Parameters
650 ----------
651 method : `str`
652 The mthod of the HTTP request to be sent, e.g. PROPFIND, MKCOL.
653 headers : `dict`, optional
654 A dictionary of key-value pairs (both strings) to include as
655 headers in the request.
656 body: `str`, optional
657 The body of the request.
659 Notes
660 -----
661 This way of sending webDAV requests is necessary for handling
662 redirection ourselves, since the 'requests' package changes the method
663 of the redirected request when the server responds with status 302 and
664 the method of the original request is not HEAD (which is the case for
665 webDAV requests).
667 That means that when the webDAV server we interact with responds with
668 a redirection to a PROPFIND or MKCOL request, the request gets
669 converted to a GET request when sent to the redirected location.
671 See `requests.sessions.SessionRedirectMixin.rebuild_method()` in
672 https://github.com/psf/requests/blob/main/requests/sessions.py
674 This behavior of the 'requests' package is meant to be compatible with
675 what is specified in RFC 9110:
677 https://www.rfc-editor.org/rfc/rfc9110#name-302-found
679 For our purposes, we do need to follow the redirection and send a new
680 request using the same HTTP verb.
681 """
682 if url is None:
683 url = self.geturl()
685 for _ in range(max_redirects := 5):
686 resp = self.session.request(
687 method, url, data=body, headers=headers, stream=True, timeout=TIMEOUT, allow_redirects=False
688 )
689 if resp.is_redirect:
690 url = resp.headers["Location"]
691 else:
692 return resp
694 # We reached the maximum allowed number of redirects. Stop trying.
695 raise ValueError(
696 f"Could not get a response to {method} request for {self} after {max_redirects} redirections"
697 )
699 def _propfind(self, body: Optional[str] = None, depth: str = "0") -> requests.Response:
700 """Send a PROPFIND webDAV request and return the response.
702 Parameters
703 ----------
704 body : `str`, optional
705 The body of the PROPFIND request to send to the server. If
706 provided, it is expected to be a XML document.
707 depth : `str`, optional
708 The value of the 'Depth' header to include in the request.
710 Returns
711 -------
712 response : `requests.Response`
713 Response to the PROPFIND request.
714 """
715 headers = {
716 "Depth": depth,
717 }
718 if body is not None:
719 headers.update(
720 {"Content-Type": 'application/xml; charset="utf-8"', "Content-Length": str(len(body))}
721 )
722 return self._send_webdav_request("PROPFIND", headers=headers, body=body)
724 def _options(self) -> requests.Response:
725 """Send a OPTIONS webDAV request for this resource."""
727 return self._send_webdav_request("OPTIONS")
729 def _head(self) -> requests.Response:
730 """Send a HEAD webDAV request for this resource."""
732 return self._send_webdav_request("HEAD")
734 def _mkcol(self) -> None:
735 """Send a MKCOL webDAV request to create a collection. The collection
736 may already exist.
737 """
738 resp = self._send_webdav_request("MKCOL")
739 if resp.status_code == requests.codes.created: # 201
740 return
742 if resp.status_code == requests.codes.method_not_allowed: # 405
743 # The remote directory already exists
744 log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
745 else:
746 raise ValueError(f"Can not create directory {self}, status code: {resp.status_code}")
748 def _delete(self) -> None:
749 """Send a DELETE webDAV request for this resource."""
751 log.debug("Deleting %s ...", self.geturl())
753 # If this is a directory, ensure the remote is a webDAV server because
754 # plain HTTP servers don't support DELETE requests on non-file
755 # paths.
756 if self.dirLike and not self.is_webdav_endpoint:
757 raise NotImplementedError(
758 f"Deletion of directory {self} is not implemented by plain HTTP servers"
759 )
761 resp = self._send_webdav_request("DELETE")
762 if resp.status_code in (requests.codes.ok, requests.codes.accepted, requests.codes.no_content):
763 return
764 elif resp.status_code == requests.codes.not_found:
765 raise FileNotFoundError(f"Resource {self} does not exist, status code: {resp.status_code}")
766 else:
767 # TODO: the response to a DELETE request against a webDAV server
768 # may be multistatus. If so, we need to parse the reponse body to
769 # determine more precisely the reason of the failure (e.g. a lock)
770 # and provide a more helpful error message.
771 raise ValueError(f"Unable to delete resource {self}; status code: {resp.status_code}")
773 def _copy_via_local(self, src: ResourcePath) -> None:
774 """Replace the contents of this resource with the contents of a remote
775 resource by using a local temporary file.
777 Parameters
778 ----------
779 src : `HttpResourcePath`
780 The source of the contents to copy to `self`.
781 """
782 with src.as_local() as local_uri:
783 with open(local_uri.ospath, "rb") as f:
784 with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
785 self._put(data=f)
787 def _copy_or_move(self, method: str, src: HttpResourcePath) -> None:
788 """Send a COPY or MOVE webDAV request to copy or replace the contents
789 of this resource with the contents of another resource located in the
790 same server.
792 Parameters
793 ----------
794 method : `str`
795 The method to perform. Valid values are "COPY" or "MOVE" (in
796 uppercase).
798 src : `HttpResourcePath`
799 The source of the contents to move to `self`.
800 """
801 headers = {"Destination": self.geturl()}
802 resp = self._send_webdav_request(method, url=src.geturl(), headers=headers)
803 if resp.status_code in (requests.codes.created, requests.codes.no_content):
804 return
806 if resp.status_code == requests.codes.multi_status:
807 tree = eTree.fromstring(resp.content)
808 status_element = tree.find("./{DAV:}response/{DAV:}status")
809 status = status_element.text if status_element is not None else "unknown"
810 error = tree.find("./{DAV:}response/{DAV:}error")
811 raise ValueError(f"{method} returned multistatus reponse with status {status} and error {error}")
812 else:
813 raise ValueError(
814 f"{method} operation from {src} to {self} failed, status code: {resp.status_code}"
815 )
817 def _copy(self, src: HttpResourcePath) -> None:
818 """Send a COPY webDAV request to replace the contents of this resource
819 (if any) with the contents of another resource located in the same
820 server.
822 Parameters
823 ----------
824 src : `HttpResourcePath`
825 The source of the contents to copy to `self`.
826 """
827 # Neither dCache nor XrootD currently implement the COPY
828 # webDAV method as documented in
829 # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
830 # (See issues DM-37603 and DM-37651 for details)
831 #
832 # For the time being, we use a temporary local file to
833 # perform the copy client side.
834 # TODO: when those 2 issues above are solved remove the 3 lines below.
835 must_use_local = True
836 if must_use_local:
837 return self._copy_via_local(src)
839 return self._copy_or_move("COPY", src)
841 def _move(self, src: HttpResourcePath) -> None:
842 """Send a MOVE webDAV request to replace the contents of this resource
843 with the contents of another resource located in the same server.
845 Parameters
846 ----------
847 src : `HttpResourcePath`
848 The source of the contents to move to `self`.
849 """
850 return self._copy_or_move("MOVE", src)
def _put(self, data: Union[BinaryIO, bytes]) -> None:
    """Upload ``data`` with an HTTP PUT request, honoring one redirection.

    Parameters
    ----------
    data : `Union[BinaryIO, bytes]`
        The payload to send in the body of the PUT request.

    Raises
    ------
    ValueError
        Raised if the status code of the upload response is not one of
        OK, Created or No Content.
    """
    # First send a PUT request without any content: if the server answers
    # with a redirection, its "Location" header is the URL the data must
    # actually be uploaded to. Only a single redirection is followed.
    probe_headers = {"Content-Length": "0"}
    if _SEND_EXPECT_HEADER_ON_PUT:
        probe_headers["Expect"] = "100-continue"

    target = self.geturl()
    log.debug("Sending empty PUT request to %s", target)
    probe = self.session.request(
        "PUT",
        target,
        data=None,
        headers=probe_headers,
        stream=True,
        timeout=TIMEOUT,
        allow_redirects=False,
    )
    if probe.is_redirect:
        target = probe.headers["Location"]

    # The actual upload goes through the session dedicated to PUT requests.
    log.debug("Uploading data to %s", target)
    upload = self.put_session.put(target, data=data, timeout=TIMEOUT, allow_redirects=False, stream=True)
    accepted = (requests.codes.ok, requests.codes.created, requests.codes.no_content)
    if upload.status_code in accepted:
        return
    raise ValueError(f"Can not write file {self}, status code: {upload.status_code}")
@contextlib.contextmanager
def _openImpl(
    self,
    mode: str = "r",
    *,
    encoding: Optional[str] = None,
) -> Iterator[ResourceHandleProtocol]:
    # Ask the server (via a HEAD request) whether it advertises support
    # for byte-range requests on this resource.
    head = self._head()
    supports_ranges = (
        head.status_code == requests.codes.ok and head.headers.get("Accept-Ranges") == "bytes"
    )

    # Anything other than a read against a range-capable server is
    # delegated to the parent class implementation.
    if not supports_ranges or mode not in ("rb", "r"):
        with super()._openImpl(mode, encoding=encoding) as fallback:
            yield fallback
        return

    raw: ResourceHandleProtocol = HttpReadResourceHandle(
        mode, log, url=self.geturl(), session=self.session, timeout=TIMEOUT
    )
    if mode == "rb":
        yield raw
    else:
        # Text mode: wrap the binary handle. The cast is needed because
        # the protocol is compatible with BytesIO but does not have it in
        # its inheritance tree.
        yield io.TextIOWrapper(cast(io.BytesIO, raw), encoding=encoding)
def _dump_response(resp: requests.Response) -> None:
    """Write a summary of an HTTP or webDAV exchange to the debug log.

    Parameters
    ----------
    resp : `requests.Response`
        The response to log, along with the request it answers.

    Notes
    -----
    Intended for development purposes only.
    """
    request = resp.request
    log.debug("-----------------------------------------------")
    log.debug("Request")
    log.debug(" method=%s", request.method)
    log.debug(" URL=%s", request.url)
    log.debug(" headers=%s", request.headers)
    # The payload of a PUT request is never logged; any other request body
    # is truncated to its first 120 elements.
    if request.method == "PUT":
        log.debug(" body=<data>")
    elif request.body is not None:
        log.debug(" body=%r", request.body[:120])
    else:
        log.debug(" body=<empty>")

    log.debug("Response:")
    log.debug(" status_code=%d", resp.status_code)
    log.debug(" headers=%s", resp.headers)
    payload = resp.content
    if not payload:
        log.debug(" body=<empty>")
    elif resp.headers.get("Content-Type") == "text/plain":
        # Plain text bodies are logged in full.
        log.debug(" body=%r", payload)
    else:
        # Any other body is truncated to its first 80 bytes.
        log.debug(" body=%r", payload[:80])
941def _is_protected(filepath: str) -> bool:
942 """Return true if the permissions of file at filepath only allow for access
943 by its owner.
945 Parameters
946 ----------
947 filepath : `str`
948 Path of a local file.
949 """
950 if not os.path.isfile(filepath):
951 return False
952 mode = stat.S_IMODE(os.stat(filepath).st_mode)
953 owner_accessible = bool(mode & stat.S_IRWXU)
954 group_accessible = bool(mode & stat.S_IRWXG)
955 other_accessible = bool(mode & stat.S_IRWXO)
956 return owner_accessible and not group_accessible and not other_accessible
959def _parse_propfind_response_body(body: str) -> List[PropfindResponse]:
960 """Parse the XML-encoded contents of the response body to a webDAV PROPFIND
961 request.
963 Parameters
964 ----------
965 body : `str`
966 XML-encoded response body to a PROPFIND request
968 Returns
969 -------
970 responses : `List[PropfindResponse]`
972 Notes
973 -----
974 Is is expected that there is at least one reponse in `body`, otherwise
975 this function raises.
976 """
977 # A response body to a PROPFIND request is of the form (indented for
978 # readability):
979 #
980 # <?xml version="1.0" encoding="UTF-8"?>
981 # <D:multistatus xmlns:D="DAV:">
982 # <D:response>
983 # <D:href>path/to/resource</D:href>
984 # <D:propstat>
985 # <D:prop>
986 # <D:resourcetype>
987 # <D:collection xmlns:D="DAV:"/>
988 # </D:resourcetype>
989 # <D:getlastmodified>
990 # Fri, 27 Jan 2 023 13:59:01 GMT
991 # </D:getlastmodified>
992 # <D:getcontentlength>
993 # 12345
994 # </D:getcontentlength>
995 # </D:prop>
996 # <D:status>
997 # HTTP/1.1 200 OK
998 # </D:status>
999 # </D:propstat>
1000 # </D:response>
1001 # <D:response>
1002 # ...
1003 # </D:response>
1004 # <D:response>
1005 # ...
1006 # </D:response>
1007 # </D:multistatus>
1009 # Scan all the 'response' elements and extract the relevant properties
1010 responses = []
1011 multistatus = eTree.fromstring(body.strip())
1012 for response in multistatus.findall("./{DAV:}response"):
1013 responses.append(PropfindResponse(response))
1015 if len(responses) == 0:
1016 # Could not parse the body
1017 raise ValueError(f"Unable to parse response for PROPFIND request: {response}")
1018 else:
1019 return responses
class PropfindResponse:
    """Helper class to contain the parsed response to a PROPFIND request for
    a single resource.

    Parameters
    ----------
    response : `xml.etree.ElementTree.Element` or `None`
        The 'response' element to extract the properties from. If `None`,
        all the attributes keep their default values.
    """

    # Regular expression to extract the status code and reason from
    # the 'status' element of a PROPFIND response.
    _status_rex = re.compile(r"^HTTP/.* +(?P<status_code>\d{3}) +(?P<reason>.*)$", re.IGNORECASE)

    def __init__(self, response: Optional[eTree.Element]):
        # Default values, kept for any property absent from the response.
        self.status_code: int = 0
        self.reason: str = ""
        self.href: str = ""
        self.collection: bool = False
        self.getlastmodified: str = ""
        self.getcontentlength: int = 0

        if response is not None:
            self._parse(response)

    @staticmethod
    def _text(element: Optional[eTree.Element]) -> str:
        """Return the text of ``element`` stripped of surrounding whitespace,
        or an empty string if the element is missing or has no text.
        """
        if element is None or element.text is None:
            return ""
        return element.text.strip()

    def _parse(self, response: eTree.Element) -> None:
        """Set the attributes of this object from a 'response' element.

        Parameters
        ----------
        response : `xml.etree.ElementTree.Element`
            The 'response' element to extract the properties from.

        Raises
        ------
        ValueError
            Raised if the 'getcontentlength' element contains text which is
            not an integer.
        """
        # Parse "status". The text is stripped first because the regular
        # expression is anchored at the start of the string and servers may
        # surround the status line with whitespace.
        status = self._text(response.find("./{DAV:}propstat/{DAV:}status"))
        if match := self._status_rex.match(status):
            self.status_code = int(match["status_code"])
            self.reason = match["reason"]

        # Parse "href"
        self.href = self._text(response.find("./{DAV:}href"))

        # Parse "collection": the mere presence of the element is enough.
        element = response.find("./{DAV:}propstat/{DAV:}prop/{DAV:}resourcetype/{DAV:}collection")
        if element is not None:
            self.collection = True

        # Parse "getlastmodified"
        self.getlastmodified = self._text(response.find("./{DAV:}propstat/{DAV:}prop/{DAV:}getlastmodified"))

        # Parse "getcontentlength". An element without text keeps the default
        # value instead of failing to convert it to an integer.
        if length := self._text(response.find("./{DAV:}propstat/{DAV:}prop/{DAV:}getcontentlength")):
            self.getcontentlength = int(length)