Coverage for python/lsst/resources/http.py: 17%
246 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-01 02:02 -0800
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-01 02:02 -0800
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.
12from __future__ import annotations
14__all__ = ("HttpResourcePath",)
16import contextlib
17import functools
18import io
19import logging
20import os
21import os.path
22import random
23import stat
24import tempfile
25from typing import TYPE_CHECKING, BinaryIO, Iterator, Optional, Tuple, Union, cast
27import requests
28from lsst.utils.timer import time_this
29from requests.adapters import HTTPAdapter
30from requests.auth import AuthBase
31from urllib3.util.retry import Retry
33from ._resourceHandles import ResourceHandleProtocol
34from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
35from ._resourcePath import ResourcePath
37if TYPE_CHECKING: 37 ↛ 38line 37 didn't jump to line 38, because the condition on line 37 was never true
38 from .utils import TransactionProtocol
# Module-level logger shared by every helper and class in this file.
log = logging.getLogger(__name__)


# Default timeouts for all HTTP requests, in seconds.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300

# Timeouts actually used for requests, as a (connect, read) tuple in seconds.
# Each value can be overridden through the environment; the values must be
# parseable by `int`.
TIMEOUT = (
    int(os.environ.get("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)),
    int(os.environ.get("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)),
)

# Should we send a "Expect: 100-continue" header on PUT requests?
# The "Expect: 100-continue" header is used by some servers (e.g. dCache)
# as an indication that the client knows how to handle redirects to
# the specific server that will actually receive the data for PUT
# requests.
# Only the presence of the variable matters: its value is not inspected.
_SEND_EXPECT_HEADER_ON_PUT = "LSST_HTTP_PUT_SEND_EXPECT_HEADER" in os.environ
@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    The result is memoized per distinct ``path`` argument, so the remote
    server is only queried once for a given value.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the endpoint implements WebDAV, False if it doesn't.
    """
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    if ca_cert_bundle is None:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
            "some HTTPS requests may fail if remote server presents a "
            "certificate issued by an unknown certificate authority."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # Fall back to the default certificate verification when no (or an
    # empty) CA bundle path is configured.
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True

    # Always bound the request with the module timeouts: without them an
    # unresponsive server would block the caller indefinitely.
    resp = requests.options(str(path), verify=verify, timeout=TIMEOUT)

    # The endpoint is considered WebDAV-capable when the response to the
    # OPTIONS request carries a "DAV" header.
    return "DAV" in resp.headers
# Tuple (path, block_size) pointing to the location of a local directory
# to save temporary files and the block size of the underlying file system.
# Populated lazily by `_get_temp_dir`.
_TMPDIR: Optional[Tuple[str, int]] = None


def _get_temp_dir() -> Tuple[str, int]:
    """Return the temporary directory path and block size.

    This function caches its results in _TMPDIR.

    Returns
    -------
    tmpdir : `str`
        Directory in which temporary files should be created.
    block_size : `int`
        Buffer size, in bytes, to use when reading from or writing to
        temporary files in that directory.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # Use the value of environment variables 'LSST_RESOURCES_TMPDIR' or
    # 'TMPDIR', if defined and pointing to an existing directory. Otherwise
    # use current working directory.
    tmpdir = os.getcwd()
    for candidate in (os.getenv(v) for v in ("LSST_RESOURCES_TMPDIR", "TMPDIR")):
        if candidate and os.path.isdir(candidate):
            tmpdir = candidate
            break

    # Compute the block size as 256 blocks of typical size
    # (i.e. 4096 bytes) or 10 times the file system block size,
    # whichever is higher. This is a reasonable compromise between
    # using memory for buffering and the number of system calls
    # issued to read from or write to temporary files.
    fsstats = os.statvfs(tmpdir)
    _TMPDIR = (tmpdir, max(10 * fsstats.f_bsize, 256 * 4096))
    return _TMPDIR
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file whose
        permissions allow access by someone other than its owner.
    """

    def __init__(self, token: str):
        # Value of the token sent to the server, if any.
        self._token: Optional[str] = None
        # Absolute path of the token file; stays `None` when the token value
        # was given directly.
        self._path: Optional[str] = None
        # Modification time of the token file the last time it was read.
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            # Replace the file path stored in _token by the file contents.
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        if (mtime := os.stat(self._path).st_mtime) > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            self._mtime = mtime
            with open(self._path) as f:
                self._token = f.read().rstrip("\n")

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to the request, if a token is set.

        Parameters
        ----------
        req : `requests.PreparedRequest`
            Request about to be sent.

        Returns
        -------
        req : `requests.PreparedRequest`
            The same request object, modified in place when a token is
            available.
        """
        if self._token:
            # Pick up a token file modified since the last request.
            self._refresh()
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # The key of the dictionary is a root URI and the value is the
        # session
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ResourcePath, persist: bool = True) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            if `True`, make the network connection with the front end server
            of the endpoint persistent. Connections to the backend servers
            are not persisted. This argument is only taken into account
            when the session for the endpoint is first created.

        Returns
        -------
        session : `requests.Session`
            The session cached for the root URI of ``rpath``.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        root_uri = str(rpath.root_uri())
        if root_uri not in self._sessions:
            # We don't have yet a session for this endpoint: create a new one
            self._sessions[root_uri] = self._make_session(rpath, persist)
        return self._sessions[root_uri]

    def _make_session(self, rpath: ResourcePath, persist: bool) -> requests.Session:
        """Make a new session configured from values from the environment.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server; only its root URI and
            scheme are used.
        persist : `bool`
            Whether to keep one connection to the front end server in the
            connection pool.

        Returns
        -------
        session : `requests.Session`
            Newly created session.

        Raises
        ------
        PermissionError
            Raised if the bearer token file or the client private key file
            can be accessed by someone other than its owner.
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is set.
        """
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s (persist connection=%s)...", root_uri, persist)

        # The random component added to the backoff factor makes the retry
        # delays differ from one session to the next.
        retries = Retry(
            total=3,
            connect=3,
            read=3,
            backoff_factor=5.0 + random.random(),
            status=3,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        # Persist a single connection to the front end server, if required
        num_connections = 1 if persist else 0
        session.mount(
            root_uri,
            HTTPAdapter(
                pool_connections=1, pool_maxsize=num_connections, pool_block=False, max_retries=retries
            ),
        )

        # Prevent persisting connections to back-end servers which may vary
        # from request to request. Systematically persisting connections to
        # those servers may exhaust their capabilities when there are thousands
        # of simultaneous clients
        session.mount(
            f"{rpath.scheme}://",
            HTTPAdapter(pool_connections=1, pool_maxsize=0, pool_block=False, max_retries=retries),
        )

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        session.verify = True
        if ca_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE"):
            session.verify = ca_bundle
        else:
            log.debug(
                "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
                "if you would need to verify the remote server's certificate "
                "issued by specific certificate authorities please consider "
                "initializing this variable."
            )

        # Should we use bearer tokens for client authentication?
        if token := os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN"):
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Should we instead use client certificate and private key? If so, both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY must be
        # initialized.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")
        if client_cert and client_key:
            if not _is_protected(client_key):
                raise PermissionError(
                    f"Private key file at {client_key} must be protected for access only by its owner"
                )
            log.debug("... using client certificate authentication.")
            session.cert = (client_cert, client_key)
            return session

        if client_cert:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        log.debug(
            "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
            "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
        )
        return session
class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    In order to configure the behavior of the object, one environment variable
    is inspected:

    - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
        "Expect: 100-Continue" header will be added to all HTTP PUT requests.
        This header is required by some servers to detect if the client
        knows how to handle redirections. In case of redirection, the body
        of the PUT request is sent to the redirected location and not to
        the front end server.
    """

    # Cached result of the WebDAV detection for this URI; `None` until the
    # first time `is_webdav_endpoint` is evaluated.
    _is_webdav: Optional[bool] = None

    # Use a session exclusively for PUT requests and another session for
    # all other requests. PUT requests may be redirected and in that case
    # the server may close the persisted connection. If that is the case
    # only the connection persisted for PUT requests will be closed and
    # the other persisted connection will be kept alive and reused for
    # other requests.
    _sessions_store = SessionStore()
    _put_sessions_store = SessionStore()

    @property
    def session(self) -> requests.Session:
        """Client session to address remote resource for all HTTP methods but
        PUT.
        """
        if hasattr(self, "_session"):
            return self._session

        self._session: requests.Session = self._sessions_store.get(self)
        return self._session

    @property
    def put_session(self) -> requests.Session:
        """Client session for uploading data to the remote resource."""
        if hasattr(self, "_put_session"):
            return self._put_session

        self._put_session: requests.Session = self._put_sessions_store.get(self)
        return self._put_session

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = _is_webdav_endpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request on the resource (following redirects)
            is answered with status code 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True)
        return resp.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the "Content-Length" header returned by the server, or
            0 for a directory-like URI.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request is not answered with status code 200.
        """
        if self.dirLike:
            return 0

        resp = self.session.head(self.geturl(), timeout=TIMEOUT, allow_redirects=True)
        if resp.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(resp.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is file-like or if the server refuses to
            create the directory.
        """
        # Creating directories is only available on WebDAV backends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # We need to test the absence of the parent directory,
        # but also if parent URL is different from self URL,
        # otherwise we could be stuck in a recursive loop
        # where self == parent. The URL comparison is done first since,
        # unlike the existence check, it needs no request to the server.
        parent = self.parent()
        if parent.geturl() != self.geturl() and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        resp = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
        if resp.status_code == 201:
            return
        if resp.status_code == 405:
            log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
        else:
            raise ValueError(f"Can not create directory {self}, status code: {resp.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request is not answered with one of the
            status codes 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        resp = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if resp.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {resp.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status code 200.
        """
        resp = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        if resp.status_code != 200:
            # The body of a streamed response has not been consumed: close
            # the response explicitly so its connection is released.
            resp.close()
            raise FileNotFoundError(f"Unable to download resource {self}; status code: {resp.status_code}")

        tmpdir, buffering = _get_temp_dir()
        # The file is intentionally not deleted on close: its path is
        # returned to the caller.
        with tempfile.NamedTemporaryFile(
            suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
        ) as tmpFile:
            with time_this(
                log,
                msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
                args=(self, resp.headers.get("Content-Length"), tmpFile.name, buffering),
            ):
                for chunk in resp.iter_content(chunk_size=buffering):
                    tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The whole contents of the resource, or the first chunk of at
            most ``size`` bytes when ``size`` is positive. Empty if the
            resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status code 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream the body when a part of it is requested.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            resp = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if resp.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {resp.status_code}")
            if not stream:
                return resp.content
            # Return an empty value instead of leaking StopIteration when
            # the resource has no content.
            return next(resp.iter_content(chunk_size=size), b"")
        finally:
            # When streaming, the rest of the body is never consumed: close
            # the response so that its connection is released.
            resp.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the server does not accept the upload.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to remote %s (%d bytes)", args=(self, len(data))):
            self._do_put(data=data)

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the resource at ``src`` to the location of this URI.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of
            ``self.transferModes``. "auto" is replaced by
            ``self.transferDefault`` and "move" removes the source once it
            has been transferred.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the
            destination already exists.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the server
            reports a failure.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if ``src`` is of the same type as this URI and the
            endpoint does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit if the URIs are identical immediately.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        # Test the flag first: no request is sent to the server when
        # overwriting is allowed.
        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Only available on WebDAV backends.
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                method = "MOVE" if transfer == "move" else "COPY"
                log.debug("%s from %s to %s", method, src.geturl(), self.geturl())
                resp = self.session.request(
                    method, src.geturl(), headers={"Destination": self.geturl()}, timeout=TIMEOUT
                )
                if resp.status_code not in [201, 202, 204]:
                    raise ValueError(f"Can not transfer file {self}, status code: {resp.status_code}")
        else:
            # Use local file and upload it.
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        self._do_put(data=f)

            # This was an explicit move requested from a remote resource
            # try to remove that resource.
            if transfer == "move":
                # Transactions do not work here
                src.remove()

    def _do_put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request taking into account redirection.

        Parameters
        ----------
        data : `bytes` or `BinaryIO`
            Contents to upload, passed as the body of the PUT request.

        Raises
        ------
        ValueError
            Raised if the PUT request is not answered with one of the
            status codes 200, 201, 202 or 204.
        """
        final_url = self.geturl()
        if _SEND_EXPECT_HEADER_ON_PUT:
            # Do a PUT request with an empty body and retrieve the final
            # destination URL returned by the server.
            headers = {"Content-Length": "0", "Expect": "100-continue"}
            resp = self.put_session.put(
                final_url, data=None, headers=headers, allow_redirects=False, timeout=TIMEOUT
            )
            if resp.is_redirect or resp.is_permanent_redirect:
                final_url = resp.headers["Location"]
                log.debug("PUT request to %s redirected to %s", self.geturl(), final_url)

        # Send data to its final destination.
        resp = self.put_session.put(final_url, data=data, timeout=TIMEOUT)
        if resp.status_code not in [200, 201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {resp.status_code}")

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: Optional[str] = None,
    ) -> Iterator[ResourceHandleProtocol]:
        """Yield a file-like handle on the remote resource.

        Parameters
        ----------
        mode : `str`, optional
            Open mode. The modes "r" and "rb" are served by a remote read
            handle when the server advertises support for range requests;
            every other case is delegated to the parent class.
        encoding : `str`, optional
            Encoding used to decode the contents in text mode.
        """
        url = self.geturl()
        response = self.session.head(url, timeout=TIMEOUT, allow_redirects=True)

        # A server may explicitly advertise "Accept-Ranges: none" to state
        # that it does not support range requests: the presence of the
        # header alone is not sufficient.
        accepts_range = response.headers.get("Accept-Ranges", "none").strip().lower() != "none"

        handle: ResourceHandleProtocol
        if mode in ("rb", "r") and accepts_range:
            handle = HttpReadResourceHandle(mode, log, url=url, session=self.session, timeout=TIMEOUT)
            if mode == "r":
                # cast because the protocol is compatible, but does not have
                # BytesIO in the inheritance tree
                yield io.TextIOWrapper(cast(io.BytesIO, handle), encoding=encoding)
            else:
                yield handle
        else:
            with super()._openImpl(mode, encoding=encoding) as http_handle:
                yield http_handle
def _is_protected(filepath: str) -> bool:
    """Return true if the permissions of file at filepath only allow for access
    by its owner.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.

    Returns
    -------
    protected : `bool`
        `True` if ``filepath`` is a regular file with at least one
        permission bit set for its owner and none set for group or others.
        `False` otherwise, including when the file cannot be inspected.
    """
    # Query the file system once: the same result is used to check the file
    # type and its permissions, so the file cannot change between the checks.
    try:
        st_mode = os.stat(filepath).st_mode
    except (OSError, ValueError):
        # Missing or inaccessible file, or malformed path.
        return False

    if not stat.S_ISREG(st_mode):
        return False

    mode = stat.S_IMODE(st_mode)
    owner_accessible = bool(mode & stat.S_IRWXU)
    group_or_other_accessible = bool(mode & (stat.S_IRWXG | stat.S_IRWXO))
    return owner_accessible and not group_or_other_accessible