Coverage for python/lsst/resources/http.py: 18%
230 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-14 02:29 -0700
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-14 02:29 -0700
1# This file is part of lsst-resources.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# Use of this source code is governed by a 3-clause BSD-style
10# license that can be found in the LICENSE file.
12from __future__ import annotations
14import functools
15import logging
16import os
17import os.path
18import random
19import stat
20import tempfile
22import requests
24__all__ = ("HttpResourcePath",)
26from typing import TYPE_CHECKING, BinaryIO, Optional, Tuple, Union
28from lsst.utils.timer import time_this
29from requests.adapters import HTTPAdapter
30from requests.auth import AuthBase
31from urllib3.util.retry import Retry
33from ._resourcePath import ResourcePath
35if TYPE_CHECKING: 35 ↛ 36line 35 didn't jump to line 36, because the condition on line 35 was never true
36 from .utils import TransactionProtocol
38log = logging.getLogger(__name__)
# Default timeouts for all HTTP requests, in seconds.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300

# (connect, read) timeouts, in seconds, applied to HTTP requests. Each value
# can be overridden through the environment.
TIMEOUT = (
    int(os.getenv("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)),
    int(os.getenv("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)),
)

# Should we send an "Expect: 100-continue" header on PUT requests?
# Some servers (e.g. dCache) take that header as an indication that the
# client knows how to handle redirects to the specific server that will
# actually receive the data of a PUT request.
# The header is enabled as soon as the variable is present in the
# environment, whatever its value.
_SEND_EXPECT_HEADER_ON_PUT = os.getenv("LSST_HTTP_PUT_SEND_EXPECT_HEADER") is not None
@functools.lru_cache
def _is_webdav_endpoint(path: Union[ResourcePath, str]) -> bool:
    """Check whether the remote HTTP endpoint implements WebDAV features.

    The result is memoized per distinct ``path`` argument, so the remote
    server is probed at most once for each value.

    Parameters
    ----------
    path : `ResourcePath` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    _is_webdav_endpoint : `bool`
        True if the response to an OPTIONS request includes a "DAV" header,
        False otherwise.
    """
    ca_cert_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
    if ca_cert_bundle is None:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
            "some HTTPS requests may fail if remote server presents a "
            "certificate issued by an unknown certificate authority."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)

    # An unset or empty variable falls back to the default verification.
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True

    # Apply the same timeouts as every other request issued by this module:
    # without them an unresponsive server would block the caller forever.
    resp = requests.options(str(path), verify=verify, timeout=TIMEOUT)
    return "DAV" in resp.headers
# Cached (path, block_size) pair: location of the local directory used to
# store temporary files and the buffer size to use with the underlying file
# system. Populated on first call to _get_temp_dir().
_TMPDIR: Optional[Tuple[str, int]] = None


def _get_temp_dir() -> Tuple[str, int]:
    """Return the temporary directory path and the I/O block size to use.

    The pair is computed once and then served from the module-level
    ``_TMPDIR`` cache.

    Returns
    -------
    tmpdir : `str`
        First existing directory among the values of the environment
        variables ``LSST_RESOURCES_TMPDIR`` and ``TMPDIR``, or the current
        working directory if neither qualifies.
    block_size : `int`
        Buffer size, in bytes, for reading and writing temporary files.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # The current working directory is the fallback when none of the
    # environment variables designates an existing directory.
    candidates = (os.getenv(name) for name in ("LSST_RESOURCES_TMPDIR", "TMPDIR"))
    tmpdir = next(
        (candidate for candidate in candidates if candidate and os.path.isdir(candidate)),
        os.getcwd(),
    )

    # Use 256 blocks of typical size (i.e. 4096 bytes) or 10 times the
    # file system block size, whichever is higher. This is a compromise
    # between the memory used for buffering and the number of system calls
    # issued to read from or write to temporary files.
    fs_block_size = os.statvfs(tmpdir).f_bsize
    block_size = max(10 * fs_block_size, 256 * 4096)

    _TMPDIR = (tmpdir, block_size)
    return _TMPDIR
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself. If empty, no header is
        attached to the requests.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file which is
        accessible by users other than its owner.
    """

    def __init__(self, token: str):
        self._token: Optional[str] = None
        self._path: Optional[str] = None
        # Modification time of the token file when it was last read.
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            # The argument designates a file: the actual token is its content.
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        if (mtime := os.stat(self._path).st_mtime) > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            with open(self._path) as f:
                self._token = f.read().rstrip("\n")
            # Record the modification time only once the file has been
            # successfully read, so that a failed read is retried next time.
            self._mtime = mtime

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to the request, if a token is known.

        Parameters
        ----------
        req : `requests.PreparedRequest`
            Request about to be sent.

        Returns
        -------
        req : `requests.PreparedRequest`
            The same request, with the header added when a token is
            available.
        """
        # Refresh before testing the token: if the token file was empty the
        # last time it was read (e.g. it was being rewritten) the cached
        # token is empty and must not prevent picking up the new value.
        self._refresh()
        if self._token:
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # Sessions indexed by the root URI of the endpoint they talk to.
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ResourcePath, persist: bool = True) -> requests.Session:
        """Retrieve a session for accessing the remote resource at rpath.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the remote server for which a session is to
            be retrieved.

        persist : `bool`
            If `True`, make the network connection with the front end server
            of the endpoint persistent. The adapter used for any other
            (back-end) server is always configured separately, with the
            intent of not persisting connections to those servers.
            Only used when the session is created; ignored when a cached
            session is returned.

        Returns
        -------
        session : `requests.Session`
            Session shared by all the paths under the root URI of ``rpath``.

        Notes
        -----
        Once a session is created for a given endpoint it is cached and
        returned every time a session is requested for any path under that same
        endpoint. For instance, a single session will be cached and shared
        for paths "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path".

        Note that "https://www.example.org" and "https://www.example.org:12345"
        will have different sessions since the port number is not identical.

        In order to configure the session, some environment variables are
        inspected:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
            certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
            local file containing a bearer token to be used as the client
            authentication mechanism with all requests.
            The permissions of the token file must be set so that only its
            owner can access it.
            If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
            and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
            client certificate for authenticating to the server.
            If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
            initialized with the path of the client private key file.
            The permissions of the client private key must be set so that only
            its owner can access it, at least for reading.
        """
        endpoint = str(rpath.root_uri())
        session = self._sessions.get(endpoint)
        if session is None:
            # No session yet for this endpoint: create one and remember it.
            session = self._sessions[endpoint] = self._make_session(rpath, persist)
        return session

    def _make_session(self, rpath: ResourcePath, persist: bool) -> requests.Session:
        """Make a new session configured from values from the environment.

        Parameters
        ----------
        rpath : `ResourcePath`
            URL to a resource at the endpoint the session is created for.
        persist : `bool`
            Whether a connection to the front end server must be persisted.

        Returns
        -------
        session : `requests.Session`
            Newly created session.

        Raises
        ------
        PermissionError
            Raised if the client private key file is not protected.
        ValueError
            Raised if only one of the client certificate and the client
            private key is configured.
        """
        session = requests.Session()
        root_uri = str(rpath.root_uri())
        log.debug("Creating new HTTP session for endpoint %s (persist connection=%s)...", root_uri, persist)

        retries = Retry(
            total=3,
            connect=3,
            read=3,
            status=3,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=5.0 + random.random(),
        )

        # Two adapters are mounted, in this order:
        # - one for the front end server, which persists a single
        #   connection if required;
        # - one for any other server using the same scheme, i.e. the
        #   back-end servers, which may vary from request to request.
        #   Systematically persisting connections to those servers may
        #   exhaust their capabilities when there are thousands of
        #   simultaneous clients.
        pool_sizes = (
            (root_uri, 1 if persist else 0),
            (f"{rpath.scheme}://", 0),
        )
        for prefix, max_size in pool_sizes:
            adapter = HTTPAdapter(
                pool_connections=1, pool_maxsize=max_size, pool_block=False, max_retries=retries
            )
            session.mount(prefix, adapter)

        # Should we use a specific CA cert bundle for authenticating the
        # server?
        ca_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        session.verify = ca_bundle if ca_bundle else True
        if not ca_bundle:
            log.debug(
                "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
                "if you would need to verify the remote server's certificate "
                "issued by specific certificate authorities please consider "
                "initializing this variable."
            )

        # Bearer token authentication takes precedence over client
        # certificate authentication.
        token = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if token:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(token)
            return session

        # Client certificate authentication requires both
        # LSST_HTTP_AUTH_CLIENT_CERT and LSST_HTTP_AUTH_CLIENT_KEY.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")

        if client_cert and not client_key:
            # Only the client certificate was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key and not client_cert:
            # Only the client private key was provided.
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        if not (client_cert or client_key):
            log.debug(
                "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
                "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
            )
            return session

        # Both the certificate and the private key were provided.
        if not _is_protected(client_key):
            raise PermissionError(
                f"Private key file at {client_key} must be protected for access only by its owner"
            )
        log.debug("... using client certificate authentication.")
        session.cert = (client_cert, client_key)
        return session
class HttpResourcePath(ResourcePath):
    """General HTTP(S) resource.

    Notes
    -----
    In order to configure the behavior of the object, one environment variable
    is inspected:

    - LSST_HTTP_PUT_SEND_EXPECT_HEADER: if set (with any value), a
        "Expect: 100-Continue" header will be added to all HTTP PUT requests.
        This header is required by some servers to detect if the client
        knows how to handle redirections. In case of redirection, the body
        of the PUT request is sent to the redirected location and not to
        the front end server.
    """

    # Per-instance cache of the result of the WebDAV detection.
    _is_webdav: Optional[bool] = None

    # Use a session exclusively for PUT requests and another session for
    # all other requests. PUT requests may be redirected and in that case
    # the server may close the persisted connection. If that is the case
    # only the connection persisted for PUT requests will be closed and
    # the other persisted connection will be kept alive and reused for
    # other requests.
    _sessions_store = SessionStore()
    _put_sessions_store = SessionStore()

    @property
    def session(self) -> requests.Session:
        """Client session to address remote resource for all HTTP methods but
        PUT.
        """
        if hasattr(self, "_session"):
            return self._session

        self._session: requests.Session = self._sessions_store.get(self)
        return self._session

    @property
    def put_session(self) -> requests.Session:
        """Client session for uploading data to the remote resource."""
        if hasattr(self, "_put_session"):
            return self._put_session

        self._put_session: requests.Session = self._put_sessions_store.get(self)
        return self._put_session

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = _is_webdav_endpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` only if a HEAD request on the resource returns status
            code 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        return resp.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the "Content-Length" header of the response to a HEAD
            request, or 0 for a directory-like resource.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status code 200.
        """
        if self.dirLike:
            return 0

        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        if resp.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(resp.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is file-like or if the server refuses to
            create the directory.
        """
        # Creating directories is only available on WebDAV backends.
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # We need to test the absence of the parent directory,
            # but also if parent URL is different from self URL,
            # otherwise we could be stuck in a recursive loop
            # where self == parent.
            if not self.parent().exists() and self.parent().geturl() != self.geturl():
                self.parent().mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            resp = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
            if resp.status_code != 201:
                if resp.status_code == 405:
                    # Tolerated: the directory may have been created by
                    # someone else in the meantime.
                    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
                else:
                    raise ValueError(f"Can not create directory {self}, status code: {resp.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request does not return one of the status
            codes 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        resp = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if resp.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {resp.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status code 200.
        """
        resp = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        if resp.status_code != 200:
            raise FileNotFoundError(f"Unable to download resource {self}; status code: {resp.status_code}")

        # The temporary file is not deleted on close: its path is handed
        # over to the caller.
        tmpdir, buffering = _get_temp_dir()
        with tempfile.NamedTemporaryFile(
            suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
        ) as tmpFile:
            with time_this(
                log,
                msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
                args=(self, resp.headers.get("Content-Length"), tmpFile.name, buffering),
            ):
                for chunk in resp.iter_content(chunk_size=buffering):
                    tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The whole contents of the resource, or its first chunk of at
            most ``size`` bytes when ``size`` is positive. Empty if the
            resource has no contents.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status code 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream the body when a partial read is requested.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            resp = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if resp.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {resp.status_code}")
            if not stream:
                return resp.content
            # An empty body yields no chunk at all: return empty bytes
            # instead of letting StopIteration escape to the caller.
            return next(resp.iter_content(chunk_size=size), b"")
        finally:
            # A streamed response which is not fully consumed keeps hold of
            # its connection until it is explicitly closed.
            resp.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the server does not accept the PUT request.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to remote %s (%d bytes)", args=(self, len(data))):
            self._do_put(data=data)

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[TransactionProtocol] = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes listed in ``transferModes``.
        overwrite : `bool`, optional
            If `False`, the transfer fails when the destination already
            exists.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the server
            does not accept the request.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if the source is of the same type as this resource and
            the endpoint does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Short circuit if the URIs are identical immediately.
        if self == src:
            log.debug(
                "Target and destination URIs are identical: %s, returning immediately."
                " No further action required.",
                self,
            )
            return

        if self.exists() and not overwrite:
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Source and destination are of the same type: ask the server
            # to perform the transfer. Only available on WebDAV backends.
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                method = "MOVE" if transfer == "move" else "COPY"
                log.debug("%s from %s to %s", method, src.geturl(), self.geturl())
                resp = self.session.request(
                    method, src.geturl(), headers={"Destination": self.geturl()}, timeout=TIMEOUT
                )
                if resp.status_code not in [201, 202, 204]:
                    raise ValueError(f"Can not transfer file {self}, status code: {resp.status_code}")
        else:
            # Use local file and upload it.
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        self._do_put(data=f)

            # This was an explicit move requested from a remote resource
            # try to remove that resource.
            if transfer == "move":
                # Transactions do not work here
                src.remove()

    def _do_put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request taking into account redirection.

        Parameters
        ----------
        data : `typing.BinaryIO` or `bytes`
            Contents to upload.

        Raises
        ------
        ValueError
            Raised if the final PUT request does not return one of the
            status codes 201, 202 or 204.
        """
        final_url = self.geturl()
        if _SEND_EXPECT_HEADER_ON_PUT:
            # Do a PUT request with an empty body and retrieve the final
            # destination URL returned by the server.
            headers = {"Content-Length": "0", "Expect": "100-continue"}
            resp = self.put_session.put(
                final_url, data=None, headers=headers, allow_redirects=False, timeout=TIMEOUT
            )
            if resp.is_redirect or resp.is_permanent_redirect:
                final_url = resp.headers["Location"]
                log.debug("PUT request to %s redirected to %s", self.geturl(), final_url)

        # Send data to its final destination.
        resp = self.put_session.put(final_url, data=data, timeout=TIMEOUT)
        if resp.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {resp.status_code}")
def _is_protected(filepath: str) -> bool:
    """Tell whether a file is accessible by its owner and by nobody else.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.

    Returns
    -------
    protected : `bool`
        `True` if ``filepath`` is an existing file with at least one
        permission bit set for its owner and none set for its group or for
        other users. `False` otherwise.
    """
    if not os.path.isfile(filepath):
        return False

    permissions = stat.S_IMODE(os.stat(filepath).st_mode)

    # Any permission granted to the group or to other users disqualifies
    # the file.
    if permissions & (stat.S_IRWXG | stat.S_IRWXO):
        return False

    return bool(permissions & stat.S_IRWXU)