Coverage for python/lsst/daf/butler/core/_butlerUri/http.py: 18%
228 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-01 19:55 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-01 19:55 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import functools
25import logging
26import os
27import os.path
28import random
29import stat
30import tempfile
32import requests
34__all__ = ("ButlerHttpURI",)
36from typing import TYPE_CHECKING, BinaryIO, Optional, Tuple, Union
38from requests.adapters import HTTPAdapter
39from requests.auth import AuthBase
40from urllib3.util.retry import Retry
42from ..utils import time_this
43from ._butlerUri import ButlerURI
44from .utils import NoTransaction
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from ..datastore import DatastoreTransaction
49log = logging.getLogger(__name__)
# Fallback values, in seconds, for the connect and read timeouts applied to
# every HTTP request issued by this module.
DEFAULT_TIMEOUT_CONNECT = 60
DEFAULT_TIMEOUT_READ = 300

# Effective (connect, read) timeout pair. Each element can be overridden via
# the environment; the value found there is converted with int().
TIMEOUT = (
    int(os.getenv("LSST_HTTP_TIMEOUT_CONNECT", DEFAULT_TIMEOUT_CONNECT)),
    int(os.getenv("LSST_HTTP_TIMEOUT_READ", DEFAULT_TIMEOUT_READ)),
)

# Whether PUT requests are preceded by an empty-body request carrying an
# "Expect: 100-continue" header. Some servers (e.g. dCache) take that header
# as a sign that the client can follow a redirect to the specific server
# which will actually receive the uploaded data.
# Only the presence of the variable matters, not its value.
_SEND_EXPECT_HEADER_ON_PUT = os.environ.get("LSST_HTTP_PUT_SEND_EXPECT_HEADER") is not None
class BearerTokenAuth(AuthBase):
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local protected file which contains the
        value of the token or the token itself. If empty, no header is
        attached to the requests.

    Raises
    ------
    PermissionError
        Raised if ``token`` is the path of an existing file whose
        permissions allow access by anyone other than its owner.
    """

    def __init__(self, token: str):
        self._token: Optional[str] = None
        self._path: Optional[str] = None
        # Modification time of the token file when it was last read; -1.0
        # guarantees that the first refresh reads the file.
        self._mtime: float = -1.0
        if not token:
            return

        self._token = token
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            if not _is_protected(self._path):
                raise PermissionError(
                    f"Bearer token file at {self._path} must be protected for access only by its owner"
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more recent
        than the last time we read it.
        """
        if not self._path:
            return

        if (mtime := os.stat(self._path).st_mtime) > self._mtime:
            log.debug("Reading bearer token file at %s", self._path)
            with open(self._path) as f:
                # Drop any surrounding whitespace (including "\r\n" line
                # endings) so it cannot end up in the header value.
                token = f.read().strip()
            # Record the modification time only once the file has been
            # successfully read, so that a failed read is retried.
            self._token = token
            self._mtime = mtime

    def __call__(self, req: requests.PreparedRequest) -> requests.PreparedRequest:
        """Add the 'Authorization' header to the request, if a token is set.

        Parameters
        ----------
        req : `requests.PreparedRequest`
            Request about to be sent.

        Returns
        -------
        req : `requests.PreparedRequest`
            The same request, with the header added when a non-empty token
            is available.
        """
        # Refresh unconditionally: a token file which was empty the last
        # time it was read must still be picked up once it gets populated.
        # This is a no-op when the token was not given as a file.
        self._refresh()
        if self._token:
            req.headers["Authorization"] = f"Bearer {self._token}"
        return req
class SessionStore:
    """Cache a single reusable HTTP client session per endpoint."""

    def __init__(self) -> None:
        # Sessions indexed by the string form of the root URI of the
        # endpoint they talk to.
        self._sessions: dict[str, requests.Session] = {}

    def get(self, rpath: ButlerHttpURI, persist: bool = True) -> requests.Session:
        """Return the session to use for the remote resource at rpath.

        Parameters
        ----------
        rpath : `ButlerHttpURI`
            URL of a resource at the remote server for which a session is
            requested.

        persist : `bool`
            If `True`, the adapter mounted for the endpoint's root URI is
            configured to keep one connection to the front end server.
            Only consulted when a new session has to be created; an
            already cached session is returned unchanged.
            The adapter mounted for any other host of the same scheme
            (e.g. back-end servers) is always configured with
            ``pool_maxsize=0``, independently of this flag.

        Returns
        -------
        session : `requests.Session`
            The session shared by all the paths under the endpoint of
            ``rpath``.

        Notes
        -----
        The cache key is the root URI of ``rpath``, so one session is created
        the first time an endpoint is seen and then handed out for every
        path under that endpoint. For instance, paths
        "https://www.example.org/path/to/file" and
        "https://www.example.org/any/other/path" share a single session.

        "https://www.example.org" and "https://www.example.org:12345" get
        different sessions because their port numbers differ.

        New sessions are configured from these environment variables:

        - LSST_HTTP_CACERT_BUNDLE: path to a .pem file containing the CA
          certificates to trust when verifying the server's certificate.

        - LSST_HTTP_AUTH_BEARER_TOKEN: value of a bearer token or path to a
          local file containing a bearer token to be used as the client
          authentication mechanism with all requests.
          The permissions of the token file must be set so that only its
          owner can access it.
          If initialized, takes precedence over LSST_HTTP_AUTH_CLIENT_CERT
          and LSST_HTTP_AUTH_CLIENT_KEY.

        - LSST_HTTP_AUTH_CLIENT_CERT: path to a .pem file which contains the
          client certificate for authenticating to the server.
          If initialized, the variable LSST_HTTP_AUTH_CLIENT_KEY must also be
          initialized with the path of the client private key file.
          The permissions of the client private key must be set so that only
          its owner can access it, at least for reading.
        """
        endpoint = str(rpath.root_uri())
        cached = self._sessions.get(endpoint)
        if cached is None:
            # First request for this endpoint: build its session and keep it.
            cached = self._sessions[endpoint] = self._make_session(rpath, persist)
        return cached

    def _make_session(self, rpath: ButlerHttpURI, persist: bool) -> requests.Session:
        """Build a session for the endpoint of rpath, configured from the
        environment.

        Raises
        ------
        PermissionError
            Raised if the client private key file is not protected for
            access only by its owner (or, via `BearerTokenAuth`, if the
            bearer token file is not).
        ValueError
            Raised if only one of LSST_HTTP_AUTH_CLIENT_CERT and
            LSST_HTTP_AUTH_CLIENT_KEY is set.
        """
        session = requests.Session()
        endpoint = str(rpath.root_uri())
        log.debug(
            "Creating new HTTP session for endpoint %s (persist connection=%s)...",
            endpoint,
            persist,
        )

        # The back-off factor carries a random component, drawn once per
        # session.
        retry_policy = Retry(
            total=3,
            connect=3,
            read=3,
            status=3,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=5.0 + random.random(),
        )

        # Two adapters are mounted, in this order:
        #  1. one for the endpoint itself, which keeps a single connection
        #     to the front end server when persistence is requested;
        #  2. one for every other host of the same scheme, meant not to
        #     persist connections to back-end servers: those may vary from
        #     request to request and systematically persisting connections
        #     to them may exhaust their capabilities when there are
        #     thousands of simultaneous clients.
        mounts = (
            (endpoint, 1 if persist else 0),
            (f"{rpath.scheme}://", 0),
        )
        for prefix, max_pooled in mounts:
            adapter = HTTPAdapter(
                pool_connections=1,
                pool_maxsize=max_pooled,
                pool_block=False,
                max_retries=retry_policy,
            )
            session.mount(prefix, adapter)

        # Server certificate verification: use the configured CA bundle if
        # there is one, otherwise the default verification.
        ca_bundle = os.getenv("LSST_HTTP_CACERT_BUNDLE")
        if ca_bundle:
            session.verify = ca_bundle
        else:
            session.verify = True
            log.debug(
                "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
                "if you would need to verify the remote server's certificate "
                "issued by specific certificate authorities please consider "
                "initializing this variable."
            )

        # Client authentication, first choice: bearer token.
        bearer = os.getenv("LSST_HTTP_AUTH_BEARER_TOKEN")
        if bearer:
            log.debug("... using bearer token authentication")
            session.auth = BearerTokenAuth(bearer)
            return session

        # Second choice: client certificate and private key, which must be
        # provided together.
        client_cert = os.getenv("LSST_HTTP_AUTH_CLIENT_CERT")
        client_key = os.getenv("LSST_HTTP_AUTH_CLIENT_KEY")

        if client_cert and not client_key:
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_KEY must be set to client private key file path"
            )

        if client_key and not client_cert:
            raise ValueError(
                "Environment variable LSST_HTTP_AUTH_CLIENT_CERT must be set to client certificate file path"
            )

        if not client_cert:
            # Neither of the two is set: no client authentication at all.
            log.debug(
                "Neither LSST_HTTP_AUTH_BEARER_TOKEN nor (LSST_HTTP_AUTH_CLIENT_CERT and "
                "LSST_HTTP_AUTH_CLIENT_KEY) are initialized. Client authentication is disabled."
            )
            return session

        if not _is_protected(client_key):
            raise PermissionError(
                f"Private key file at {client_key} must be protected for access only by its owner"
            )
        log.debug("... using client certificate authentication.")
        session.cert = (client_cert, client_key)
        return session
@functools.lru_cache
def _is_webdav_endpoint(path: Union[ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    The result is memoized per distinct ``path`` value, so the server is
    probed only once for a given argument.

    Parameters
    ----------
    path : `ButlerURI` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.
    """
    if (ca_cert_bundle := os.getenv("LSST_HTTP_CACERT_BUNDLE")) is None:
        log.warning(
            "Environment variable LSST_HTTP_CACERT_BUNDLE is not set: "
            "some HTTPS requests may fail if remote server presents a "
            "certificate issued by an unknown certificate authority."
        )

    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    verify: Union[bool, str] = ca_cert_bundle if ca_cert_bundle else True
    # Apply the same timeouts as every other request of this module: without
    # them, the probe could block forever on an unresponsive server.
    resp = requests.options(str(path), verify=verify, timeout=TIMEOUT)
    # The endpoint is considered a WebDAV server if the response to the
    # OPTIONS request carries a "DAV" header.
    return "DAV" in resp.headers
# Cached (path, block_size) pair: the local directory where temporary files
# are saved and the buffer size to use for reading from or writing to files
# in that directory. `None` until first computed by _get_temp_dir().
_TMPDIR: Optional[Tuple[str, int]] = None


def _get_temp_dir() -> Tuple[str, int]:
    """Return the temporary directory path and block size.

    The result is computed on the first call and stored in ``_TMPDIR``;
    later calls return the stored value.

    Returns
    -------
    tmpdir : `str`
        Value of the first of the environment variables
        'LSST_RESOURCES_TMPDIR' and 'TMPDIR' which designates an existing
        directory, or the current working directory if none does.
    block_size : `int`
        Size in bytes of the buffer to use for temporary files located in
        ``tmpdir``.
    """
    global _TMPDIR
    if _TMPDIR:
        return _TMPDIR

    # First candidate from the environment which is an existing directory.
    from_env = (os.getenv(name) for name in ("LSST_RESOURCES_TMPDIR", "TMPDIR"))
    chosen = next((d for d in from_env if d and os.path.isdir(d)), None)
    if chosen is None:
        chosen = os.getcwd()

    # The buffer size is the larger of 10 file system blocks and 256 blocks
    # of typical size (i.e. 4096 bytes). This is a reasonable compromise
    # between using memory for buffering and the number of system calls
    # issued to read from or write to temporary files.
    fs_block_size = os.statvfs(chosen).f_bsize
    block_size = max(10 * fs_block_size, 256 * 4096)

    _TMPDIR = (chosen, block_size)
    return _TMPDIR
class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    All instances share two stores of client sessions: one used for PUT
    requests and one used for every other HTTP method.
    """

    # Result of the WebDAV detection for this URI; `None` until probed.
    _is_webdav: Optional[bool] = None
    _sessions_store = SessionStore()
    _put_sessions_store = SessionStore()

    @property
    def session(self) -> requests.Session:
        """Client session to address remote resource for all HTTP methods but
        PUT.
        """
        if hasattr(self, "_session"):
            return self._session

        # Looked up in the shared store on first use, then kept on the
        # instance.
        self._session: requests.Session = self._sessions_store.get(self)
        return self._session

    @property
    def put_session(self) -> requests.Session:
        """Client session for uploading data to the remote resource."""
        if hasattr(self, "_put_session"):
            return self._put_session

        self._put_session: requests.Session = self._put_sessions_store.get(self)
        return self._put_session

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = _is_webdav_endpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` only if a HEAD request for this URL is answered with
            status code 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        return resp.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            0 for a directory-like URI, otherwise the value of the
            "Content-Length" header of the response to a HEAD request.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request is not answered with status code 200.
        """
        if self.dirLike:
            return 0

        resp = self.session.head(self.geturl(), timeout=TIMEOUT)
        if resp.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(resp.headers["Content-Length"])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Missing parent directories are created first.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is not directory-like or if the server
            replies to the MKCOL request with a status code other than
            201 or 405.
        """
        # Only available on WebDAV backends
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # We need to test the absence of the parent directory,
            # but also if parent URL is different from self URL,
            # otherwise we could be stuck in a recursive loop
            # where self == parent
            if not self.parent().exists() and self.parent().geturl() != self.geturl():
                self.parent().mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
            if r.status_code != 201:
                if r.status_code == 405:
                    # Tolerated: taken as a sign that the directory is
                    # already there.
                    log.debug(
                        "Can not create directory: %s may already exist: skipping.",
                        self.geturl(),
                    )
                else:
                    raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request is answered with a status code
            other than 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status code 200.
        """
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")

            tmpdir, buffering = _get_temp_dir()
            tmpFile = tempfile.NamedTemporaryFile(
                suffix=self.getExtension(), buffering=buffering, dir=tmpdir, delete=False
            )
            try:
                with tmpFile, time_this(
                    log,
                    msg="Downloading %s [length=%s] to local file %s [chunk_size=%d]",
                    args=(self, r.headers.get("Content-Length"), tmpFile.name, buffering),
                ):
                    for chunk in r.iter_content(chunk_size=buffering):
                        tmpFile.write(chunk)
            except BaseException:
                # The file is created with delete=False: do not leave a
                # partial download behind. Removal is best effort, the
                # original error is the one to report.
                try:
                    os.remove(tmpFile.name)
                except OSError:
                    pass
                raise
            return tmpFile.name, True
        finally:
            # A streamed response holds on to its connection until it is
            # fully consumed or explicitly closed.
            r.close()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read. Zero is currently handled like a
            negative value.

        Returns
        -------
        data : `bytes`
            The whole content of the resource or, if ``size`` is positive,
            the first chunk of the response body requested with a chunk
            size of ``size`` (empty if the body is empty).

        Raises
        ------
        FileNotFoundError
            Raised if the GET request is not answered with status code 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
            try:
                if r.status_code != 200:
                    raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
                if not stream:
                    return r.content
                # An empty body yields no chunk at all: return empty bytes
                # rather than letting StopIteration escape.
                return next(r.iter_content(chunk_size=size), b"")
            finally:
                # Needed for partially read streamed responses, which
                # would otherwise keep their connection checked out.
                r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the server does not report the upload as successful.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        with time_this(log, msg="Write to remote %s (%d bytes)", args=(self, len(data))):
            self._do_put(data=data)

    def transfer_from(
        self,
        src: ButlerURI,
        transfer: str = "copy",
        overwrite: bool = False,
        transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None,
    ) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            If `True`, an already existing destination may be replaced.
            Otherwise the transfer fails if the destination exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the server
            does not report the transfer as successful.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if the source is of the same type as this URI and the
            endpoint does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # Honour the caller's choice: an existing destination is an error
        # only when overwriting has not been allowed.
        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Only available on WebDAV backends
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                # Server-side operation. No "Overwrite" header is sent.
                # NOTE(review): per RFC 4918 a missing header should be
                # treated by the server as allowing overwrite - confirm for
                # the servers in use.
                method = "MOVE" if transfer == "move" else "COPY"
                log.debug("%s from %s to %s", method, src.geturl(), self.geturl())
                resp = self.session.request(
                    method,
                    src.geturl(),
                    headers={"Destination": self.geturl()},
                    timeout=TIMEOUT,
                )
                if resp.status_code not in [201, 202, 204]:
                    raise ValueError(f"Can not transfer file {self}, status code: {resp.status_code}")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    with time_this(
                        log,
                        msg="Transfer from %s to %s via local file",
                        args=(src, self),
                    ):
                        self._do_put(data=f)

            # This was an explicit move requested from a remote resource
            # try to remove that resource
            if transfer == "move":
                # Transactions do not work here
                src.remove()

    def _do_put(self, data: Union[BinaryIO, bytes]) -> None:
        """Perform an HTTP PUT request taking into account redirection.

        Parameters
        ----------
        data : `bytes` or `BinaryIO`
            Content to upload, either in memory or as an open binary file.

        Raises
        ------
        ValueError
            Raised if the final PUT request is answered with a status code
            other than 200, 201, 202 or 204.
        """
        final_url = self.geturl()
        if _SEND_EXPECT_HEADER_ON_PUT:
            # Do a PUT request with an empty body and retrieve the final
            # destination URL returned by the server.
            headers = {"Content-Length": "0", "Expect": "100-continue"}
            resp = self.put_session.put(
                final_url,
                data=None,
                headers=headers,
                allow_redirects=False,
                timeout=TIMEOUT,
            )
            if resp.is_redirect or resp.is_permanent_redirect:
                final_url = resp.headers["Location"]
                log.debug("PUT request to %s redirected to %s", self.geturl(), final_url)

        # Send data to its final destination.
        resp = self.put_session.put(final_url, data=data, timeout=TIMEOUT)
        # 200 is accepted as well: it is a valid success reply when an
        # existing resource has been replaced.
        if resp.status_code not in [200, 201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {resp.status_code}")
def _is_protected(filepath: str) -> bool:
    """Tell whether a file can be accessed by its owner and by nobody else.

    Parameters
    ----------
    filepath : `str`
        Path of a local file.

    Returns
    -------
    protected : `bool`
        `True` if ``filepath`` is an existing file with at least one
        permission bit set for its owner and none set for its group or for
        other users. `False` otherwise, including when ``filepath`` is not
        an existing file.
    """
    if os.path.isfile(filepath):
        perms = stat.S_IMODE(os.stat(filepath).st_mode)
        # Permission bits granting any kind of access to someone other than
        # the owner.
        not_owner = stat.S_IRWXG | stat.S_IRWXO
        return (perms & stat.S_IRWXU) != 0 and (perms & not_owner) == 0
    return False