Coverage for python/lsst/daf/butler/core/_butlerUri/http.py: 15%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import os
25import os.path
26import requests
27import tempfile
28import logging
29import functools
31__all__ = ('ButlerHttpURI', )
33from requests.adapters import HTTPAdapter
34from requests.packages.urllib3.util.retry import Retry
36from typing import (
37 TYPE_CHECKING,
38 Optional,
39 Tuple,
40 Union,
41)
43from lsst.utils.timer import time_this
44from .utils import NoTransaction
45from ._butlerUri import ButlerURI
47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true
48 from ..datastore import DatastoreTransaction
50log = logging.getLogger(__name__)
52# Default timeout for all HTTP requests, in seconds
53TIMEOUT = 20
def getHttpSession() -> requests.Session:
    """Create a requests.Session pre-configured with environment variable data.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if X509 authentication is requested but
        LSST_BUTLER_WEBDAV_PROXY_CERT is not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is set to a value other than
        X509 or TOKEN.

    Notes
    -----
    The following environment variables are read:

    - LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
      certificates are stored if you intend to use HTTPS to
      communicate with the endpoint. If it is not set the default
      certificate verification of the session is left untouched.
    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN. If it is not set an
      unauthenticated session is returned.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle:
        session.verify = ca_bundle
    else:
        # Do not assign a falsy value to session.verify: requests interprets
        # it as "skip certificate verification". Leaving the attribute alone
        # keeps the default verification against the system CAs.
        log.debug("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                  "If you would like to trust additional CAs, please consider "
                  "exporting this variable.")

    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        log.debug("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                  "no authentication configured.")
        log.debug("Unauthenticated session configured and ready.")
        return session

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set")
        # The same proxy file is passed as both client certificate and key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    log.debug("Authenticated session configured and ready.")
    return session
def useExpect100() -> bool:
    """Report whether the "Expect: 100-Continue" header must be sent.

    Returns
    -------
    useExpect100 : `bool`
        `True` when the LSST_BUTLER_WEBDAV_EXPECT100 environment variable
        is defined (whatever its value), `False` otherwise.
    """
    # Some endpoints (dCache for example) need this header to handle
    # request redirection.
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        log.debug("Expect: 100-Continue header enabled.")
    return enabled
def isTokenAuth() -> bool:
    """Tell whether bearer-token authentication is the configured method.

    Returns
    -------
    isTokenAuth : `bool`
        `True` if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, `False` if it
        is set to any other value.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set at all.
    """
    try:
        return os.environ['LSST_BUTLER_WEBDAV_AUTH'] == "TOKEN"
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")
def refreshToken(session: requests.Session) -> None:
    """Load the bearer token from disk and install it on the session.

    The token is read from the file named by LSST_BUTLER_WEBDAV_TOKEN_FILE,
    newline characters are removed, and the result is stored in the
    'Authorization' header of the session, replacing any previous value.

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the configured path is not an existing file.
    """
    try:
        token_file = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set")

    if not os.path.isfile(token_file):
        raise FileNotFoundError(f"No token file: {token_file}")

    with open(token_file, "r") as fh:
        raw_token = fh.read()

    # Drop every newline so the token fits on a single header line.
    token = "".join(raw_token.split('\n'))
    session.headers.update({'Authorization': f"Bearer {token}"})
@functools.lru_cache
def isWebdavEndpoint(path: Union[ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    The endpoint is probed with an OPTIONS request and is considered a
    WebDAV server when the response carries a 'DAV' header. Results are
    memoized per ``path`` value for the lifetime of the process.

    Parameters
    ----------
    path : `ButlerURI` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.
    """
    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "some HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")

    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    # Bound the probe like every other request in this module so that an
    # unresponsive server can not block the caller indefinitely.
    r = requests.options(str(path), verify=ca_bundle, timeout=TIMEOUT)
    return 'DAV' in r.headers
def finalurl(r: requests.Response) -> str:
    """Work out where a request should really be sent.

    A 307 response means the endpoint redirects to another location; in
    that case the target found in the 'Location' header is returned.
    For any other status the URL of the response itself is returned.
    Knowing the destination before a PUT avoids starting to send the
    data to the endpoint, then having to send it again once the redirect
    is received, which wastes bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url : `str`
        The final destination to which requests must be sent.
    """
    if r.status_code != 307:
        return r.url

    redirect_target = r.headers['Location']
    log.debug("Request redirected to %s", redirect_target)
    return redirect_target
class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    All instances share a single `requests.Session`, stored on the class
    and configured the first time the ``session`` property is accessed.
    """

    # Number of bytes requested per chunk when streaming a download to disk.
    _DOWNLOAD_CHUNK_SIZE = 1024 * 1024

    # Session shared by every instance; replaced by a configured session on
    # first access of the ``session`` property.
    _session = requests.Session()
    _sessionInitialized = False
    # Cached result of the WebDAV detection; `None` until it has been run.
    _is_webdav: Optional[bool] = None

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        if ButlerHttpURI._sessionInitialized:
            # isTokenAuth() raises KeyError when no authentication method is
            # configured, but getHttpSession() accepts that setup and returns
            # an unauthenticated session, so treat it as "no token".
            try:
                token_auth = isTokenAuth()
            except KeyError:
                token_auth = False
            if token_auth:
                # Re-read the token file on every access.
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        s = getHttpSession()
        ButlerHttpURI._session = s
        ButlerHttpURI._sessionInitialized = True
        return s

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = isWebdavEndpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request on the URL returns status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the 'Content-Length' header of a HEAD request, or 0
            for a directory-like URI.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code == 200:
            return int(r.headers['Content-Length'])
        else:
            raise FileNotFoundError(f"Resource {self} does not exist")

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Missing parent directories are created first.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint is not a WebDAV server.
        ValueError
            Raised if this URI is file-like, or if the server rejects the
            MKCOL request with a status other than 405.
        """
        # Only available on WebDAV backends
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            parent = self.parent()
            # Compare the URLs before asking the server: it is cheap, and
            # when parent == self recursing would never terminate.
            if parent.geturl() != self.geturl() and not parent.exists():
                parent.mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
            if r.status_code != 201:
                if r.status_code == 405:
                    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
                else:
                    raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request does not return 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                with time_this(log, msg="Downloading %s to local file", args=(self,)):
                    # An explicit chunk size is required: the default of
                    # iter_content() yields the body one byte at a time.
                    for chunk in r.iter_content(chunk_size=self._DOWNLOAD_CHUNK_SIZE):
                        tmpFile.write(chunk)
        finally:
            # A streamed response holds its connection until it is closed
            # or fully consumed.
            r.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read; empty if the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream when a positive size is requested, so that the body
        # is not downloaded in full.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # Default to empty bytes so that an empty body does not leak a
            # StopIteration to the caller.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # Only part of a streamed body may have been read; close the
            # response so the connection is not left open.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request does not return 201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Find out where the data must go before sending it.
        dest_url = finalurl(self._emptyPut())
        with time_this(log, msg="Write data to remote %s", args=(self,)):
            r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of
            ``self.transferModes``. "move" removes the source after the
            transfer, "auto" is replaced by ``self.transferDefault`` and
            any other accepted mode is handled as a copy.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the
            destination already exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the server
            does not answer the transfer request with 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if ``src`` is also an HTTP URI and the endpoint is not a
            WebDAV server.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # HTTP to HTTP transfers are delegated to the server.
        server_side = isinstance(src, type(self))
        if server_side:
            # Only available on WebDAV backends
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                if transfer == "move":
                    r = self.session.request("MOVE", src.geturl(),
                                             headers={"Destination": self.geturl()},
                                             timeout=TIMEOUT)
                    log.debug("Running move via MOVE HTTP request.")
                else:
                    r = self.session.request("COPY", src.geturl(),
                                             headers={"Destination": self.geturl()},
                                             timeout=TIMEOUT)
                    log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        r = self.session.put(dest_url, data=f, timeout=TIMEOUT)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a remote resource
        # try to remove that resource. A WebDAV MOVE already removed the
        # source on the server, so only uploads need the explicit removal.
        if transfer == "move" and not server_side:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL.

        This is used to detect if redirection is enabled before sending actual
        data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        # Redirects are not followed so that the caller can inspect a 307
        # answer and its 'Location' header.
        return self.session.put(self.geturl(), data=None, headers=headers,
                                allow_redirects=False, timeout=TIMEOUT)