Coverage for python/lsst/daf/butler/core/_butlerUri/http.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import os
25import os.path
26import requests
27import tempfile
28import logging
30__all__ = ('ButlerHttpURI', )
32from requests.adapters import HTTPAdapter
33from requests.packages.urllib3.util.retry import Retry
35from typing import (
36 TYPE_CHECKING,
37 Optional,
38 Tuple,
39 Union,
40)
42from .utils import NoTransaction
43from ._butlerUri import ButlerURI
44from ..location import Location
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from ..datastore import DatastoreTransaction
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Default timeout for all HTTP requests, in seconds
TIMEOUT = 20
def getHttpSession() -> requests.Session:
    """Build a `requests.Session` configured from environment variables.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set, or if X509
        authentication is selected and LSST_BUTLER_WEBDAV_PROXY_CERT is
        not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is neither X509 nor TOKEN.

    Notes
    -----
    The following environment variables are consulted:

    - LSST_BUTLER_WEBDAV_AUTH (required): which authentication method to
      use. Possible values are X509 and TOKEN.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests.
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests. It is
      read by ``refreshToken``; any error raised there propagates.
    - LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
      certificates are stored if you intend to use HTTPS to
      communicate with the endpoint. A warning is logged when it is
      not set.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: not read here; see
      ``useExpect100``.

    Requests are retried up to 3 times, with backoff, on status codes
    429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    # Each scheme gets its own adapter instance sharing the retry policy.
    for scheme_prefix in ("http://", "https://"):
        session.mount(scheme_prefix, HTTPAdapter(max_retries=retry_policy))

    log.debug("Creating new HTTP session...")

    try:
        auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")

    # Reject unknown methods before configuring anything.
    if auth_method not in ("X509", "TOKEN"):
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    if auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set")
        # The same file is passed as both certificate and key.
        session.cert = (proxy_cert, proxy_cert)

    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")

    # ``verify`` receives the bundle path, or `None` when it is unset.
    session.verify = ca_bundle
    log.debug("Session configured and ready.")

    return session
def useExpect100() -> bool:
    """Report whether the "Expect: 100-Continue" header should be sent.

    Returns
    -------
    useExpect100 : `bool`
        True if LSST_BUTLER_WEBDAV_EXPECT100 is set, False otherwise.
        Only the presence of the variable matters, not its value.
    """
    # This header is required for request redirection, in dCache for example
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        log.debug("Expect: 100-Continue header enabled.")
    return enabled
def isTokenAuth() -> bool:
    """Report whether bearer-token authentication is selected.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    try:
        auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")

    return auth_method == "TOKEN"
def refreshToken(session: requests.Session) -> None:
    """Set or update the 'Authorization' header of the session,
    configure bearer token authentication, with the value fetched
    from LSST_BUTLER_WEBDAV_TOKEN_FILE

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the path in LSST_BUTLER_WEBDAV_TOKEN_FILE is not a file.
    """
    # Only the environment lookup can raise KeyError, so keep the try
    # body limited to it.
    try:
        token_path = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set")

    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    # Open the path that was just validated rather than looking the
    # environment variable up a second time.
    with open(token_path, "r") as fh:
        # Drop newlines and any surrounding whitespace so that they do
        # not end up in the header value.
        bearer_token = fh.read().replace('\n', '').strip()

    session.headers.update({'Authorization': 'Bearer ' + bearer_token})
def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Check that a remote HTTP resource exists.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        ``getHttpSession`` when omitted.

    Returns
    -------
    exists : `bool`
        True if resource exists, False otherwise.
    size : `int`
        Size of the resource, if it exists, in bytes, otherwise -1
    """
    session = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Checking if file exists: %s", url)

    response = session.head(url, timeout=TIMEOUT)
    # Only a 200 response counts as "exists".
    if response.status_code != 200:
        return (False, -1)
    return (True, int(response.headers['Content-Length']))
def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Remove a remote HTTP resource.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        ``getHttpSession`` when omitted.

    Raises
    ------
    FileNotFoundError
        Raised if the DELETE request returns a status code other than
        200, 202 or 204, i.e. the resource does not exist or the
        deletion failed.
    """
    session = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Removing file: %s", url)
    response = session.delete(url, timeout=TIMEOUT)
    accepted = (200, 202, 204)
    if response.status_code not in accepted:
        raise FileNotFoundError(f"Unable to delete resource {url}; status code: {response.status_code}")
def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Check if the Webdav repository at a given URL actually exists.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        ``getHttpSession`` when omitted.

    Returns
    -------
    exists : `bool`
        True if a HEAD request on the URL returns status 200, False
        otherwise.
    """
    session = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Checking if folder exists: %s", url)
    response = session.head(url, timeout=TIMEOUT)
    return response.status_code == 200
def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.
        The check is whether the response to an OPTIONS request carries
        a ``DAV`` header.
    """
    ca_bundle = None
    try:
        ca_bundle = os.environ['LSST_BUTLER_WEBDAV_CA_BUNDLE']
    except KeyError:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    filepath = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    # Apply the module-wide timeout like every other request in this
    # file; without it an unresponsive endpoint blocks indefinitely.
    r = requests.options(filepath, verify=ca_bundle, timeout=TIMEOUT)
    return 'DAV' in r.headers
def finalurl(r: requests.Response) -> str:
    """Return the URL to which data for this request must be sent.

    If the response is a 307 redirect, the value of its ``Location``
    header is returned; otherwise the URL of the response itself.
    This is needed when using PUT operations, to avoid starting
    to send the data to the endpoint, before having to send it again once
    the 307 redirect response is received, and thus wasting bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent.
    """
    if r.status_code == 307:
        redirect_target = r.headers['Location']
        log.debug("Request redirected to %s", redirect_target)
        return redirect_target
    return r.url
def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Return the absolute URL of the resource as a string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    filepath : `str`
        The fully qualified URL of the resource.
    """
    # A Location carries its own URI; anything else is converted through
    # the ButlerURI constructor.
    uri = path.uri if isinstance(path, Location) else ButlerURI(path)
    return uri.geturl()
class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    One `requests.Session` is stored on the class and shared by all
    instances. It is configured the first time the ``session`` property
    is accessed.
    """

    # Placeholder session, replaced by a configured one on first use.
    _session = requests.Session()
    # Whether ``_session`` has been configured by ``getHttpSession``.
    _sessionInitialized = False

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource.

        Raises `RuntimeError` on first access if the base URL of this
        resource is not a Webdav endpoint.
        """
        if ButlerHttpURI._sessionInitialized:
            # Re-read the token file on each access so the header always
            # carries the token currently on disk.
            if isTokenAuth():
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        if isWebdavEndpoint(baseURL):
            log.debug("%s looks like a Webdav endpoint.", baseURL)
            s = getHttpSession()
        else:
            raise RuntimeError(f"Only Webdav endpoints are supported; got base URL '{baseURL}'.")

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the URL returns status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the resource in bytes.

        Returns
        -------
        size : `int`
            0 for a dir-like URI, otherwise the value of the
            ``Content-Length`` header of a HEAD response.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code == 200:
            return int(r.headers['Content-Length'])
        else:
            raise FileNotFoundError(f"Resource {self} does not exist")

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Raises
        ------
        ValueError
            Raised if this URI is file-like, or if the MKCOL request
            returns a status code other than 201 or 405.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # We need to test the absence of the parent directory,
            # but also if parent URL is different from self URL,
            # otherwise we could be stuck in a recursive loop
            # where self == parent
            # The URL comparison comes first because it needs no
            # network request.
            parent = self.parent()
            if parent.geturl() != self.geturl() and not parent.exists():
                parent.mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
            if r.status_code != 201:
                if r.status_code == 405:
                    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
                else:
                    raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request returns a status code other
            than 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        if r.status_code != 200:
            raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            # iter_content() defaults to 1-byte chunks; ask for larger
            # blocks so the file is not written one byte at a time.
            for chunk in r.iter_content(chunk_size=65536):
                tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The full content, or the first chunk of at most ``size``
            bytes when ``size`` is positive.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        stream = size > 0
        r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        if r.status_code != 200:
            raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
        if not stream:
            return r.content
        else:
            # An empty body yields no chunk at all; return empty bytes
            # instead of letting StopIteration escape.
            return next(r.iter_content(chunk_size=size), b"")

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if ``overwrite`` is `False` and the resource exists.
        ValueError
            Raised if the PUT request returns a status code other than
            201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Probe with an empty PUT first so the payload is sent directly
        # to the redirect target, if there is one.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of
            ``self.transferModes``; "auto" is replaced by
            ``self.transferDefault``.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the
            destination already exists. If `True` the existence check is
            skipped.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the
            transfer request returns a status code other than 201, 202
            or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # The existence checks in this message cost one HTTP request
        # each, so only run them when the message will be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # Set when the server itself performed the move, in which case
        # the source is already gone.
        movedOnServer = False

        if isinstance(src, type(self)):
            if transfer == "move":
                r = self.session.request("MOVE", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
                log.debug("Running move via MOVE HTTP request.")
                movedOnServer = True
            else:
                r = self.session.request("COPY", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
                log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    r = self.session.put(dest_url, data=f, timeout=TIMEOUT)
            log.debug("Uploading URI %s to %s via local file", src, self)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a remote resource
        # try to remove that resource. A WebDAV MOVE removes the source
        # itself, so only the upload path needs the explicit removal.
        if transfer == "move" and not movedOnServer:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL. This is used to detect
        if redirection is enabled before sending actual data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint. Redirects are not followed,
            so a redirect response is returned as-is.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        return self.session.put(self.geturl(), data=None, headers=headers,
                                allow_redirects=False, timeout=TIMEOUT)