Coverage for python/lsst/daf/butler/core/_butlerUri/http.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import os
25import os.path
26import requests
27import tempfile
28import logging
30__all__ = ('ButlerHttpURI', )
32from requests.adapters import HTTPAdapter
33from requests.packages.urllib3.util.retry import Retry
35from typing import (
36 TYPE_CHECKING,
37 Optional,
38 Tuple,
39 Union,
40)
42from .utils import NoTransaction
43from ._butlerUri import ButlerURI
44from ..location import Location
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from ..datastore import DatastoreTransaction
49log = logging.getLogger(__name__)
def getHttpSession() -> requests.Session:
    """Build a `requests.Session` configured from environment variables.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if ``LSST_BUTLER_WEBDAV_AUTH`` is not set, or if X509
        authentication is selected and ``LSST_BUTLER_WEBDAV_PROXY_CERT``
        is not set.
    ValueError
        Raised if ``LSST_BUTLER_WEBDAV_AUTH`` is neither X509 nor TOKEN.

    Notes
    -----
    The following environment variables are consulted:

    - LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
      certificates are stored if you intend to use HTTPS to
      communicate with the endpoint. A warning is logged if it is missing.
    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests.
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, an
      "Expect: 100-continue" header is added to all requests. This is
      required on certain endpoints where request redirection is made.
    """
    # Retry transient failures a few times before giving up.
    retry_policy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    http_session = requests.Session()
    for url_prefix in ("http://", "https://"):
        http_session.mount(url_prefix, HTTPAdapter(max_retries=retry_policy))

    log.debug("Creating new HTTP session...")

    try:
        auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")

    if auth_method not in ("X509", "TOKEN"):
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    if auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(http_session)
    else:
        log.debug("... using x509 authentication.")
        try:
            certificate = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set")
        # The same file is given as both the certificate and the key.
        http_session.cert = (certificate, certificate)

    bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    http_session.verify = bundle

    # This header is required for request redirection, in dCache for example
    if "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ:
        log.debug("Expect: 100-Continue header enabled.")
        http_session.headers.update({'Expect': '100-continue'})

    log.debug("Session configured and ready.")

    return http_session
def isTokenAuth() -> bool:
    """Report whether bearer-token authentication is selected.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    try:
        selected_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")
    else:
        return selected_method == "TOKEN"
def refreshToken(session: requests.Session) -> None:
    """Install the bearer token as the session's 'Authorization' header.

    The token is read from the file named by the
    LSST_BUTLER_WEBDAV_TOKEN_FILE environment variable, with newline
    characters removed.

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the token path does not refer to an existing file.
    """
    try:
        token_file = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set")

    if not os.path.isfile(token_file):
        raise FileNotFoundError(f"No token file: {token_file}")

    with open(token_file, "r") as token_fh:
        raw_token = token_fh.read()
    token = "".join(raw_token.split('\n'))

    session.headers.update({'Authorization': 'Bearer ' + token})
def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Probe a remote HTTP resource with a HEAD request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location, ButlerURI or URL string identifying the remote resource.
    session : `requests.Session`, optional
        Session object to query. A new one is created with
        `getHttpSession` if not provided.

    Returns
    -------
    exists : `bool`
        True if the HEAD request returned status 200, False otherwise.
    size : `int`
        Value of the ``Content-Length`` response header if the resource
        exists, otherwise -1.
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Checking if file exists: %s", url)

    response = http.head(url)
    if response.status_code != 200:
        return (False, -1)
    return (True, int(response.headers['Content-Length']))
def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Delete a remote HTTP resource.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location, ButlerURI or URL string identifying the remote resource.
    session : `requests.Session`, optional
        Session object to query. A new one is created with
        `getHttpSession` if not provided.

    Raises
    ------
    FileNotFoundError
        Raised if the DELETE request does not return status 200, 202
        or 204 (resource missing or any other failure).
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Removing file: %s", url)
    response = http.delete(url)
    deleted = response.status_code in [200, 202, 204]
    if not deleted:
        raise FileNotFoundError(f"Unable to delete resource {url}; status code: {response.status_code}")
def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Check whether the Webdav folder at a given URL exists.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location, ButlerURI or URL string identifying the remote folder.
    session : `requests.Session`, optional
        Session object to query. A new one is created with
        `getHttpSession` if not provided.

    Returns
    -------
    exists : `bool`
        True if a HEAD request on the URL returns status 200, False
        otherwise.
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)

    log.debug("Checking if folder exists: %s", url)
    response = http.head(url)
    return response.status_code == 200
def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Detect whether the remote HTTP endpoint advertises Webdav support.

    An OPTIONS request is sent to the URL and the response headers are
    inspected for a ``DAV`` entry.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location, ButlerURI or URL string identifying the endpoint.

    Returns
    -------
    isWebdav : `bool`
        True if the response carries a ``DAV`` header, False if it doesn't.
    """
    bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    url = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    # Uses a one-off request, not a configured session.
    response = requests.options(url, verify=bundle)
    return 'DAV' in response.headers
def finalurl(r: requests.Response) -> str:
    """Return the URL to which data for this request should be sent.

    If the response is a 307 redirect the target of the redirect is
    returned, otherwise the URL of the response itself. This is needed
    when using PUT operations, to avoid starting to send the data to the
    endpoint before having to send it again once the 307 redirect
    response is received, and thus wasting bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint.

    Returns
    -------
    destination_url : `str`
        The final destination to which requests must be sent. Always an
        absolute URL when ``r.url`` is absolute.

    Raises
    ------
    KeyError
        Raised if a 307 response carries no ``Location`` header.
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    from urllib.parse import urljoin

    destination_url = r.url
    if r.status_code == 307:
        # The Location header may be a relative reference; resolve it
        # against the URL that was requested. An absolute Location is
        # returned as-is by urljoin.
        destination_url = urljoin(r.url, r.headers['Location'])
        log.debug("Request redirected to %s", destination_url)
    return destination_url
def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Convert a resource reference into its URL string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location, ButlerURI or URL string identifying the resource.

    Returns
    -------
    filepath : `str`
        The URL of the resource, as returned by ``geturl()``.
    """
    # A Location carries its own URI; anything else is passed through the
    # ButlerURI constructor.
    uri = path.uri if isinstance(path, Location) else ButlerURI(path)
    return uri.geturl()
class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource."""

    # Session shared by all instances of this class. It is replaced by a
    # configured session the first time the ``session`` property is read.
    _session = requests.Session()
    _sessionInitialized = False

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        if ButlerHttpURI._sessionInitialized:
            # Re-read the token file on every access so that a renewed
            # token is picked up.
            if isTokenAuth():
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        if isWebdavEndpoint(baseURL):
            log.debug("%s looks like a Webdav endpoint.", baseURL)

        # Configure the session regardless of the probe's outcome so that
        # ``s`` is always bound before it is stored and returned.
        s = getHttpSession()

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the URL returns status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl())

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            0 for a dir-like URI, otherwise the value of the
            ``Content-Length`` header of a HEAD response.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl())
        if r.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(r.headers['Content-Length'])

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Raises
        ------
        ValueError
            Raised if the URI is file-like, or if the MKCOL request fails
            with a status other than 201 or 405.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if not self.exists():
            # Create the parent first if it is missing. The parent URL is
            # compared with our own URL to avoid recursing forever when
            # self == parent; the comparison runs first because it costs
            # no HTTP request.
            parent = self.parent()
            if parent.geturl() != self.geturl() and not parent.exists():
                parent.mkdir()
            log.debug("Creating new directory: %s", self.geturl())
            r = self.session.request("MKCOL", self.geturl())
            if r.status_code != 201:
                if r.status_code == 405:
                    log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
                else:
                    raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request does not return status 200, 202
            or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl())
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        # The response is used as a context manager so that the streamed
        # connection is released even if the download fails.
        with self.session.get(self.geturl(), stream=True) as r:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                # iter_content() defaults to one byte per chunk; use a
                # sizeable buffer instead.
                for chunk in r.iter_content(chunk_size=65536):
                    tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Zero, negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty if ``size`` is positive and the resource
            has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream when a partial read was requested, so that the rest
        # of the body does not have to be downloaded.
        stream = size > 0
        with self.session.get(self.geturl(), stream=stream) as r:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # An empty body yields no chunk at all; return empty bytes
            # rather than leaking StopIteration to the caller.
            return next(r.iter_content(chunk_size=size), b"")

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request does not return status 201, 202
            or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Find out where the data must go before sending it.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes listed in ``transferModes``.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the
            destination already exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported or if the remote
            request does not return status 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # The existence checks are remote requests: only pay for them when
        # the message is actually going to be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # Set when the server itself relocated the source with MOVE.
        movedRemotely = False
        if isinstance(src, type(self)):
            if transfer == "move":
                r = self.session.request("MOVE", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running move via MOVE HTTP request.")
                movedRemotely = True
            else:
                r = self.session.request("COPY", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    r = self.session.put(dest_url, data=f)
            log.debug("Uploading URI %s to %s via local file", src, self)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a resource that was
        # uploaded rather than moved by the server: remove the source.
        # After a successful MOVE request the source is already gone and
        # deleting it again would fail.
        if transfer == "move" and not movedRemotely:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL. This is used to detect
        if redirection is enabled before sending actual data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint. Redirects are not followed.
        """
        return self.session.put(self.geturl(), data=None,
                                headers={"Content-Length": "0"}, allow_redirects=False)