Coverage for python/lsst/daf/butler/core/_butlerUri/http.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import os
25import os.path
26import requests
27import tempfile
28import logging
30__all__ = ('ButlerHttpURI', )
32from requests.adapters import HTTPAdapter
33from requests.packages.urllib3.util.retry import Retry
35from typing import (
36 TYPE_CHECKING,
37 Optional,
38 Tuple,
39 Union,
40)
42from .utils import NoTransaction
43from ._butlerUri import ButlerURI
44from ..location import Location
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from ..datastore import DatastoreTransaction
# Logger used by every helper and method in this module; it is named after
# the module (``__name__``).
log = logging.getLogger(__name__)

# Default timeout for all HTTP requests, in seconds. It is passed as the
# ``timeout=`` argument of the session calls made in this module.
TIMEOUT = 20
def getHttpSession() -> requests.Session:
    """Create a requests.Session pre-configured with environment variable data.

    The session retries failed requests (``Retry(total=3, backoff_factor=1)``
    on status codes 429, 500, 502, 503 and 504) for both ``http://`` and
    ``https://`` URLs, and carries the credentials selected through
    LSST_BUTLER_WEBDAV_AUTH.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set, or if
        LSST_BUTLER_WEBDAV_PROXY_CERT is not set while X509 authentication
        is requested.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is neither X509 nor TOKEN.

    Notes
    -----
    The following environment variables are used:

    - LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
      certificates are stored if you intend to use HTTPS to
      communicate with the endpoint. When it is unset or empty the
      session keeps its default certificate verification settings.
    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        # ``from None``: the replacement message is self-contained, so the
        # bare lookup failure would only add noise to the traceback.
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN") from None

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The same file is handed over as both certificate and key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    # Only override ``session.verify`` when a bundle was really provided.
    # Assigning None (or an empty string) to it makes requests skip HTTPS
    # certificate verification altogether, which silently weakens security
    # instead of failing as the previous warning suggested.
    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle:
        session.verify = ca_bundle
    else:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS certificates will be verified against the default CA "
                    "bundle. If your endpoint relies on other CAs, please "
                    "export this variable.")

    log.debug("Session configured and ready.")

    return session
def useExpect100() -> bool:
    """Tell whether the "Expect: 100-Continue" header must be sent.

    Returns
    -------
    useExpect100 : `bool`
        True if LSST_BUTLER_WEBDAV_EXPECT100 is present in the environment
        (whatever its value), False otherwise.
    """
    # Some endpoints (dCache for example) need this header to handle
    # request redirection.
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        log.debug("Expect: 100-Continue header enabled.")
    return enabled
def isTokenAuth() -> bool:
    """Return the status of bearer-token authentication.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        # ``from None`` keeps the traceback limited to the explanatory
        # message below instead of chaining the bare lookup failure.
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN") from None

    return env_auth_method == "TOKEN"
def refreshToken(session: requests.Session) -> None:
    """Refresh the session token.

    Set or update the 'Authorization' header of the session,
    configure bearer token authentication, with the value fetched
    from LSST_BUTLER_WEBDAV_TOKEN_FILE

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the path held by LSST_BUTLER_WEBDAV_TOKEN_FILE is not an
        existing regular file.
    """
    # Only the environment lookup can raise the KeyError we want to
    # re-word, so it is the only statement kept inside the ``try``.
    try:
        token_path = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set") from None

    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    with open(token_path, "r") as fh:
        # Drop every kind of line break (including the '\r' of CRLF files)
        # and surrounding blanks: none of them may appear in a header value.
        bearer_token = "".join(fh.read().splitlines()).strip()

    session.headers.update({'Authorization': 'Bearer ' + bearer_token})
def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Probe a remote HTTP resource with a HEAD request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is obtained from
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if the HEAD request answered with status 200, False otherwise.
    size : `int`
        Value of the ``Content-Length`` response header when the resource
        exists, otherwise -1.
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)
    log.debug("Checking if file exists: %s", url)

    response = http.head(url, timeout=TIMEOUT)
    if response.status_code != 200:
        return (False, -1)
    return (True, int(response.headers['Content-Length']))
def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Delete a remote HTTP resource with a DELETE request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is obtained from
        `getHttpSession` when omitted.

    Raises
    ------
    FileNotFoundError
        Raised whenever the response status code is not 200, 202 or 204,
        i.e. if the resource does not exist or on any other failure.
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)
    log.debug("Removing file: %s", url)

    response = http.delete(url, timeout=TIMEOUT)
    if response.status_code in (200, 202, 204):
        return
    raise FileNotFoundError(f"Unable to delete resource {url}; status code: {response.status_code}")
def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Tell whether the Webdav repository at a given URL answers a HEAD.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is obtained from
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if the HEAD request answered with status 200, False for any
        other status code.
    """
    http = getHttpSession() if session is None else session

    url = _getFileURL(path)
    log.debug("Checking if folder exists: %s", url)

    response = http.head(url, timeout=TIMEOUT)
    return response.status_code == 200
def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    An OPTIONS request is sent to the endpoint, outside of any shared
    session, and the response headers are inspected.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    isWebdav : `bool`
        True if the response carries a ``DAV`` header, False if it doesn't.
    """
    ca_bundle = None
    try:
        ca_bundle = os.environ['LSST_BUTLER_WEBDAV_CA_BUNDLE']
    except KeyError:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    filepath = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    # Bound the probe like every other request of this module: without a
    # timeout an unresponsive server would block the caller indefinitely.
    r = requests.options(filepath, verify=ca_bundle, timeout=TIMEOUT)
    return 'DAV' in r.headers
def finalurl(r: requests.Response) -> str:
    """Work out where a request must really be sent, honouring a 307.

    When the endpoint answers with a 307 redirect, the target of that
    redirect is returned instead of the URL of the response. This is
    needed for PUT operations: knowing the final destination beforehand
    avoids sending the data once to the first endpoint and then a second
    time after the redirect, which would waste bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The ``Location`` header of the response if its status code is 307,
        otherwise the URL of the response itself.
    """
    response_url = r.url
    if r.status_code != 307:
        return response_url

    redirect_target = r.headers['Location']
    log.debug("Request redirected to %s", redirect_target)
    return redirect_target
def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Convert any supported path flavour into a URL string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    filepath : `str`
        The fully qualified URL of the resource: ``path.uri.geturl()`` for
        a `Location`, ``ButlerURI(path).geturl()`` for anything else.
    """
    if not isinstance(path, Location):
        return ButlerURI(path).geturl()
    return path.uri.geturl()
class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    One `requests.Session` is stored on the class and shared by every
    instance. It is configured the first time the ``session`` property is
    read, using the scheme and network location of the instance that
    triggered the initialization.
    """

    # Session shared by all instances; replaced on first use of ``session``.
    _session = requests.Session()
    # True once ``_session`` has been configured by the ``session`` property.
    _sessionInitialized = False
    # True when ``_session`` was built by getHttpSession(), i.e. when it
    # carries Webdav credentials that may need refreshing.
    _sessionIsWebdav = False
    # Number of bytes requested per chunk when streaming a download to disk.
    _DOWNLOAD_CHUNK_SIZE = 1024 * 1024

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        if ButlerHttpURI._sessionInitialized:
            # Tokens are only refreshed on a session created by
            # getHttpSession(). A plain session was built without looking
            # at the authentication variables, so it must neither require
            # them to be set nor start sending a bearer token.
            if ButlerHttpURI._sessionIsWebdav and isTokenAuth():
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        isWebdav = isWebdavEndpoint(baseURL)
        if isWebdav:
            log.debug("%s looks like a Webdav endpoint.", baseURL)
            s = getHttpSession()
        else:
            s = requests.Session()

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionIsWebdav = isWebdav
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the resource answers with status
            200, False otherwise.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            0 for a directory-like URI, otherwise the value of the
            ``Content-Length`` header of a HEAD request.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not answer with status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(r.headers['Content-Length'])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Missing parent directories are created first.

        Raises
        ------
        ValueError
            Raised if this URI is not directory-like, or if the MKCOL
            request answers with a status other than 201 or 405.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # Only recurse when the parent URL differs from our own URL,
        # otherwise we could be stuck in a recursive loop where
        # self == parent. The URL comparison is done first because it
        # does not need a network round trip.
        parent = self.parent()
        if parent.geturl() != self.geturl() and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
        if r.status_code == 201:
            return
        if r.status_code == 405:
            log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
        else:
            raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request answers with a status other than
            200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                # An explicit chunk size is required: the default of
                # iter_content() is a single byte per iteration.
                for chunk in r.iter_content(chunk_size=self._DOWNLOAD_CHUNK_SIZE):
                    tmpFile.write(chunk)
        finally:
            # A streamed response keeps its connection until it is closed.
            r.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read; a value of 0 is handled the
            same way.

        Returns
        -------
        data : `bytes`
            The whole body when all data is requested, otherwise the first
            chunk produced by the streamed response when asked for chunks
            of ``size`` bytes (empty if the body is empty).

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        stream = size > 0
        r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # Default to empty bytes so that an empty body does not leak a
            # StopIteration to the caller.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # Release the connection, which matters when only the first
            # chunk of a streamed body was consumed.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request answers with a status other than
            201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Resolve a possible redirect first so the payload is sent only once.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy. The value must be one of ``self.transferModes``;
            "auto" is replaced by ``self.transferDefault`` and "move" removes
            the source once the transfer succeeded.
        overwrite : `bool`, optional
            If `False` (the default) the transfer is refused when the
            destination already exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the request
            performing the transfer answers with a status other than 201,
            202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Both exists() calls are network requests: only pay for them when
        # the message is really going to be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # Set when the server itself moved the resource, in which case the
        # source is already gone and must not be deleted a second time.
        movedByServer = False

        if isinstance(src, type(self)):
            if transfer == "move":
                log.debug("Running move via MOVE HTTP request.")
                r = self.session.request("MOVE", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
                movedByServer = True
            else:
                log.debug("Running copy via COPY HTTP request.")
                r = self.session.request("COPY", src.geturl(),
                                         headers={"Destination": self.geturl()},
                                         timeout=TIMEOUT)
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    r = self.session.put(dest_url, data=f, timeout=TIMEOUT)
            log.debug("Uploading URI %s to %s via local file", src, self)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a resource that had to
        # be uploaded: try to remove that resource.
        if transfer == "move" and not movedByServer:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL.

        This is used to detect if redirection is enabled before sending actual
        data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint. Redirects are not followed, so
            a redirect answer is returned as is.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        return self.session.put(self.geturl(), data=None, headers=headers,
                                allow_redirects=False, timeout=TIMEOUT)