Coverage for python/lsst/daf/butler/core/_butlerUri/http.py: 16%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

200 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import os 

25import os.path 

26import requests 

27import tempfile 

28import logging 

29import functools 

30 

31__all__ = ('ButlerHttpURI', ) 

32 

33from requests.adapters import HTTPAdapter 

34from requests.packages.urllib3.util.retry import Retry 

35 

36from typing import ( 

37 TYPE_CHECKING, 

38 Optional, 

39 Tuple, 

40 Union, 

41) 

42 

43from lsst.utils.timer import time_this 

44from .utils import NoTransaction 

45from ._butlerUri import ButlerURI 

46 

47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true

48 from ..datastore import DatastoreTransaction 

49 

50log = logging.getLogger(__name__) 

51 

52# Default timeout for all HTTP requests, in seconds 

53TIMEOUT = 20 

54 

55 

def getHttpSession() -> requests.Session:
    """Create a requests.Session pre-configured with environment variable data.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests. Both the ``http://`` and
        ``https://`` prefixes are mounted on an adapter that retries up to
        3 times on status codes 429, 500, 502, 503 and 504.

    Raises
    ------
    KeyError
        Raised if X509 authentication is requested but
        LSST_BUTLER_WEBDAV_PROXY_CERT is not set, or if TOKEN authentication
        is requested but LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if TOKEN authentication is requested and the token file
        does not exist.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is set to a value other than
        X509 or TOKEN.

    Notes
    -----
    The session is configured from the following environment variables:

    - (OPTIONAL) LSST_BUTLER_WEBDAV_CA_BUNDLE: path to the additional CA
      certificates to trust when using HTTPS to communicate with the
      endpoint. If unset, the default certificate verification of
      ``requests`` is left in place.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN. If unset, an unauthenticated
      session is returned.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle:
        session.verify = ca_bundle
    else:
        # Leave session.verify at its default. Assigning None (or an empty
        # string) here would be treated as a false value by requests and
        # would silently switch off TLS certificate verification.
        log.debug("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                  "If you would like to trust additional CAs, please consider "
                  "exporting this variable.")

    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        log.debug("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                  "no authentication configured.")
        log.debug("Unauthenticated session configured and ready.")
        return session

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            # The original lookup failure adds nothing to the message.
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The proxy file is used as both the certificate and the key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    log.debug("Authenticated session configured and ready.")
    return session

120 

121 

def useExpect100() -> bool:
    """Report whether the "Expect: 100-Continue" header should be sent.

    Returns
    -------
    useExpect100 : `bool`
        `True` when the LSST_BUTLER_WEBDAV_EXPECT100 environment variable is
        present (whatever its value), `False` when it is absent.
    """
    # Some endpoints (dCache for example) need this header to redirect
    # requests.
    enabled = "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ
    if enabled:
        log.debug("Expect: 100-Continue header enabled.")
    return enabled

135 

136 

def isTokenAuth() -> bool:
    """Return the status of bearer-token authentication.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise
        (including when the variable is not set at all).

    Notes
    -----
    An unset LSST_BUTLER_WEBDAV_AUTH is a supported configuration:
    `getHttpSession` returns an unauthenticated session in that case. This
    function therefore reports `False` rather than raising, so that it can
    be called safely on every session access.
    """
    return os.environ.get('LSST_BUTLER_WEBDAV_AUTH') == "TOKEN"

154 

155 

def refreshToken(session: requests.Session) -> None:
    """Load the bearer token from disk and install it on the session.

    The token is read from the file named by the
    LSST_BUTLER_WEBDAV_TOKEN_FILE environment variable, newlines are
    removed, and the result is stored in the session's 'Authorization'
    header (replacing any previous value).

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if the configured path is not an existing regular file.
    """
    try:
        token_file = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set")

    if not os.path.isfile(token_file):
        raise FileNotFoundError(f"No token file: {token_file}")

    with open(token_file, "r") as stream:
        # Drop every newline so the token forms a single header value.
        token = "".join(stream.read().split('\n'))

    session.headers.update({'Authorization': f'Bearer {token}'})

178 

179 

@functools.lru_cache
def isWebdavEndpoint(path: Union[ButlerURI, str]) -> bool:
    """Check whether the remote HTTP endpoint implements Webdav features.

    An OPTIONS request is sent to the URL and the endpoint is considered
    to support Webdav if the response carries a ``DAV`` header. Results are
    memoized per ``path`` value.

    Parameters
    ----------
    path : `ButlerURI` or `str`
        URL to the resource to be checked.
        Should preferably refer to the root since the status is shared
        by all paths in that server.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.
    """
    ca_bundle = None
    try:
        ca_bundle = os.environ['LSST_BUTLER_WEBDAV_CA_BUNDLE']
    except KeyError:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "some HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")

    log.debug("Detecting HTTP endpoint type for '%s'...", path)
    # Bound the request like every other request in this module; without a
    # timeout an unresponsive server would block the caller indefinitely.
    r = requests.options(str(path), verify=ca_bundle, timeout=TIMEOUT)
    return 'DAV' in r.headers

207 

208 

def finalurl(r: requests.Response) -> str:
    """Work out where a request should really be sent, honouring redirects.

    When the endpoint answered with a 307 redirect, the target given in
    its ``Location`` header is returned; for any other status the URL of
    the response itself is returned. Knowing the destination up front
    matters for PUT operations: it avoids starting to upload data to the
    first endpoint only to have to send it all again once the redirect
    arrives, which wastes bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent.
    """
    if r.status_code != 307:
        # Not redirected: the request URL is already the destination.
        return r.url

    redirect_target = r.headers['Location']
    log.debug("Request redirected to %s", redirect_target)
    return redirect_target

233 

234 

class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource."""

    # Session shared by every instance of the class. It is replaced by a
    # configured session the first time the ``session`` property is read.
    _session = requests.Session()
    _sessionInitialized = False

    # Result of the WebDAV capability check; None until first requested.
    _is_webdav: Optional[bool] = None

    # Number of bytes requested per chunk when streaming a download to disk.
    _DOWNLOAD_CHUNK_SIZE = 64 * 1024

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource."""
        if ButlerHttpURI._sessionInitialized:
            # Re-read the token file on every access when token
            # authentication is in use. The variable is read with .get()
            # because an unset variable is a valid (unauthenticated)
            # configuration and must not make later accesses fail.
            if os.environ.get('LSST_BUTLER_WEBDAV_AUTH') == "TOKEN":
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        s = getHttpSession()
        ButlerHttpURI._session = s
        ButlerHttpURI._sessionInitialized = True
        return s

    @property
    def is_webdav_endpoint(self) -> bool:
        """Check if the current endpoint implements WebDAV features.

        This is stored per URI but cached by root so there is
        only one check per hostname.
        """
        if self._is_webdav is not None:
            return self._is_webdav

        self._is_webdav = isWebdavEndpoint(self.root_uri())
        return self._is_webdav

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request on the URL returns status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl(), timeout=TIMEOUT)

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            Value of the ``Content-Length`` header of a HEAD request, or 0
            for a directory-like URI.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl(), timeout=TIMEOUT)
        if r.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(r.headers['Content-Length'])

    def mkdir(self) -> None:
        """Create the directory resource if it does not already exist.

        Raises
        ------
        NotImplementedError
            Raised if the endpoint does not implement WebDAV.
        ValueError
            Raised if this URI is file-like, or if the server refuses to
            create the directory.
        """
        # Only available on WebDAV backends
        if not self.is_webdav_endpoint:
            raise NotImplementedError("Endpoint does not implement WebDAV functionality")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # Create the parent first if it is missing. The URL comparison
        # guards against infinite recursion when self == parent, and is
        # done first because it is free whereas exists() needs a request.
        parent = self.parent()
        if parent.geturl() != self.geturl() and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        r = self.session.request("MKCOL", self.geturl(), timeout=TIMEOUT)
        if r.status_code == 201:
            return
        if r.status_code == 405:
            log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
        else:
            raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request does not return 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl(), timeout=TIMEOUT)
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                with time_this(log, msg="Downloading %s to local file", args=(self,)):
                    # iter_content() defaults to 1-byte chunks, so ask for
                    # a sensible chunk size explicitly.
                    for chunk in r.iter_content(chunk_size=self._DOWNLOAD_CHUNK_SIZE):
                        tmpFile.write(chunk)
        finally:
            # A streamed response holds its connection until it is fully
            # consumed or closed; close it on every path, errors included.
            r.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read. Empty if a positive ``size`` was requested and
            the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream when a limited number of bytes was asked for, so that
        # the whole body is not downloaded needlessly.
        stream = size > 0
        with time_this(log, msg="Read from remote resource %s", args=(self,)):
            r = self.session.get(self.geturl(), stream=stream, timeout=TIMEOUT)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # Default to empty bytes so an empty body does not leak
            # StopIteration to the caller.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            # Only the first chunk of a streamed body is consumed, so the
            # response must be closed explicitly to free its connection.
            r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request does not return 201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Probe with an empty PUT so the data is sent straight to the
        # redirect target, if any.
        dest_url = finalurl(self._emptyPut())
        with time_this(log, msg="Write data to remote %s", args=(self,)):
            r = self.session.put(dest_url, data=data, timeout=TIMEOUT)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes listed in ``transferModes``; ``auto`` is replaced by
            ``transferDefault``.
        overwrite : `bool`, optional
            If `True` an existing destination is replaced. If `False`
            (the default) the transfer fails when the destination exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the server
            does not answer the transfer request with 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if ``src`` is also an HTTP resource but the endpoint
            does not implement WebDAV.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Existence checks cost time so do not call this unless we know
        # that debugging is enabled.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        # Honour the overwrite flag: only an unwanted overwrite is an error.
        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # Set when the server itself performed the move, in which case the
        # source no longer exists and must not be deleted a second time.
        moved_by_server = False

        if isinstance(src, type(self)):
            # Only available on WebDAV backends
            if not self.is_webdav_endpoint:
                raise NotImplementedError("Endpoint does not implement WebDAV functionality")

            with time_this(log, msg="Transfer from %s to %s directly", args=(src, self)):
                if transfer == "move":
                    r = self.session.request("MOVE", src.geturl(),
                                             headers={"Destination": self.geturl()},
                                             timeout=TIMEOUT)
                    log.debug("Running move via MOVE HTTP request.")
                    moved_by_server = True
                else:
                    r = self.session.request("COPY", src.geturl(),
                                             headers={"Destination": self.geturl()},
                                             timeout=TIMEOUT)
                    log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    with time_this(log, msg="Transfer from %s to %s via local file", args=(src, self)):
                        r = self.session.put(dest_url, data=f, timeout=TIMEOUT)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a resource that was
        # copied by upload: remove the source to complete the move.
        if transfer == "move" and not moved_by_server:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL.

        This is used to detect if redirection is enabled before sending actual
        data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        headers = {"Content-Length": "0"}
        if useExpect100():
            headers["Expect"] = "100-continue"
        # Redirects are not followed so the caller can inspect the
        # response and pick the destination itself.
        return self.session.put(self.geturl(), data=None, headers=headers,
                                allow_redirects=False, timeout=TIMEOUT)