Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import os 

25import os.path 

26import requests 

27import tempfile 

28import logging 

29 

30__all__ = ('ButlerHttpURI', ) 

31 

32from requests.adapters import HTTPAdapter 

33from requests.packages.urllib3.util.retry import Retry 

34 

35from typing import ( 

36 TYPE_CHECKING, 

37 Optional, 

38 Tuple, 

39 Union, 

40) 

41 

42from .utils import NoTransaction 

43from ._butlerUri import ButlerURI 

44from ..location import Location 

45 

46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true

47 from ..datastore import DatastoreTransaction 

48 

49log = logging.getLogger(__name__) 

50 

51 

def getHttpSession() -> requests.Session:
    """Create a `requests.Session` pre-configured from environment variables.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if ``LSST_BUTLER_WEBDAV_AUTH`` is not set, or if the variable
        required by the selected authentication method is not set.
    ValueError
        Raised if ``LSST_BUTLER_WEBDAV_AUTH`` is neither ``X509`` nor
        ``TOKEN``.

    Notes
    -----
    The following environment variables are used:

    - LSST_BUTLER_WEBDAV_AUTH: which authentication method to use.
      Possible values are X509 and TOKEN
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests
    - (OPTIONAL) LSST_BUTLER_WEBDAV_CA_BUNDLE: the location of the CA
      certificates used to verify the endpoint when communicating over
      HTTPS. When it is not set the session keeps the default certificate
      verification of ``requests``.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    # Retry transient server-side failures a few times before giving up.
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        # The replacement message is self-explanatory; hide the bare lookup
        # failure from the traceback.
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN") from None

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The same proxy file is given as both certificate and key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle:
        session.verify = ca_bundle
    else:
        # Do NOT assign None to session.verify: it is not a documented value
        # and, being falsy, can switch certificate verification off.
        # Leaving the attribute untouched keeps the default verification.
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS certificates will be verified against the default CA bundle. "
                    "If your endpoint requires specific CA certificates, please "
                    "export this variable.")

    # This header is required for request redirection, in dCache for example
    if "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ:
        log.debug("Expect: 100-Continue header enabled.")
        session.headers.update({'Expect': '100-continue'})

    log.debug("Session configured and ready.")

    return session

122 

123 

def isTokenAuth() -> bool:
    """Return the status of bearer-token authentication.

    Returns
    -------
    isTokenAuth : `bool`
        True if LSST_BUTLER_WEBDAV_AUTH is set to TOKEN, False otherwise.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        # The replacement message already names the variable; suppress the
        # redundant chained lookup error.
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN") from None

    return env_auth_method == "TOKEN"

141 

142 

def refreshToken(session: requests.Session) -> None:
    """Set or update the 'Authorization' header of the session.

    Configures bearer token authentication with the value read from the
    file named by LSST_BUTLER_WEBDAV_TOKEN_FILE.

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE does not point to a file.
    """
    # Only the environment lookup can raise KeyError, so keep the try narrow.
    try:
        token_path = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set") from None

    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    # Open the path that was just validated rather than reading the
    # environment a second time.
    with open(token_path, "r") as fh:
        # Drop line breaks and any surrounding whitespace so that the header
        # value holds the token only.
        bearer_token = fh.read().replace('\n', '').strip()

    session.headers.update({'Authorization': 'Bearer ' + bearer_token})

163 

164 

def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Check that a remote HTTP resource exists.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if resource exists, False otherwise.
    size : `int`
        Size of the resource in bytes. -1 if the resource does not exist or
        if the server did not report a ``Content-Length``.
    """
    if session is None:
        session = getHttpSession()

    filepath = _getFileURL(path)

    log.debug("Checking if file exists: %s", filepath)

    r = session.head(filepath)
    if r.status_code != 200:
        return False, -1

    # A successful HEAD response is not guaranteed to carry Content-Length;
    # report an unknown size instead of failing with KeyError.
    content_length = r.headers.get('Content-Length')
    size = int(content_length) if content_length is not None else -1
    return True, size

192 

193 

def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Delete a remote HTTP resource.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` when omitted.

    Raises
    ------
    FileNotFoundError
        Raised if the server answers the DELETE request with a status code
        other than 200, 202 or 204.
    """
    client = getHttpSession() if session is None else session
    url = _getFileURL(path)

    log.debug("Removing file: %s", url)
    response = client.delete(url)

    # Any of these codes means the deletion was accepted by the server.
    if response.status_code in (200, 202, 204):
        return
    raise FileNotFoundError(f"Unable to delete resource {url}; status code: {response.status_code}")

215 

216 

def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Report whether the Webdav repository at a given URL exists.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new session is created with
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if a HEAD request on the URL returns status 200, False
        otherwise.
    """
    client = getHttpSession() if session is None else session
    url = _getFileURL(path)

    log.debug("Checking if folder exists: %s", url)
    response = client.head(url)
    return response.status_code == 200

241 

242 

def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Tell whether the remote HTTP endpoint implements Webdav features.

    The endpoint is probed with an OPTIONS request and is considered a
    Webdav endpoint if the response carries a ``DAV`` header.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    isWebdav : `bool`
        True if the endpoint implements Webdav, False if it doesn't.
    """
    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    url = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    response = requests.options(url, verify=ca_bundle)
    return 'DAV' in response.headers

268 

269 

def finalurl(r: requests.Response) -> str:
    """Return the URL to which subsequent requests must be sent.

    If the response is a 307 redirect, the target found in its ``Location``
    header is returned; otherwise the URL of the response itself is
    returned. This is needed when using PUT operations, to avoid starting
    to send the data to the endpoint, before having to send it again once
    the 307 redirect response is received, and thus wasting bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent.
    """
    if r.status_code != 307:
        # No redirection: keep talking to the URL that answered.
        return r.url

    redirect_target = r.headers['Location']
    log.debug("Request redirected to %s", redirect_target)
    return redirect_target

292 

293 

def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Convert a resource description into its absolute URL string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    filepath : `str`
        The fully qualified URL of the resource.
    """
    # A Location already carries its URI; anything else is handed to the
    # ButlerURI constructor.
    uri = path.uri if isinstance(path, Location) else ButlerURI(path)
    return uri.geturl()

312 

313 

class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    One `requests.Session` is stored on the class and shared by all
    instances. It is configured the first time the ``session`` property is
    accessed.
    """

    # Placeholder session, replaced by a configured one on first use.
    _session = requests.Session()
    # Whether ``_session`` has been configured by ``getHttpSession``.
    _sessionInitialized = False

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource.

        Raises
        ------
        RuntimeError
            Raised on first access if the remote endpoint does not advertise
            Webdav support.
        """
        if ButlerHttpURI._sessionInitialized:
            # Re-read the token file on every access so that a renewed token
            # is picked up by the shared session.
            if isTokenAuth():
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        if isWebdavEndpoint(baseURL):
            log.debug("%s looks like a Webdav endpoint.", baseURL)
            s = getHttpSession()
        else:
            raise RuntimeError(f"Only Webdav endpoints are supported; got base URL '{baseURL}'.")

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            True if a HEAD request on the resource returns status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl())

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the remote resource in bytes.

        Returns
        -------
        size : `int`
            0 for a dir-like URI, otherwise the value of the
            ``Content-Length`` header of a HEAD response.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not return status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl())
        if r.status_code == 200:
            return int(r.headers['Content-Length'])
        else:
            raise FileNotFoundError(f"Resource {self} does not exist")

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Raises
        ------
        ValueError
            Raised if the URI is file-like, or if the server refuses to
            create the directory.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # Create the missing parent first. The URL comparison comes first so
        # that a root URI (whose parent is itself) neither recurses forever
        # nor triggers a useless request.
        parent = self.parent()
        if parent.geturl() != self.geturl() and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        r = self.session.request("MKCOL", self.geturl())
        if r.status_code != 201:
            if r.status_code == 405:
                log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
            else:
                raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request returns a status code other than
            200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl())
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                # iter_content() defaults to one-byte chunks; ask for large
                # chunks so the download is not written byte by byte.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    tmpFile.write(chunk)
        finally:
            # A streamed response must be closed explicitly.
            r.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read; empty if the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not return status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        # Only stream when a partial read is requested, so that the whole
        # body is not downloaded needlessly.
        stream = size > 0
        r = self.session.get(self.geturl(), stream=stream)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # An empty body yields no chunk at all: return empty bytes
            # rather than letting StopIteration escape.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            if stream:
                # The rest of the body is not wanted; close the response.
                r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request returns a status code other than 201,
            202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Resolve a possible redirect before sending the payload.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            modes listed in ``transferModes``.
        overwrite : `bool`, optional
            If `True` an existing destination is replaced. If `False` (the
            default) the transfer fails when the destination exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the server
            rejects the transfer.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # Query the destination once and reuse the answer.
        destination_exists = self.exists()

        # Probing the source costs a remote request: only do it when the
        # message will actually be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, destination_exists, transfer)

        if destination_exists and not overwrite:
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # A Webdav MOVE already removes the source on the server; only a
        # move emulated by download + upload leaves a source to delete.
        remove_source = False

        if isinstance(src, type(self)):
            if transfer == "move":
                r = self.session.request("MOVE", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running move via MOVE HTTP request.")
            else:
                r = self.session.request("COPY", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    r = self.session.put(dest_url, data=f)
            log.debug("Uploading URI %s to %s via local file", src, self)
            remove_source = transfer == "move"

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a remote resource
        # try to remove that resource
        if remove_source:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL. This is used to detect
        if redirection is enabled before sending actual data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        # Redirects are not followed so that the caller can inspect the
        # redirect response itself.
        return self.session.put(self.geturl(), data=None,
                                headers={"Content-Length": "0"}, allow_redirects=False)