Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import os 

25import os.path 

26import requests 

27import tempfile 

28import logging 

29 

30__all__ = ('ButlerHttpURI', ) 

31 

32from requests.adapters import HTTPAdapter 

33from requests.packages.urllib3.util.retry import Retry 

34 

35from typing import ( 

36 TYPE_CHECKING, 

37 Optional, 

38 Tuple, 

39 Union, 

40) 

41 

42from .utils import NoTransaction 

43from ._butlerUri import ButlerURI 

44from ..location import Location 

45 

46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true

47 from ..datastore import DatastoreTransaction 

48 

49log = logging.getLogger(__name__) 

50 

51 

def getHttpSession() -> requests.Session:
    """Create a `requests.Session` pre-configured from environment variables.

    The session retries failed requests (up to 3 times, with back-off, on
    status codes 429, 500, 502, 503 and 504) for both ``http://`` and
    ``https://`` URLs.

    Returns
    -------
    session : `requests.Session`
        An http session used to execute requests.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set, or if the variable
        required by the selected authentication method is not set.
    ValueError
        Raised if LSST_BUTLER_WEBDAV_AUTH is neither X509 nor TOKEN.

    Notes
    -----
    The following environment variables are used:

    - LSST_BUTLER_WEBDAV_AUTH: (required) which authentication method to
      use. Possible values are X509 and TOKEN.
    - (X509 only) LSST_BUTLER_WEBDAV_PROXY_CERT: path to proxy
      certificate used to authenticate requests.
    - (TOKEN only) LSST_BUTLER_WEBDAV_TOKEN_FILE: file which
      contains the bearer token used to authenticate requests.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_CA_BUNDLE: the directory where CA
      certificates are stored if you intend to use HTTPS to
      communicate with the endpoint. When it is not set a warning is
      logged and the default certificate verification of ``requests``
      is left untouched.
    - (OPTIONAL) LSST_BUTLER_WEBDAV_EXPECT100: if set, we will add an
      "Expect: 100-Continue" header in all requests. This is required
      on certain endpoints where requests redirection is made.
    """
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    log.debug("Creating new HTTP session...")

    try:
        env_auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        # "from None": the bare lookup failure adds nothing to the message.
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN") from None

    if env_auth_method == "X509":
        log.debug("... using x509 authentication.")
        try:
            proxy_cert = os.environ['LSST_BUTLER_WEBDAV_PROXY_CERT']
        except KeyError:
            raise KeyError("Environment variable LSST_BUTLER_WEBDAV_PROXY_CERT is not set") from None
        # The same file is used as both client certificate and key.
        session.cert = (proxy_cert, proxy_cert)
    elif env_auth_method == "TOKEN":
        log.debug("... using bearer-token authentication.")
        refreshToken(session)
    else:
        raise ValueError("Environment variable LSST_BUTLER_WEBDAV_AUTH must be set to X509 or TOKEN")

    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle is None:
        # Do NOT assign None to session.verify: a falsy value can make
        # requests skip certificate verification altogether, which is the
        # opposite of what the warning below announces.
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    else:
        session.verify = ca_bundle

    # This header is required for request redirection, in dCache for example
    if "LSST_BUTLER_WEBDAV_EXPECT100" in os.environ:
        log.debug("Expect: 100-Continue header enabled.")
        session.headers.update({'Expect': '100-continue'})

    log.debug("Session configured and ready.")

    return session

122 

123 

def isTokenAuth() -> bool:
    """Report whether bearer-token authentication is selected.

    Returns
    -------
    isTokenAuth : `bool`
        `True` when the environment variable LSST_BUTLER_WEBDAV_AUTH has the
        value TOKEN, `False` for any other value.

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_AUTH is not set.
    """
    try:
        auth_method = os.environ['LSST_BUTLER_WEBDAV_AUTH']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_AUTH is not set, "
                       "please use values X509 or TOKEN")

    return auth_method == "TOKEN"

141 

142 

def refreshToken(session: requests.Session) -> None:
    """Set or update the 'Authorization' header of the session,
    configure bearer token authentication, with the value fetched
    from LSST_BUTLER_WEBDAV_TOKEN_FILE

    Parameters
    ----------
    session : `requests.Session`
        Session on which bearer token authentication must be configured

    Raises
    ------
    KeyError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE is not set.
    FileNotFoundError
        Raised if LSST_BUTLER_WEBDAV_TOKEN_FILE does not point to a file.
    """
    try:
        token_path = os.environ['LSST_BUTLER_WEBDAV_TOKEN_FILE']
    except KeyError:
        raise KeyError("Environment variable LSST_BUTLER_WEBDAV_TOKEN_FILE is not set") from None

    if not os.path.isfile(token_path):
        raise FileNotFoundError(f"No token file: {token_path}")

    with open(token_path, "r") as fh:
        # Drop every whitespace character, not only "\n": a stray "\r" or
        # trailing blank in the token file would otherwise end up in the
        # header value.
        bearer_token = "".join(fh.read().split())

    session.headers.update({'Authorization': 'Bearer ' + bearer_token})

163 

164 

def webdavCheckFileExists(path: Union[Location, ButlerURI, str],
                          session: Optional[requests.Session] = None) -> Tuple[bool, int]:
    """Probe a remote HTTP resource with a HEAD request.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is obtained from
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if the HEAD request answered with status 200, False otherwise.
    size : `int`
        Value of the ``Content-Length`` header in bytes when the resource
        exists, otherwise -1.
    """
    http = getHttpSession() if session is None else session
    url = _getFileURL(path)

    log.debug("Checking if file exists: %s", url)

    response = http.head(url)
    if response.status_code != 200:
        return (False, -1)
    return (True, int(response.headers['Content-Length']))

192 

193 

def webdavDeleteFile(path: Union[Location, ButlerURI, str],
                     session: Optional[requests.Session] = None) -> None:
    """Delete a remote HTTP resource.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is obtained from
        `getHttpSession` when omitted.

    Raises
    ------
    FileNotFoundError
        Raised when the DELETE request answers with a status code other
        than 200, 202 or 204 (missing resource or any other failure).
    """
    http = getHttpSession() if session is None else session
    filepath = _getFileURL(path)

    log.debug("Removing file: %s", filepath)
    r = http.delete(filepath)
    if r.status_code in (200, 202, 204):
        return
    raise FileNotFoundError(f"Unable to delete resource {filepath}; status code: {r.status_code}")

215 

216 

def folderExists(path: Union[Location, ButlerURI, str],
                 session: Optional[requests.Session] = None) -> bool:
    """Tell whether the Webdav repository at a given URL answers a HEAD
    request with status 200.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.
    session : `requests.Session`, optional
        Session object to query. A new one is obtained from
        `getHttpSession` when omitted.

    Returns
    -------
    exists : `bool`
        True if it exists, False if no folder is found.
    """
    http = getHttpSession() if session is None else session
    url = _getFileURL(path)

    log.debug("Checking if folder exists: %s", url)
    return http.head(url).status_code == 200

241 

242 

def isWebdavEndpoint(path: Union[Location, ButlerURI, str]) -> bool:
    """Detect Webdav support by sending an OPTIONS request to the endpoint.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    isWebdav : `bool`
        True if the response carries a ``DAV`` header, False if it doesn't.
    """
    ca_bundle = os.environ.get('LSST_BUTLER_WEBDAV_CA_BUNDLE')
    if ca_bundle is None:
        log.warning("Environment variable LSST_BUTLER_WEBDAV_CA_BUNDLE is not set: "
                    "HTTPS requests will fail. If you intend to use HTTPS, please "
                    "export this variable.")
    url = _getFileURL(path)

    log.debug("Detecting HTTP endpoint type...")
    response = requests.options(url, verify=ca_bundle)
    return 'DAV' in response.headers

268 

269 

def finalurl(r: requests.Response) -> str:
    """Check whether the remote HTTP endpoint redirects to a different
    endpoint, and return the final destination of the request.
    This is needed when using PUT operations, to avoid starting
    to send the data to the endpoint, before having to send it again once
    the redirect response is received, and thus wasting bandwidth.

    Parameters
    ----------
    r : `requests.Response`
        An HTTP response received when requesting the endpoint

    Returns
    -------
    destination_url: `string`
        The final destination to which requests must be sent. This is the
        absolute form of the ``Location`` header for a 307 or 308 response,
        and the URL of the response itself in every other case (including
        a redirect that carries no ``Location`` header).
    """
    from urllib.parse import urljoin

    destination_url = r.url
    # 307 and 308 are the redirects that require the method (PUT) and body
    # to be replayed unchanged against the new location.
    if r.status_code in (307, 308):
        location = r.headers.get('Location')
        if location:
            # Location may be a relative reference; resolve it against the
            # URL that was requested. Absolute values pass through as is.
            destination_url = urljoin(r.url, location)
            log.debug("Request redirected to %s", destination_url)
    return destination_url

292 

293 

def _getFileURL(path: Union[Location, ButlerURI, str]) -> str:
    """Convert a resource description into its URL string.

    Parameters
    ----------
    path : `Location`, `ButlerURI` or `str`
        Location or ButlerURI containing the bucket name and filepath.

    Returns
    -------
    filepath : `str`
        The fully qualified URL of the resource.
    """
    # A Location already carries a URI; anything else is handed to the
    # ButlerURI constructor.
    uri = path.uri if isinstance(path, Location) else ButlerURI(path)
    return uri.geturl()

312 

313 

class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource.

    Notes
    -----
    One `requests.Session` is shared by all instances of this class. It is
    configured lazily, the first time the ``session`` property is read.
    """

    # Session shared by every instance.
    _session = requests.Session()
    # Becomes True once ``_session`` has been set up by ``session``.
    _sessionInitialized = False
    # True only when ``_session`` was built by getHttpSession(); the bearer
    # token is refreshed for such sessions only.
    _sessionIsWebdav = False

    @property
    def session(self) -> requests.Session:
        """Client object to address remote resource.

        On first access the endpoint is probed with `isWebdavEndpoint`. A
        Webdav endpoint gets the session built by `getHttpSession`; any
        other endpoint gets a plain, unauthenticated `requests.Session`.
        The result is cached at class level.
        """
        if ButlerHttpURI._sessionInitialized:
            # Re-read the bearer token on each access so that an updated
            # token file is picked up. Never attach the token to a plain
            # (non-Webdav) session.
            if ButlerHttpURI._sessionIsWebdav and isTokenAuth():
                refreshToken(ButlerHttpURI._session)
            return ButlerHttpURI._session

        baseURL = self.scheme + "://" + self.netloc

        if isWebdavEndpoint(baseURL):
            log.debug("%s looks like a Webdav endpoint.", baseURL)
            s = getHttpSession()
            isWebdav = True
        else:
            # Without this branch ``s`` would be unbound below.
            log.debug("%s does not look like a Webdav endpoint; using a plain HTTP session.", baseURL)
            s = requests.Session()
            isWebdav = False

        ButlerHttpURI._session = s
        ButlerHttpURI._sessionIsWebdav = isWebdav
        ButlerHttpURI._sessionInitialized = True
        return s

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a HEAD request on the URL answers with status 200.
        """
        log.debug("Checking if resource exists: %s", self.geturl())
        r = self.session.head(self.geturl())

        return r.status_code == 200

    def size(self) -> int:
        """Return the size of the resource in bytes.

        Returns
        -------
        size : `int`
            0 for a dir-like URI, otherwise the value of the
            ``Content-Length`` header of a HEAD request.

        Raises
        ------
        FileNotFoundError
            Raised if the HEAD request does not answer with status 200.
        """
        if self.dirLike:
            return 0
        r = self.session.head(self.geturl())
        if r.status_code != 200:
            raise FileNotFoundError(f"Resource {self} does not exist")
        return int(r.headers['Content-Length'])

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Missing parent directories are created first.

        Raises
        ------
        ValueError
            Raised if the URI is file-like, or if the MKCOL request answers
            with a status code other than 201 or 405.
        """
        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        if self.exists():
            return

        # Recurse into the parent only when it is a different URL (at the
        # root, parent == self and we would loop forever) and it is absent.
        # The URL comparison comes first because it needs no network access.
        parent = self.parent()
        if parent.geturl() != self.geturl() and not parent.exists():
            parent.mkdir()

        log.debug("Creating new directory: %s", self.geturl())
        r = self.session.request("MKCOL", self.geturl())
        if r.status_code == 405:
            log.debug("Can not create directory: %s may already exist: skipping.", self.geturl())
        elif r.status_code != 201:
            raise ValueError(f"Can not create directory {self}, status code: {r.status_code}")

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        FileNotFoundError
            Raised if the DELETE request answers with a status code other
            than 200, 202 or 204.
        """
        log.debug("Removing resource: %s", self.geturl())
        r = self.session.delete(self.geturl())
        if r.status_code not in [200, 202, 204]:
            raise FileNotFoundError(f"Unable to delete resource {self}; status code: {r.status_code}")

    def _as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Downloading remote resource as local file: %s", self.geturl())
        r = self.session.get(self.geturl(), stream=True)
        if r.status_code != 200:
            raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            # iter_content() defaults to 1-byte chunks; ask for 64 KiB so
            # the download is not written one byte at a time.
            for chunk in r.iter_content(chunk_size=65536):
                tmpFile.write(chunk)
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Zero, negative or omitted
            indicates that all data should be read.

        Returns
        -------
        data : `bytes`
            The bytes read; empty if the resource has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the GET request does not answer with status 200.
        """
        log.debug("Reading from remote resource: %s", self.geturl())
        stream = size > 0
        r = self.session.get(self.geturl(), stream=stream)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # Default of b"" avoids StopIteration on an empty body.
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            if stream:
                # The body was not (fully) consumed: release the connection.
                r.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        ValueError
            Raised if the PUT request answers with a status code other than
            201, 202 or 204.
        """
        log.debug("Writing to remote resource: %s", self.geturl())
        if not overwrite and self.exists():
            raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        # Find out where the data must go before sending it.
        dest_url = finalurl(self._emptyPut())
        r = self.session.put(dest_url, data=data)
        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not write file {self}, status code: {r.status_code}")

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a Webdav repository.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy.
        overwrite : `bool`, optional
            If `False` (the default) the transfer fails when the destination
            already exists.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported, or if the request
            answers with a status code other than 201, 202 or 204.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode {transfer} not supported by URI scheme {self.scheme}")

        # The existence checks cost one request each: only pay for them
        # when the message is actually going to be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if not overwrite and self.exists():
            raise FileExistsError(f"Destination path {self} already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        # Set when the server itself moved the source, in which case there
        # is nothing left for us to delete afterwards.
        movedByServer = False
        if isinstance(src, type(self)):
            if transfer == "move":
                r = self.session.request("MOVE", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running move via MOVE HTTP request.")
                movedByServer = True
            else:
                r = self.session.request("COPY", src.geturl(), headers={"Destination": self.geturl()})
                log.debug("Running copy via COPY HTTP request.")
        else:
            # Use local file and upload it
            with src.as_local() as local_uri:
                with open(local_uri.ospath, "rb") as f:
                    dest_url = finalurl(self._emptyPut())
                    r = self.session.put(dest_url, data=f)
            log.debug("Uploading URI %s to %s via local file", src, self)

        if r.status_code not in [201, 202, 204]:
            raise ValueError(f"Can not transfer file {self}, status code: {r.status_code}")

        # This was an explicit move requested from a remote resource
        # try to remove that resource
        if transfer == "move" and not movedByServer:
            # Transactions do not work here
            src.remove()

    def _emptyPut(self) -> requests.Response:
        """Send an empty PUT request to current URL. This is used to detect
        if redirection is enabled before sending actual data.

        Returns
        -------
        response : `requests.Response`
            HTTP Response from the endpoint.
        """
        # Redirects are not followed so that the caller can inspect the
        # redirect response itself.
        return self.session.put(self.geturl(), data=None,
                                headers={"Content-Length": "0"}, allow_redirects=False)