Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

21 

from __future__ import annotations

import copy
import logging
import posixpath
import re
import urllib.parse
from pathlib import PurePath, PurePosixPath
from typing import (
    TYPE_CHECKING,
    Any,
    Optional,
    Tuple,
    Type,
    Union,
)

from .utils import NoTransaction

if TYPE_CHECKING:
    from ..datastore import DatastoreTransaction

__all__ = ('ButlerURI',)

47 

48 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Regex for looking for URI escapes: a percent sign followed by exactly two
# upper-case hexadecimal digits (e.g. ``%20``). Lower-case escapes such as
# ``%2f`` are deliberately NOT matched by this pattern.
ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")

53 

54 

class ButlerURI:
    """Convenience wrapper around URI parsers.

    Provides access to URI components and can convert file
    paths into absolute path URIs. Scheme-less URIs are treated as if
    they are local file system paths and are converted to absolute URIs.

    A specialist subclass is created for each supported URI scheme.

    Parameters
    ----------
    uri : `str`, `urllib.parse.ParseResult` or `ButlerURI`
        URI in string form. Can be scheme-less if referring to a local
        filesystem path. If an existing `ButlerURI` is given its parsed
        components, ``dirLike`` flag and class are reused as-is and the
        remaining parameters are not consulted.
    root : `str` or `ButlerURI`, optional
        When fixing up a relative path in a ``file`` scheme or if scheme-less,
        use this as the root. Must be absolute.  If `None` the current
        working directory will be used. Can be a file URI.
    forceAbsolute : `bool`, optional
        If `True`, scheme-less relative URI will be converted to an absolute
        path using a ``file`` scheme. If `False` scheme-less URI will remain
        scheme-less and will not be updated to ``file`` or absolute path.
    forceDirectory: `bool`, optional
        If `True` forces the URI to end with a separator, otherwise given URI
        is interpreted as is.

    Raises
    ------
    ValueError
        Raised if ``uri`` is not a `str`, `ButlerURI` or
        `urllib.parse.ParseResult`.
    NotImplementedError
        Raised if the URI scheme has no associated subclass.
    """

    _pathLib: Type[PurePath] = PurePosixPath
    """Path library to use for this scheme."""

    _pathModule = posixpath
    """Path module to use for this scheme."""

    transferModes: Tuple[str, ...] = ("copy", "auto", "move")
    """Transfer modes supported by this implementation.

    Move is special in that it is generally a copy followed by an unlink.
    Whether that unlink works depends critically on whether the source URI
    implements unlink. If it does not the move will be reported as a failure.
    """

    transferDefault: str = "copy"
    """Default mode to use for transferring if ``auto`` is specified."""

    quotePaths = True
    """True if path-like elements modifying a URI should be quoted.

    All non-schemeless URIs have to internally use quoted paths. Therefore
    if a new file name is given (e.g. to updateFile or join) a decision must
    be made whether to quote it to be consistent.
    """

    # This is not an ABC with abstract methods because the __new__ being
    # a factory confuses mypy such that it assumes that every constructor
    # returns a ButlerURI and then determines that all the abstract methods
    # are still abstract. If they are not marked abstract but just raise
    # mypy is fine with it.

    # mypy is confused without this
    _uri: urllib.parse.ParseResult

    def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI],
                root: Optional[Union[str, ButlerURI]] = None, forceAbsolute: bool = True,
                forceDirectory: bool = False) -> ButlerURI:
        """Factory: parse ``uri`` and return an instance of the subclass
        matching its scheme.
        """
        parsed: urllib.parse.ParseResult
        dirLike: bool
        subclass: Optional[Type] = None

        # Record if we need to post process the URI components
        # or if the instance is already fully configured
        if isinstance(uri, str):
            # Since local file names can have special characters in them
            # we need to quote them for the parser but we can unquote
            # later. Assume that all other URI schemes are quoted.
            # Since sometimes people write file:/a/b and not file:///a/b
            # we should not quote in the explicit case of file:
            if "://" not in uri and not uri.startswith("file:"):
                if ESCAPES_RE.search(uri):
                    # Looks quoted already; quoting again would turn
                    # "%20" into "%2520" so leave it alone and warn.
                    log.warning("Possible double encoding of %s", uri)
                else:
                    uri = urllib.parse.quote(uri)
            parsed = urllib.parse.urlparse(uri)
        elif isinstance(uri, urllib.parse.ParseResult):
            parsed = copy.copy(uri)
        elif isinstance(uri, ButlerURI):
            parsed = copy.copy(uri._uri)
            dirLike = uri.dirLike
            # No further parsing required and we know the subclass
            subclass = type(uri)
        else:
            raise ValueError(f"Supplied URI must be string, ButlerURI, or ParseResult but got '{uri!r}'")

        if subclass is None:
            # Work out the subclass from the URI scheme. The scheme modules
            # are imported on demand inside each branch.
            if not parsed.scheme:
                from .schemeless import ButlerSchemelessURI
                subclass = ButlerSchemelessURI
            elif parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI
            elif parsed.scheme == "s3":
                from .s3 import ButlerS3URI
                subclass = ButlerS3URI
            elif parsed.scheme.startswith("http"):
                from .http import ButlerHttpURI
                subclass = ButlerHttpURI
            elif parsed.scheme == "resource":
                # Rules for scheme names disallow pkg_resource
                from .packageresource import ButlerPackageResourceURI
                subclass = ButlerPackageResourceURI
            elif parsed.scheme == "mem":
                # in-memory datastore object
                from .mem import ButlerInMemoryURI
                subclass = ButlerInMemoryURI
            else:
                # Both literals need the f prefix; adjacent string literals
                # are formatted independently before being concatenated.
                raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'"
                                          f" in {parsed.geturl()}")

            parsed, dirLike = subclass._fixupPathUri(parsed, root=root,
                                                     forceAbsolute=forceAbsolute,
                                                     forceDirectory=forceDirectory)

            # It is possible for the class to change from schemeless
            # to file so handle that
            if parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI

        # Now create an instance of the correct subclass and set the
        # attributes directly
        self = object.__new__(subclass)
        self._uri = parsed
        self.dirLike = dirLike
        return self

    @property
    def scheme(self) -> str:
        """The URI scheme (``://`` is not part of the scheme)."""
        return self._uri.scheme

    @property
    def netloc(self) -> str:
        """The URI network location."""
        return self._uri.netloc

    @property
    def path(self) -> str:
        """The path component of the URI."""
        return self._uri.path

    @property
    def unquoted_path(self) -> str:
        """The path component of the URI with any URI quoting reversed."""
        return urllib.parse.unquote(self._uri.path)

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS.

        This base implementation always raises `AttributeError`.
        """
        raise AttributeError(f"Non-file URI ({self}) has no local OS path.")

    @property
    def relativeToPathRoot(self) -> str:
        """Returns path relative to network location.

        Effectively, this is the path property with posix separator stripped
        from the left hand side of the path.

        Always unquotes. Dir-like URIs always end with ``/``.
        """
        p = self._pathLib(self.path)
        relToRoot = str(p.relative_to(p.root))
        # The path library drops any trailing separator so restore it
        if self.dirLike and not relToRoot.endswith("/"):
            relToRoot += "/"
        return urllib.parse.unquote(relToRoot)

    @property
    def is_root(self) -> bool:
        """`True` if this URI points to the root of the network location.

        This means that the path components refers to the top level.
        """
        # A dir-like URI whose path is just the root is reported as "./"
        return self.relativeToPathRoot == "./"

    @property
    def fragment(self) -> str:
        """The fragment component of the URI."""
        return self._uri.fragment

    @property
    def params(self) -> str:
        """Any parameters included in the URI."""
        return self._uri.params

    @property
    def query(self) -> str:
        """Any query strings included in the URI."""
        return self._uri.query

    def geturl(self) -> str:
        """Return the URI in string form.

        Returns
        -------
        url : `str`
            String form of URI.
        """
        return self._uri.geturl()

    def split(self) -> Tuple[ButlerURI, str]:
        """Splits URI into head and tail. Equivalent to os.path.split where
        head preserves the URI components.

        Returns
        -------
        head: `ButlerURI`
            Everything leading up to tail, expanded and normalized as per
            ButlerURI rules.
        tail : `str`
            Last `self.path` component. Tail will be empty if path ends on a
            separator. Tail will never contain separators. It will be
            unquoted.
        """
        head, tail = self._pathModule.split(self.path)
        headuri = self._uri._replace(path=head)

        # The file part should never include quoted metacharacters
        tail = urllib.parse.unquote(tail)

        # Schemeless is special in that it can be a relative path
        # We need to ensure that it stays that way. All other URIs will
        # be absolute already.
        forceAbsolute = self._pathModule.isabs(self.path)
        return ButlerURI(headuri, forceDirectory=True, forceAbsolute=forceAbsolute), tail

    def basename(self) -> str:
        """Returns the base name, last element of path, of the URI. If URI ends
        on a slash returns an empty string. This is the second element returned
        by split().

        Equivalent of os.path.basename().

        Returns
        -------
        tail : `str`
            Last part of the path attribute. Tail will be empty if path ends
            on a separator.
        """
        return self.split()[1]

    def dirname(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute.

        Equivalent of os.path.dirname()

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        return self.split()[0]

    def parent(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute, minus the last one.

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        # When self is file-like, return self.dirname()
        if not self.dirLike:
            return self.dirname()
        # When self is dir-like, return its parent directory,
        # regardless of the presence of a trailing separator
        originalPath = self._pathLib(self.path)
        parentPath = originalPath.parent
        parentURI = self._uri._replace(path=str(parentPath))

        return ButlerURI(parentURI, forceDirectory=True)

    def replace(self, **kwargs: Any) -> ButlerURI:
        """Replace components in a URI with new values and return a new
        instance.

        Parameters
        ----------
        **kwargs
            Components to replace, forwarded unchanged to the ``_replace``
            method of the underlying `urllib.parse.ParseResult`.

        Returns
        -------
        new : `ButlerURI`
            New `ButlerURI` object with updated values.
        """
        return self.__class__(self._uri._replace(**kwargs))

    def updateFile(self, newfile: str) -> None:
        """Update in place the final component of the path with the supplied
        file name.

        Parameters
        ----------
        newfile : `str`
            File name with no path component.

        Notes
        -----
        Updates the URI in place.
        Updates the ButlerURI.dirLike attribute. The new file path will
        be quoted if necessary.
        """
        if self.quotePaths:
            newfile = urllib.parse.quote(newfile)
        directory, _ = self._pathModule.split(self.path)
        newpath = self._pathModule.join(directory, newfile)

        # The URI now refers to a file regardless of what it was before
        self.dirLike = False
        self._uri = self._uri._replace(path=newpath)

    def getExtension(self) -> str:
        """Return the file extension(s) associated with this URI path.

        Returns
        -------
        ext : `str`
            The file extension (including the ``.``). Can be empty string
            if there is no file extension. Usually returns only the last
            file extension unless there is a special extension modifier
            indicating file compression, in which case the combined
            extension (e.g. ``.fits.gz``) will be returned.
        """
        special = {".gz", ".bz2", ".xz", ".fz"}

        extensions = self._pathLib(self.path).suffixes

        if not extensions:
            return ""

        ext = extensions.pop()

        # Multiple extensions, decide whether to include the final two
        if extensions and ext in special:
            ext = f"{extensions[-1]}{ext}"

        return ext

    def join(self, path: str) -> ButlerURI:
        """Create a new `ButlerURI` with additional path components including
        a file.

        Parameters
        ----------
        path : `str`
            Additional file components to append to the current URI. Assumed
            to include a file at the end. Will be quoted depending on the
            associated URI scheme. If it ends with a separator the result
            is treated as a directory.

        Returns
        -------
        new : `ButlerURI`
            New URI with any file at the end replaced with the new path
            components.

        Notes
        -----
        Schemeless URIs assume local path separator but all other URIs assume
        POSIX separator if the supplied path has directory structure. It
        may be this never becomes a problem but datastore templates assume
        POSIX separator is being used.
        """
        new = self.dirname()  # By definition a directory URI

        # new should be asked about quoting, not self, since dirname can
        # change the URI scheme for schemeless -> file
        if new.quotePaths:
            path = urllib.parse.quote(path)

        sep = self._pathModule.sep
        newpath = self._pathModule.normpath(self._pathModule.join(new.path, path))

        if path.endswith(sep):
            # normpath strips a trailing separator. Put it back so that the
            # path stays consistent with the dirLike flag, which remains True.
            if not newpath.endswith(sep):
                newpath += sep
        else:
            # Declare the new URI not be dirLike unless path ended in /
            new.dirLike = False

        new._uri = new._uri._replace(path=newpath)
        return new

    def relative_to(self, other: ButlerURI) -> Optional[str]:
        """Return the relative path from this URI to the other URI.

        Parameters
        ----------
        other : `ButlerURI`
            URI to use to calculate the relative path. Must be a parent
            of this URI.

        Returns
        -------
        subpath : `str`
            The sub path of this URI relative to the supplied other URI.
            Returns `None` if there is no parent child relationship.
            Scheme and netloc must match. The result is unquoted.
        """
        if self.scheme != other.scheme or self.netloc != other.netloc:
            return None

        # relativeToPathRoot has already unquoted both paths. Unquoting the
        # result a second time would corrupt names that contain a literal
        # percent escape (e.g. a file really called "a%41.txt").
        enclosed_path = self._pathLib(self.relativeToPathRoot)
        parent_path = other.relativeToPathRoot
        try:
            return str(enclosed_path.relative_to(parent_path))
        except ValueError:
            # Not located below the other URI
            return None

    def exists(self) -> bool:
        """Indicate that the resource is available.

        Returns
        -------
        exists : `bool`
            `True` if the resource exists.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def isabs(self) -> bool:
        """Indicate that the resource is fully specified.

        For non-schemeless URIs this is always true.

        Returns
        -------
        isabs : `bool`
            `True` in all cases except schemeless URI.
        """
        return True

    def as_local(self) -> Tuple[str, bool]:
        """Return the location of the (possibly remote) resource in the
        local file system.

        Returns
        -------
        path : `str`
            If this is a remote resource, it will be a copy of the resource
            on the local file system, probably in a temporary directory.
            For a local resource this should be the actual path to the
            resource.
        is_temporary : `bool`
            Indicates if the local path is a temporary file or not.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def size(self) -> int:
        """For non-dir-like URI, return the size of the resource.

        Returns
        -------
        sz : `int`
            The size in bytes of the resource associated with this URI.
            Returns 0 if dir-like.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def __str__(self) -> str:
        return self.geturl()

    def __repr__(self) -> str:
        return f'ButlerURI("{self.geturl()}")'

    def __eq__(self, other: Any) -> bool:
        # Two URIs are equal when their string forms are identical
        if not isinstance(other, ButlerURI):
            return False
        return self.geturl() == other.geturl()

    def __copy__(self) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return type(self)(str(self))

    def __deepcopy__(self, memo: Any) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return self.__copy__()

    def __getnewargs__(self) -> Tuple:
        # Arguments handed to __new__ when unpickling
        return (str(self),)

    @staticmethod
    def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[Union[str, ButlerURI]] = None,
                      forceAbsolute: bool = False,
                      forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
        """Correct any issues with the supplied URI.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `str` or `ButlerURI`, ignored
            Not used by this implementation since all URIs are
            absolute except for those representing the local file system.
        forceAbsolute : `bool`, ignored.
            Not used by this implementation. URIs are generally always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is. Specifying that the URI is conceptually
            equivalent to a directory can break some ambiguities when
            interpreting the last element of a path.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator or
            forceDirectory is True. Otherwise `False`.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used.  This is
        always done regardless of the ``forceAbsolute`` parameter.

        AWS S3 differentiates between keys with trailing POSIX separators (i.e
        `/dir` and `/dir/`) whereas POSIX does not necessarily.

        Scheme-less paths are normalized.
        """
        # assume we are not dealing with a directory like URI
        dirLike = False

        # URI is dir-like if explicitly stated or if it ends on a separator
        endsOnSep = parsed.path.endswith(posixpath.sep)
        if forceDirectory or endsOnSep:
            dirLike = True
            # only add the separator if it's not already there
            if not endsOnSep:
                parsed = parsed._replace(path=parsed.path+posixpath.sep)

        return parsed, dirLike

    def transfer_from(self, src: ButlerURI, transfer: str,
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the resource at ``src`` to the location of this URI.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Generically there are
            many standard options: copy, link, symlink, hardlink, relsymlink.
            Not all URIs support all modes.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            A transaction object that can (depending on implementation)
            rollback transfers on error.  Not guaranteed to be implemented.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.

        Notes
        -----
        Conceptually this is hard to scale as the number of URI schemes
        grow.  The destination URI is more important than the source URI
        since that is where all the transfer modes are relevant (with the
        complication that "move" deletes the source).

        Local file to local file is the fundamental use case but every
        other scheme has to support "copy" to local file (with implicit
        support for "move") and copy from local file.
        All the "link" options tend to be specific to local file systems.

        "move" is a "copy" where the remote resource is deleted at the end.
        Whether this works depends on the source URI rather than the
        destination URI.  Reverting a move on transaction rollback is
        expected to be problematic if a remote resource was involved.
        """
        raise NotImplementedError(f"No transfer modes supported by URI scheme {self.scheme}")