Coverage for python/lsst/daf/butler/core/_butlerUri/_butlerUri.py: 48%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

317 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import contextlib 

25import concurrent.futures 

26import urllib.parse 

27import posixpath 

28import copy 

29import logging 

30import re 

31import shutil 

32import tempfile 

33import os 

34 

35from random import Random 

36from pathlib import Path, PurePath, PurePosixPath 

37 

38__all__ = ('ButlerURI',) 

39 

40from typing import ( 

41 TYPE_CHECKING, 

42 Any, 

43 Iterable, 

44 Iterator, 

45 List, 

46 Dict, 

47 Optional, 

48 Tuple, 

49 Type, 

50 Union, 

51) 

52 

53from .utils import NoTransaction 

54 

55if TYPE_CHECKING: 55 ↛ 56line 55 didn't jump to line 56, because the condition on line 55 was never true

56 from ..datastore import DatastoreTransaction 

57 

58 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Regex for looking for URI escapes. Note that only upper-case hex digits
# are matched, so a lower-case escape such as ``%2f`` is not detected.
ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")

# Precomputed escaped hash, i.e. the percent-encoded form of "#".
ESCAPED_HASH = urllib.parse.quote("#")

# Maximum number of worker threads for parallelized operations.
# If greater than 10, be aware that this number has to be consistent
# with connection pool sizing (for example in urllib3).
MAX_WORKERS = 10

71 

72 

73class ButlerURI: 

74 """Convenience wrapper around URI parsers. 

75 

76 Provides access to URI components and can convert file 

77 paths into absolute path URIs. Scheme-less URIs are treated as if 

78 they are local file system paths and are converted to absolute URIs. 

79 

80 A specialist subclass is created for each supported URI scheme. 

81 

82 Parameters 

83 ---------- 

84 uri : `str` or `urllib.parse.ParseResult` 

85 URI in string form. Can be scheme-less if referring to a local 

86 filesystem path. 

87 root : `str` or `ButlerURI`, optional 

88 When fixing up a relative path in a ``file`` scheme or if scheme-less, 

89 use this as the root. Must be absolute. If `None` the current 

90 working directory will be used. Can be a file URI. 

91 forceAbsolute : `bool`, optional 

92 If `True`, scheme-less relative URI will be converted to an absolute 

93 path using a ``file`` scheme. If `False` scheme-less URI will remain 

94 scheme-less and will not be updated to ``file`` or absolute path. 

95 forceDirectory: `bool`, optional 

96 If `True` forces the URI to end with a separator, otherwise given URI 

97 is interpreted as is. 

98 isTemporary : `bool`, optional 

99 If `True` indicates that this URI points to a temporary resource. 

100 """ 

101 

102 _pathLib: Type[PurePath] = PurePosixPath 

103 """Path library to use for this scheme.""" 

104 

105 _pathModule = posixpath 

106 """Path module to use for this scheme.""" 

107 

108 transferModes: Tuple[str, ...] = ("copy", "auto", "move") 

109 """Transfer modes supported by this implementation. 

110 

111 Move is special in that it is generally a copy followed by an unlink. 

112 Whether that unlink works depends critically on whether the source URI 

113 implements unlink. If it does not the move will be reported as a failure. 

114 """ 

115 

116 transferDefault: str = "copy" 

117 """Default mode to use for transferring if ``auto`` is specified.""" 

118 

119 quotePaths = True 

120 """True if path-like elements modifying a URI should be quoted. 

121 

122 All non-schemeless URIs have to internally use quoted paths. Therefore 

123 if a new file name is given (e.g. to updatedFile or join) a decision must 

124 be made whether to quote it to be consistent. 

125 """ 

126 

127 isLocal = False 

128 """If `True` this URI refers to a local file.""" 

129 

130 # This is not an ABC with abstract methods because the __new__ being 

131 # a factory confuses mypy such that it assumes that every constructor 

132 # returns a ButlerURI and then determines that all the abstract methods 

133 # are still abstract. If they are not marked abstract but just raise 

134 # mypy is fine with it. 

135 

136 # mypy is confused without these 

137 _uri: urllib.parse.ParseResult 

138 isTemporary: bool 

139 dirLike: bool 

140 

def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI, Path],
            root: Optional[Union[str, ButlerURI]] = None, forceAbsolute: bool = True,
            forceDirectory: bool = False, isTemporary: bool = False) -> ButlerURI:
    """Create and return new specialist ButlerURI subclass.

    Parameters
    ----------
    uri : `str`, `os.PathLike`, `urllib.parse.ParseResult` or `ButlerURI`
        The URI to wrap. A `ButlerURI` is returned unchanged; in that
        case the other arguments are ignored.
    root : `str` or `ButlerURI`, optional
        Passed through to the subclass ``_fixupPathUri``.
    forceAbsolute : `bool`, optional
        Passed through to the subclass ``_fixupPathUri``.
    forceDirectory : `bool`, optional
        Passed through to ``_fixDirectorySep`` or ``_fixupPathUri``.
    isTemporary : `bool`, optional
        Stored on the new instance as ``isTemporary``.

    Returns
    -------
    new : `ButlerURI`
        Instance of the subclass matching the URI scheme.

    Raises
    ------
    ValueError
        Raised if ``uri`` is of an unsupported type.
    NotImplementedError
        Raised if there is no subclass for the URI scheme.
    """
    parsed: urllib.parse.ParseResult
    dirLike: bool = False
    subclass: Optional[Type[ButlerURI]] = None

    if isinstance(uri, os.PathLike):
        uri = str(uri)

    # Record if we need to post process the URI components
    # or if the instance is already fully configured
    if isinstance(uri, str):
        # Since local file names can have special characters in them
        # we need to quote them for the parser but we can unquote
        # later. Assume that all other URI schemes are quoted.
        # Since sometimes people write file:/a/b and not file:///a/b
        # we should not quote in the explicit case of file:
        if "://" not in uri and not uri.startswith("file:"):
            if ESCAPES_RE.search(uri):
                log.warning("Possible double encoding of %s", uri)
            else:
                uri = urllib.parse.quote(uri)
                # Special case hash since we must support fragments
                # even in schemeless URIs -- although try to only replace
                # them in file part and not directory part
                if ESCAPED_HASH in uri:
                    dirpos = uri.rfind("/")
                    # Do replacement after this /
                    uri = uri[:dirpos+1] + uri[dirpos+1:].replace(ESCAPED_HASH, "#")

        parsed = urllib.parse.urlparse(uri)
    elif isinstance(uri, urllib.parse.ParseResult):
        parsed = copy.copy(uri)
        # If we are being instantiated with a subclass, rather than
        # ButlerURI, ensure that that subclass is used directly.
        # This could lead to inconsistencies if this constructor
        # is used externally outside of the ButlerURI.replace() method.
        #   ButlerS3URI(urllib.parse.urlparse("file://a/b.txt"))
        # will be a problem.
        # This is needed to prevent a schemeless absolute URI become
        # a file URI unexpectedly when calling updatedFile or
        # updatedExtension
        if cls is not ButlerURI:
            parsed, dirLike = cls._fixDirectorySep(parsed, forceDirectory)
            subclass = cls

    elif isinstance(uri, ButlerURI):
        # Since ButlerURI is immutable we can return the argument
        # unchanged.
        return uri
    else:
        raise ValueError("Supplied URI must be string, Path, "
                         f"ButlerURI, or ParseResult but got '{uri!r}'")

    if subclass is None:
        # Work out the subclass from the URI scheme
        if not parsed.scheme:
            from .schemeless import ButlerSchemelessURI
            subclass = ButlerSchemelessURI
        elif parsed.scheme == "file":
            from .file import ButlerFileURI
            subclass = ButlerFileURI
        elif parsed.scheme == "s3":
            from .s3 import ButlerS3URI
            subclass = ButlerS3URI
        elif parsed.scheme.startswith("http"):
            from .http import ButlerHttpURI
            subclass = ButlerHttpURI
        elif parsed.scheme == "resource":
            # Rules for scheme names disallow pkg_resource
            from .packageresource import ButlerPackageResourceURI
            subclass = ButlerPackageResourceURI
        elif parsed.scheme == "mem":
            # in-memory datastore object
            from .mem import ButlerInMemoryURI
            subclass = ButlerInMemoryURI
        else:
            # Both fragments must be f-strings, otherwise the URL
            # placeholder is emitted literally in the message.
            raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'"
                                      f" in {parsed.geturl()}")

        parsed, dirLike = subclass._fixupPathUri(parsed, root=root,
                                                 forceAbsolute=forceAbsolute,
                                                 forceDirectory=forceDirectory)

        # It is possible for the class to change from schemeless
        # to file so handle that
        if parsed.scheme == "file":
            from .file import ButlerFileURI
            subclass = ButlerFileURI

    # Now create an instance of the correct subclass and set the
    # attributes directly
    self = object.__new__(subclass)
    self._uri = parsed
    self.dirLike = dirLike
    self.isTemporary = isTemporary
    return self

240 

@property
def scheme(self) -> str:
    """Return the scheme of the wrapped URI.

    Notes
    -----
    The ``://`` separator is not part of the scheme.
    """
    parsed = self._uri
    return parsed.scheme

250 

@property
def netloc(self) -> str:
    """Return the network location component of the wrapped URI."""
    parsed = self._uri
    return parsed.netloc

255 

@property
def path(self) -> str:
    """Return the path component of the wrapped URI, exactly as stored."""
    parsed = self._uri
    return parsed.path

260 

@property
def unquoted_path(self) -> str:
    """Return the URI path with percent-escapes decoded."""
    quoted = self._uri.path
    return urllib.parse.unquote(quoted)

265 

@property
def ospath(self) -> str:
    """Return the path component of the URI localized to current OS.

    Raises
    ------
    AttributeError
        Always raised by this base implementation.
    """
    message = f"Non-file URI ({self}) has no local OS path."
    raise AttributeError(message)

270 

@property
def relativeToPathRoot(self) -> str:
    """Return the path relative to the root of the path.

    The path is parsed with ``_pathLib`` and made relative to its own
    root. A trailing ``/`` is appended for dir-like URIs if it is not
    already present.

    The returned value is always unquoted.
    """
    pure = self._pathLib(self.path)
    relative = str(pure.relative_to(pure.root))
    if self.dirLike and not relative.endswith("/"):
        relative = f"{relative}/"
    return urllib.parse.unquote(relative)

285 

@property
def is_root(self) -> bool:
    """Return whether this URI points to the root of the network location.

    This is the case when ``relativeToPathRoot`` is exactly ``"./"``.
    """
    return self.relativeToPathRoot == "./"

296 

@property
def fragment(self) -> str:
    """Return the fragment component of the wrapped URI."""
    parsed = self._uri
    return parsed.fragment

301 

@property
def params(self) -> str:
    """Return the parameters component of the wrapped URI."""
    parsed = self._uri
    return parsed.params

306 

@property
def query(self) -> str:
    """Return the query string component of the wrapped URI."""
    parsed = self._uri
    return parsed.query

311 

def geturl(self) -> str:
    """Return the URI in string form.

    Returns
    -------
    url : `str`
        Result of ``geturl()`` on the wrapped parse result.
    """
    parsed = self._uri
    return parsed.geturl()

321 

def root_uri(self) -> ButlerURI:
    """Return the base root URI.

    Returns
    -------
    uri : `ButlerURI`
        This URI with the path replaced by an empty string and forced
        to be dir-like.
    """
    root = self.replace(forceDirectory=True, path="")
    return root

331 

def split(self) -> Tuple[ButlerURI, str]:
    """Split URI into head and tail.

    Returns
    -------
    head: `ButlerURI`
        Everything leading up to tail, expanded and normalized as per
        ButlerURI rules.
    tail : `str`
        Last `self.path` component. Tail will be empty if path ends on a
        separator. Tail will never contain separators. It will be
        unquoted.

    Notes
    -----
    Equivalent to `os.path.split()` where head preserves the URI
    components.
    """
    pathmod = self._pathModule
    parent_path, leaf = pathmod.split(self.path)

    # Schemeless is special in that it can be a relative path
    # and must stay that way; only force absolute when the path
    # already is absolute.
    keep_absolute = pathmod.isabs(self.path)

    head_uri = ButlerURI(self._uri._replace(path=parent_path), forceDirectory=True,
                         forceAbsolute=keep_absolute)

    # The file part should never include quoted metacharacters
    return head_uri, urllib.parse.unquote(leaf)

361 

def basename(self) -> str:
    """Return the base name, last element of path, of the URI.

    Returns
    -------
    tail : `str`
        Last part of the path attribute. Tail will be empty if path ends
        on a separator.

    Notes
    -----
    If URI ends on a slash returns an empty string. This is the second
    element returned by `split()`.

    Equivalent of `os.path.basename()`.
    """
    pieces = self.split()
    return pieces[1]

379 

def dirname(self) -> ButlerURI:
    """Return the directory component of the path as a new `ButlerURI`.

    Returns
    -------
    head : `ButlerURI`
        First element returned by `split()`: everything except the tail
        of the path attribute.

    Notes
    -----
    Equivalent of `os.path.dirname()`.
    """
    pieces = self.split()
    return pieces[0]

394 

def parent(self) -> ButlerURI:
    """Return a `ButlerURI` of the parent directory.

    Returns
    -------
    head : `ButlerURI`
        For a file-like URI, the result of `dirname()`. For a dir-like
        URI, a dir-like URI for the parent of the path.

    Notes
    -----
    For a file-like URI this will be the same as calling `dirname()`.
    """
    if self.dirLike:
        # Dir-like: go one level up regardless of any trailing separator.
        parent_path = self._pathLib(self.path).parent
        return self.replace(path=str(parent_path), forceDirectory=True)
    # File-like: the enclosing directory is the parent.
    return self.dirname()

416 

def replace(self, forceDirectory: bool = False, isTemporary: bool = False, **kwargs: Any) -> ButlerURI:
    """Return new `ButlerURI` with specified components replaced.

    Parameters
    ----------
    forceDirectory : `bool`, optional
        Forwarded to the constructor to force the new URI to be
        dir-like.
    isTemporary : `bool`, optional
        Forwarded to the constructor to mark the new URI as a temporary
        resource.
    **kwargs
        Components of a `urllib.parse.ParseResult` that should be
        modified for the newly-created `ButlerURI`.

    Returns
    -------
    new : `ButlerURI`
        New object of the same class as ``self`` with updated values.

    Raises
    ------
    ValueError
        Raised if a change of ``scheme`` is requested.

    Notes
    -----
    Does not, for now, allow a change in URI scheme.
    """
    # Disallow a change in scheme
    if "scheme" in kwargs:
        raise ValueError(f"Can not use replace() method to change URI scheme for {self}")
    modified = self._uri._replace(**kwargs)
    factory = self.__class__
    return factory(modified, forceDirectory=forceDirectory, isTemporary=isTemporary)

445 

def updatedFile(self, newfile: str) -> ButlerURI:
    """Return new URI with an updated final component of the path.

    Parameters
    ----------
    newfile : `str`
        File name with no path component.

    Returns
    -------
    updated : `ButlerURI`

    Notes
    -----
    Forces the ButlerURI.dirLike attribute to be false. The new file path
    will be quoted if ``quotePaths`` is set.
    """
    pathmod = self._pathModule
    filename = urllib.parse.quote(newfile) if self.quotePaths else newfile
    directory = pathmod.split(self.path)[0]

    updated = self.replace(path=pathmod.join(directory, filename))
    # The result names a file, whatever the original was.
    updated.dirLike = False
    return updated

471 

def updatedExtension(self, ext: Optional[str]) -> ButlerURI:
    """Return a new `ButlerURI` with updated file extension.

    The extension reported by `getExtension()` is replaced as a whole.

    Parameters
    ----------
    ext : `str` or `None`
        New extension. If an empty string is given any extension will
        be removed. If `None` is given there will be no change.

    Returns
    -------
    updated : `ButlerURI`
        URI with the specified extension. Returns itself if ``ext`` is
        `None` or already matches the current extension.
    """
    if ext is None:
        return self

    old_ext = self.getExtension()
    if old_ext == ext:
        # Nothing to do if the extension already matches
        return self

    # Strip the current extension; .fits.gz counts as one extension so
    # os.path.splitext is deliberately not used.
    stem = self.path[:-len(old_ext)] if old_ext else self.path

    # Ensure a leading "." on a non-empty extension.
    new_ext = "." + ext if ext and not ext.startswith(".") else ext

    return self.replace(path=stem + new_ext)

511 

def getExtension(self) -> str:
    """Return the file extension(s) associated with this URI path.

    Returns
    -------
    ext : `str`
        The file extension (including the ``.``). Can be empty string
        if there is no file extension. Only the last extension is
        returned unless it is one of the compression-style suffixes
        listed below and is preceded by another extension, in which
        case both are combined (e.g. ``.fits.gz``).
    """
    special = {".gz", ".bz2", ".xz", ".fz"}

    # Only look at the file part so that "." in directory names is ignored.
    suffixes = self._pathLib(self.basename()).suffixes
    if not suffixes:
        return ""

    *leading, last = suffixes
    if leading and last in special:
        # Combine the modifier with the extension before it.
        return f"{leading[-1]}{last}"
    return last

541 

def join(self, path: Union[str, ButlerURI], isTemporary: bool = False) -> ButlerURI:
    """Return new `ButlerURI` with additional path components.

    Parameters
    ----------
    path : `str`, `ButlerURI`
        Additional file components to append to the current URI. Assumed
        to include a file at the end. Will be quoted depending on the
        associated URI scheme. If the path looks like a URI with a scheme
        referring to an absolute location, it will be returned
        directly (matching the behavior of `os.path.join()`). It can
        also be a `ButlerURI`.
    isTemporary : `bool`, optional
        Indicate that the resulting URI represents a temporary resource.

    Returns
    -------
    new : `ButlerURI`
        New URI with any file at the end replaced with the new path
        components.

    Notes
    -----
    Schemeless URIs assume local path separator but all other URIs assume
    POSIX separator if the supplied path has directory structure. It
    may be this never becomes a problem but datastore templates assume
    POSIX separator is being used.

    If an absolute `ButlerURI` is given for ``path`` it is assumed that
    this should be returned directly. Giving a ``path`` of an absolute
    scheme-less URI is not allowed for safety reasons as it may indicate
    a mistake in the calling code.

    Raises
    ------
    ValueError
        Raised if the ``path`` is an absolute scheme-less URI. In that
        situation it is unclear whether the intent is to return a
        ``file`` URI or it was a mistake and a relative scheme-less URI
        was meant.
    """
    # Parse without forcing to absolute so that the expected case of a
    # relative path can be told apart from a full URI.
    candidate = ButlerURI(path, forceAbsolute=False)

    # A scheme means an explicit URI: use it directly.
    if candidate.scheme:
        return candidate

    # No scheme but absolute: refuse rather than guess the intent.
    if candidate.isabs():
        raise ValueError(f"Can not join absolute scheme-less {candidate!r} to another URI.")

    # A ButlerURI argument contributes its unquoted path; a plain string
    # is used as given so that "#" may appear in the file name.
    if not isinstance(path, str):
        path = candidate.unquoted_path

    base = self.dirname()  # By definition a directory URI

    # Ask the directory URI about quoting, not self, since dirname can
    # change the URI scheme for schemeless -> file
    if base.quotePaths:
        path = urllib.parse.quote(path)

    pathmod = self._pathModule
    joined = pathmod.normpath(pathmod.join(base.path, path))

    # normpath can strip trailing / so we force directory if the supplied
    # path ended with a /
    wants_directory = path.endswith(pathmod.sep)
    return base.replace(path=joined, forceDirectory=wants_directory, isTemporary=isTemporary)

615 

def relative_to(self, other: ButlerURI) -> Optional[str]:
    """Return the relative path from this URI to the other URI.

    Parameters
    ----------
    other : `ButlerURI`
        URI to use to calculate the relative path. Must be a parent
        of this URI.

    Returns
    -------
    subpath : `str`
        The sub path of this URI relative to the supplied other URI.
        Returns `None` if there is no parent child relationship.
        Scheme and netloc must match.
    """
    # Scheme-less absolute other is treated as if it's a file scheme.
    # Scheme-less relative other can only return non-None if self
    # is also scheme-less relative and that is handled specifically
    # in a subclass.
    if not other.scheme and other.isabs():
        other = other.abspath()

    # Scheme-less self is handled elsewhere.
    same_location = self.scheme == other.scheme and self.netloc == other.netloc
    if not same_location:
        return None

    child = self._pathLib(self.relativeToPathRoot)
    parent_path = other.relativeToPathRoot
    try:
        relative = str(child.relative_to(parent_path))
    except ValueError:
        # Not located underneath ``other``.
        return None
    return urllib.parse.unquote(relative)

653 

def exists(self) -> bool:
    """Indicate that the resource is available.

    Returns
    -------
    exists : `bool`
        `True` if the resource exists.

    Raises
    ------
    NotImplementedError
        Always raised by this base implementation.
    """
    raise NotImplementedError()

663 

@classmethod
def mexists(cls, uris: Iterable[ButlerURI]) -> Dict[ButlerURI, bool]:
    """Check for existence of multiple URIs at once.

    Each URI's ``exists()`` method is run in a pool of at most
    ``MAX_WORKERS`` threads.

    Parameters
    ----------
    uris : iterable of `ButlerURI`
        The URIs to test.

    Returns
    -------
    existence : `dict` of [`ButlerURI`, `bool`]
        Mapping of original URI to boolean indicating existence. A URI
        whose ``exists()`` call raised an exception is reported as
        `False`.
    """
    results: Dict[ButlerURI, bool] = {}
    # Use the executor as a context manager so that its worker threads
    # are shut down once all checks have completed, rather than being
    # left alive after every call.
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as exists_executor:
        future_exists = {exists_executor.submit(uri.exists): uri for uri in uris}

        for future in concurrent.futures.as_completed(future_exists):
            uri = future_exists[future]
            try:
                exists = future.result()
            except Exception:
                # Deliberate best effort: a failed check counts as
                # "does not exist".
                exists = False
            results[uri] = exists
    return results

690 

def remove(self) -> None:
    """Remove the resource.

    Raises
    ------
    NotImplementedError
        Always raised by this base implementation.
    """
    raise NotImplementedError()

694 

def isabs(self) -> bool:
    """Indicate that the resource is fully specified.

    Returns
    -------
    isabs : `bool`
        This base implementation always returns `True`. The original
        documentation states that schemeless URIs are the exception.
    """
    fully_specified = True
    return fully_specified

706 

def abspath(self) -> ButlerURI:
    """Return URI using an absolute path.

    Returns
    -------
    abs : `ButlerURI`
        Absolute URI. This base implementation returns the object
        itself. The original documentation states that schemeless URIs
        are upgraded to file URIs.
    """
    absolute = self
    return absolute

717 

def _as_local(self) -> Tuple[str, bool]:
    """Return the location of the (possibly remote) resource as local file.

    This is a helper function for the `as_local` context manager.

    Returns
    -------
    path : `str`
        If this is a remote resource, it will be a copy of the resource
        on the local file system, probably in a temporary directory.
        For a local resource this should be the actual path to the
        resource.
    is_temporary : `bool`
        Indicates if the local path is a temporary file or not.

    Raises
    ------
    NotImplementedError
        Always raised by this base implementation.
    """
    raise NotImplementedError()

734 

@contextlib.contextmanager
def as_local(self) -> Iterator[ButlerURI]:
    """Return the location of the (possibly remote) resource as local file.

    Yields
    ------
    local : `ButlerURI`
        If this is a remote resource, it will be a copy of the resource
        on the local file system, probably in a temporary directory.
        For a local resource this should be the actual path to the
        resource.

    Notes
    -----
    On exit, the local file is removed if ``_as_local`` reported it as
    temporary and it still exists.

    Examples
    --------
    Should be used as a context manager:

    .. code-block:: py

       with uri.as_local() as local:
           ospath = local.ospath
    """
    local_path, temporary = self._as_local()
    local = ButlerURI(local_path, isTemporary=temporary)

    try:
        yield local
    finally:
        # The caller might have relocated the temporary file
        if temporary and local.exists():
            local.remove()

770 

@classmethod
@contextlib.contextmanager
def temporary_uri(cls, prefix: Optional[ButlerURI] = None,
                  suffix: Optional[str] = None) -> Iterator[ButlerURI]:
    """Create a temporary URI.

    Parameters
    ----------
    prefix : `ButlerURI`, optional
        Prefix to use. Without this the path will be formed as a local
        file URI in a temporary directory. Ensuring that the prefix
        location exists is the responsibility of the caller.
    suffix : `str`, optional
        A file suffix to be used. The ``.`` should be included in this
        suffix.

    Yields
    ------
    uri : `ButlerURI`
        The temporary URI. Will be removed when the context is completed.
    """
    # Remember whether the directory was created here. The isTemporary
    # flag can not be used for this since an external prefix may have it
    # set as well.
    owns_tempdir = prefix is None
    if owns_tempdir:
        prefix = ButlerURI(tempfile.mkdtemp(), forceDirectory=True, isTemporary=True)

    # Need to create a randomized file name. For consistency do not
    # use mkstemp for local and something else for remote. Additionally
    # this method does not create the file to prevent name clashes.
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789_"
    rng = Random()
    name = "".join(rng.choice(alphabet) for _ in range(16))
    if suffix:
        name += suffix
    temp = prefix.join(name, isTemporary=True)

    try:
        yield temp
    finally:
        if owns_tempdir:
            shutil.rmtree(prefix.ospath, ignore_errors=True)
        else:
            # It's okay if this does not work because the user removed
            # the file.
            with contextlib.suppress(FileNotFoundError):
                temp.remove()

822 

def read(self, size: int = -1) -> bytes:
    """Open the resource and return the contents in bytes.

    Parameters
    ----------
    size : `int`, optional
        The number of bytes to read. Negative or omitted indicates
        that all data should be read.

    Raises
    ------
    NotImplementedError
        Always raised by this base implementation.
    """
    raise NotImplementedError()

833 

def write(self, data: bytes, overwrite: bool = True) -> None:
    """Write the supplied bytes to the new resource.

    Parameters
    ----------
    data : `bytes`
        The bytes to write to the resource. The entire contents of the
        resource will be replaced.
    overwrite : `bool`, optional
        If `True` the resource will be overwritten if it exists. Otherwise
        the write will fail.

    Raises
    ------
    NotImplementedError
        Always raised by this base implementation.
    """
    raise NotImplementedError()

847 

def mkdir(self) -> None:
    """For a dir-like URI, create the directory resource if needed.

    Raises
    ------
    NotImplementedError
        Always raised by this base implementation.
    """
    raise NotImplementedError()

851 

def isdir(self) -> bool:
    """Return the ``dirLike`` flag, i.e. whether the URI looks like a directory."""
    looks_like_dir = self.dirLike
    return looks_like_dir

855 

def size(self) -> int:
    """For non-dir-like URI, return the size of the resource.

    Returns
    -------
    sz : `int`
        The size in bytes of the resource associated with this URI.
        Returns 0 if dir-like.

    Raises
    ------
    NotImplementedError
        Always raised by this base implementation.
    """
    raise NotImplementedError()

866 

def __str__(self) -> str:
    """Convert the URI to its native string form, as given by `geturl()`."""
    url = self.geturl()
    return url

870 

def __repr__(self) -> str:
    """Return string representation suitable for evaluation."""
    url = self.geturl()
    return f'ButlerURI("{url}")'

874 

def __eq__(self, other: Any) -> bool:
    """Compare supplied object with this `ButlerURI`.

    Two `ButlerURI` objects are equal when their `geturl()` strings are
    equal. Any other type yields `NotImplemented`.
    """
    if isinstance(other, ButlerURI):
        return self.geturl() == other.geturl()
    return NotImplemented

880 

def __hash__(self) -> int:
    """Return hash of this object, computed from its string form."""
    text = str(self)
    return hash(text)

884 

def __copy__(self) -> ButlerURI:
    """Copy constructor.

    The object is treated as immutable so the copy is the object itself.
    """
    # Implemented explicitly because the __new__ factory confuses things
    same = self
    return same

892 

def __deepcopy__(self, memo: Any) -> ButlerURI:
    """Deepcopy the object.

    The object is treated as immutable so the copy is the object itself;
    ``memo`` is not used.
    """
    # Implemented explicitly because the __new__ factory confuses things
    same = self
    return same

900 

def __getnewargs__(self) -> Tuple:
    """Return the arguments for ``__new__`` used when unpickling.

    Returns
    -------
    args : `tuple`
        One-element tuple holding the string form of this object.
    """
    asString = str(self)
    return (asString,)

904 

@classmethod
def _fixDirectorySep(cls, parsed: urllib.parse.ParseResult,
                     forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
    """Make sure a directory-like path ends with the path separator.

    Parameters
    ----------
    parsed : `~urllib.parse.ParseResult`
        The result from parsing a URI using `urllib.parse`.
    forceDirectory : `bool`, optional
        If `True` the path is treated as a directory and a trailing
        separator is appended when missing. If `False` the path is
        taken as given.

    Returns
    -------
    modified : `~urllib.parse.ParseResult`
        The parse result, with a separator appended to the path when
        ``forceDirectory`` is `True` and the path did not already end
        with one. Otherwise the input is returned unchanged.
    dirLike : `bool`
        `True` if the path ended with a separator or ``forceDirectory``
        is `True`. Otherwise `False`.
    """
    # The separator comes from the path module configured on the class.
    separator = cls._pathModule.sep
    hasTrailingSep = parsed.path.endswith(separator)

    # Neither forced nor already terminated: not directory-like.
    if not (forceDirectory or hasTrailingSep):
        return parsed, False

    # Directory-like; append the separator only when it is missing.
    if not hasTrailingSep:
        parsed = parsed._replace(path=parsed.path + separator)
    return parsed, True

943 

@classmethod
def _fixupPathUri(cls, parsed: urllib.parse.ParseResult, root: Optional[Union[str, ButlerURI]] = None,
                  forceAbsolute: bool = False,
                  forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
    """Correct any issues with the supplied URI.

    This implementation only delegates to ``_fixDirectorySep`` to
    handle the trailing directory separator.

    Parameters
    ----------
    parsed : `~urllib.parse.ParseResult`
        The result from parsing a URI using `urllib.parse`.
    root : `str` or `ButlerURI`, ignored
        Not used by this implementation.
    forceAbsolute : `bool`, ignored
        Not used by this implementation.
    forceDirectory : `bool`, optional
        If `True` forces the URI to end with a separator, otherwise the
        given URI is interpreted as is. Declaring the URI to be
        conceptually a directory can resolve ambiguities when
        interpreting the last element of a path.

    Returns
    -------
    modified : `~urllib.parse.ParseResult`
        The (possibly updated) parse result.
    dirLike : `bool`
        `True` if the given parsed URI has a trailing separator or
        ``forceDirectory`` is `True`. Otherwise `False`.
    """
    fixed = cls._fixDirectorySep(parsed, forceDirectory)
    return fixed

987 

def transfer_from(self, src: ButlerURI, transfer: str,
                  overwrite: bool = False,
                  transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
    """Transfer the resource at ``src`` to the location of this URI.

    Parameters
    ----------
    src : `ButlerURI`
        Source URI.
    transfer : `str`
        Mode to use for transferring the resource. Generically there
        are many standard options: copy, link, symlink, hardlink,
        relsymlink. Not all URIs support all modes.
    overwrite : `bool`, optional
        Allow an existing file to be overwritten. Defaults to `False`.
    transaction : `DatastoreTransaction`, optional
        A transaction object that can (depending on implementation)
        rollback transfers on error. Not guaranteed to be implemented.

    Raises
    ------
    NotImplementedError
        Always raised by this implementation; the message names the
        scheme of this URI.

    Notes
    -----
    Conceptually this is hard to scale as the number of URI schemes
    grows. The destination URI matters more than the source URI because
    that is where all the transfer modes are relevant (with the
    complication that "move" deletes the source).

    Local file to local file is the fundamental use case, but every
    other scheme has to support "copy" to a local file (with implicit
    support for "move") and copy from a local file. All the "link"
    options tend to be specific to local file systems.

    "move" is a "copy" where the remote resource is deleted at the end.
    Whether this works depends on the source URI rather than the
    destination URI. Reverting a move on transaction rollback is
    expected to be problematic if a remote resource was involved.
    """
    message = f"No transfer modes supported by URI scheme {self.scheme}"
    raise NotImplementedError(message)

1025 

def walk(
    self, file_filter: Optional[Union[str, re.Pattern]] = None
) -> Iterator[Union[List, Tuple[ButlerURI, List[str], List[str]]]]:
    """Traverse the directory tree, reporting directories and matching files.

    Parameters
    ----------
    file_filter : `str` or `re.Pattern`, optional
        Regex used to filter the files before they are returned.

    Yields
    ------
    dirpath : `ButlerURI`
        Directory currently being examined.
    dirnames : `list` of `str`
        Names of the subdirectories found in ``dirpath``.
    filenames : `list` of `str`
        Names of the files found in ``dirpath``.

    Raises
    ------
    NotImplementedError
        Always raised by this implementation.
    """
    raise NotImplementedError()

1047 

@classmethod
def findFileResources(cls, candidates: Iterable[Union[str, ButlerURI]],
                      file_filter: Optional[Union[str, re.Pattern]] = None,
                      grouped: bool = False) -> Iterator[Union[ButlerURI, Iterator[ButlerURI]]]:
    """Get all the files from a list of values.

    Parameters
    ----------
    candidates : iterable [`str` or `ButlerURI`]
        The files to return and directories in which to look for files
        to return.
    file_filter : `str` or `re.Pattern`, optional
        The regex to use when searching for files within directories.
        By default returns all the found files.
    grouped : `bool`, optional
        If `True` the results will be grouped by directory and each
        yielded value will be an iterator over URIs. If `False` each
        URI will be returned separately.

    Yields
    ------
    found_file : `ButlerURI`
        The passed-in URIs and URIs found in passed-in directories.
        If grouping is enabled, each of the yielded values will be an
        iterator yielding members of the group. Files given explicitly
        will be returned as a single group at the end.

    Notes
    -----
    A candidate for which ``isdir()`` is true is walked and every file
    reported by ``walk()`` is yielded, one directory at a time. Any
    other candidate is yielded immediately when ``grouped`` is `False`
    and collected into the final group when ``grouped`` is `True`.

    Each yielded group is bound to its own directory when it is
    created, so groups may be consumed in any order, including after
    this generator has been exhausted.
    """
    fileRegex = None if file_filter is None else re.compile(file_filter)

    # Explicitly given files, collected only when grouping is enabled.
    singles = []

    # Find all the files of interest
    for location in candidates:
        uri = ButlerURI(location)

        if not uri.isdir():
            if grouped:
                singles.append(uri)
            else:
                yield uri
            continue

        for found in uri.walk(fileRegex):
            if not found:
                # This means the uri does not exist and by
                # convention we ignore it
                continue
            root, _, files = found
            if not files:
                continue
            if grouped:
                # Bind ``root.join`` now. A generator expression would
                # look ``root`` up lazily and pick up a later directory
                # if the caller advanced this generator before
                # consuming the group.
                yield map(root.join, files)
            else:
                for name in files:
                    yield root.join(name)

    # Finally, return any explicitly given files in one group
    if grouped and singles:
        yield iter(singles)