Coverage for python/lsst/daf/butler/core/_butlerUri/_butlerUri.py: 48%

316 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-01 19:55 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import contextlib 

25import concurrent.futures 

26import urllib.parse 

27import posixpath 

28import copy 

29import logging 

30import re 

31import shutil 

32import tempfile 

33 

34from random import Random 

35from pathlib import Path, PurePath, PurePosixPath 

36 

37__all__ = ('ButlerURI',) 

38 

39from typing import ( 

40 TYPE_CHECKING, 

41 Any, 

42 Iterable, 

43 Iterator, 

44 List, 

45 Dict, 

46 Optional, 

47 Tuple, 

48 Type, 

49 Union, 

50) 

51 

52from .utils import NoTransaction 

53 

if TYPE_CHECKING:
    # Only needed by static type checkers; never imported at runtime.
    from ..datastore import DatastoreTransaction


# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Regex for looking for URI escapes: a percent sign followed by two
# upper-case hexadecimal digits.
ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")

# Precomputed escaped hash, i.e. the percent-quoted form of "#".
ESCAPED_HASH = urllib.parse.quote("#")

# Maximum number of worker threads for parallelized operations.
# If greater than 10, be aware that this number has to be consistent
# with connection pool sizing (for example in urllib3).
MAX_WORKERS = 10

70 

71 

72class ButlerURI: 

73 """Convenience wrapper around URI parsers. 

74 

75 Provides access to URI components and can convert file 

76 paths into absolute path URIs. Scheme-less URIs are treated as if 

77 they are local file system paths and are converted to absolute URIs. 

78 

79 A specialist subclass is created for each supported URI scheme. 

80 

81 Parameters 

82 ---------- 

83 uri : `str` or `urllib.parse.ParseResult` 

84 URI in string form. Can be scheme-less if referring to a local 

85 filesystem path. 

86 root : `str` or `ButlerURI`, optional 

87 When fixing up a relative path in a ``file`` scheme or if scheme-less, 

88 use this as the root. Must be absolute. If `None` the current 

89 working directory will be used. Can be a file URI. 

90 forceAbsolute : `bool`, optional 

91 If `True`, scheme-less relative URI will be converted to an absolute 

92 path using a ``file`` scheme. If `False` scheme-less URI will remain 

93 scheme-less and will not be updated to ``file`` or absolute path. 

94 forceDirectory: `bool`, optional 

95 If `True` forces the URI to end with a separator, otherwise given URI 

96 is interpreted as is. 

97 isTemporary : `bool`, optional 

98 If `True` indicates that this URI points to a temporary resource. 

99 """ 

100 

101 _pathLib: Type[PurePath] = PurePosixPath 

102 """Path library to use for this scheme.""" 

103 

104 _pathModule = posixpath 

105 """Path module to use for this scheme.""" 

106 

107 transferModes: Tuple[str, ...] = ("copy", "auto", "move") 

108 """Transfer modes supported by this implementation. 

109 

110 Move is special in that it is generally a copy followed by an unlink. 

111 Whether that unlink works depends critically on whether the source URI 

112 implements unlink. If it does not the move will be reported as a failure. 

113 """ 

114 

115 transferDefault: str = "copy" 

116 """Default mode to use for transferring if ``auto`` is specified.""" 

117 

118 quotePaths = True 

119 """True if path-like elements modifying a URI should be quoted. 

120 

121 All non-schemeless URIs have to internally use quoted paths. Therefore 

122 if a new file name is given (e.g. to updatedFile or join) a decision must 

123 be made whether to quote it to be consistent. 

124 """ 

125 

126 isLocal = False 

127 """If `True` this URI refers to a local file.""" 

128 

129 # This is not an ABC with abstract methods because the __new__ being 

130 # a factory confuses mypy such that it assumes that every constructor 

131 # returns a ButlerURI and then determines that all the abstract methods 

132 # are still abstract. If they are not marked abstract but just raise 

133 # mypy is fine with it. 

134 

135 # mypy is confused without these 

136 _uri: urllib.parse.ParseResult 

137 isTemporary: bool 

138 dirLike: bool 

139 

140 def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI, Path], 

141 root: Optional[Union[str, ButlerURI]] = None, forceAbsolute: bool = True, 

142 forceDirectory: bool = False, isTemporary: bool = False) -> ButlerURI: 

143 """Create and return new specialist ButlerURI subclass.""" 

144 parsed: urllib.parse.ParseResult 

145 dirLike: bool = False 

146 subclass: Optional[Type[ButlerURI]] = None 

147 

148 if isinstance(uri, Path): 148 ↛ 149line 148 didn't jump to line 149, because the condition on line 148 was never true

149 uri = str(uri) 

150 

151 # Record if we need to post process the URI components 

152 # or if the instance is already fully configured 

153 if isinstance(uri, str): 

154 # Since local file names can have special characters in them 

155 # we need to quote them for the parser but we can unquote 

156 # later. Assume that all other URI schemes are quoted. 

157 # Since sometimes people write file:/a/b and not file:///a/b 

158 # we should not quote in the explicit case of file: 

159 if "://" not in uri and not uri.startswith("file:"): 

160 if ESCAPES_RE.search(uri): 160 ↛ 161line 160 didn't jump to line 161, because the condition on line 160 was never true

161 log.warning("Possible double encoding of %s", uri) 

162 else: 

163 uri = urllib.parse.quote(uri) 

164 # Special case hash since we must support fragments 

165 # even in schemeless URIs -- although try to only replace 

166 # them in file part and not directory part 

167 if ESCAPED_HASH in uri: 167 ↛ 168line 167 didn't jump to line 168, because the condition on line 167 was never true

168 dirpos = uri.rfind("/") 

169 # Do replacement after this / 

170 uri = uri[:dirpos+1] + uri[dirpos+1:].replace(ESCAPED_HASH, "#") 

171 

172 parsed = urllib.parse.urlparse(uri) 

173 elif isinstance(uri, urllib.parse.ParseResult): 

174 parsed = copy.copy(uri) 

175 # If we are being instantiated with a subclass, rather than 

176 # ButlerURI, ensure that that subclass is used directly. 

177 # This could lead to inconsistencies if this constructor 

178 # is used externally outside of the ButlerURI.replace() method. 

179 # ButlerS3URI(urllib.parse.urlparse("file://a/b.txt")) 

180 # will be a problem. 

181 # This is needed to prevent a schemeless absolute URI become 

182 # a file URI unexpectedly when calling updatedFile or 

183 # updatedExtension 

184 if cls is not ButlerURI: 

185 parsed, dirLike = cls._fixDirectorySep(parsed, forceDirectory) 

186 subclass = cls 

187 

188 elif isinstance(uri, ButlerURI): 188 ↛ 193line 188 didn't jump to line 193, because the condition on line 188 was never false

189 # Since ButlerURI is immutable we can return the argument 

190 # unchanged. 

191 return uri 

192 else: 

193 raise ValueError("Supplied URI must be string, Path, " 

194 f"ButlerURI, or ParseResult but got '{uri!r}'") 

195 

196 if subclass is None: 

197 # Work out the subclass from the URI scheme 

198 if not parsed.scheme: 

199 from .schemeless import ButlerSchemelessURI 

200 subclass = ButlerSchemelessURI 

201 elif parsed.scheme == "file": 201 ↛ 202line 201 didn't jump to line 202, because the condition on line 201 was never true

202 from .file import ButlerFileURI 

203 subclass = ButlerFileURI 

204 elif parsed.scheme == "s3": 204 ↛ 205line 204 didn't jump to line 205, because the condition on line 204 was never true

205 from .s3 import ButlerS3URI 

206 subclass = ButlerS3URI 

207 elif parsed.scheme.startswith("http"): 207 ↛ 208line 207 didn't jump to line 208, because the condition on line 207 was never true

208 from .http import ButlerHttpURI 

209 subclass = ButlerHttpURI 

210 elif parsed.scheme == "resource": 210 ↛ 214line 210 didn't jump to line 214, because the condition on line 210 was never false

211 # Rules for scheme names disallow pkg_resource 

212 from .packageresource import ButlerPackageResourceURI 

213 subclass = ButlerPackageResourceURI 

214 elif parsed.scheme == "mem": 

215 # in-memory datastore object 

216 from .mem import ButlerInMemoryURI 

217 subclass = ButlerInMemoryURI 

218 else: 

219 raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'" 

220 " in {parsed.geturl()}") 

221 

222 parsed, dirLike = subclass._fixupPathUri(parsed, root=root, 

223 forceAbsolute=forceAbsolute, 

224 forceDirectory=forceDirectory) 

225 

226 # It is possible for the class to change from schemeless 

227 # to file so handle that 

228 if parsed.scheme == "file": 228 ↛ 229line 228 didn't jump to line 229, because the condition on line 228 was never true

229 from .file import ButlerFileURI 

230 subclass = ButlerFileURI 

231 

232 # Now create an instance of the correct subclass and set the 

233 # attributes directly 

234 self = object.__new__(subclass) 

235 self._uri = parsed 

236 self.dirLike = dirLike 

237 self.isTemporary = isTemporary 

238 return self 

239 

240 @property 

241 def scheme(self) -> str: 

242 """Return the URI scheme. 

243 

244 Notes 

245 ----- 

246 (``://`` is not part of the scheme). 

247 """ 

248 return self._uri.scheme 

249 

250 @property 

251 def netloc(self) -> str: 

252 """Return the URI network location.""" 

253 return self._uri.netloc 

254 

255 @property 

256 def path(self) -> str: 

257 """Return the path component of the URI.""" 

258 return self._uri.path 

259 

260 @property 

261 def unquoted_path(self) -> str: 

262 """Return path component of the URI with any URI quoting reversed.""" 

263 return urllib.parse.unquote(self._uri.path) 

264 

265 @property 

266 def ospath(self) -> str: 

267 """Return the path component of the URI localized to current OS.""" 

268 raise AttributeError(f"Non-file URI ({self}) has no local OS path.") 

269 

270 @property 

271 def relativeToPathRoot(self) -> str: 

272 """Return path relative to network location. 

273 

274 Effectively, this is the path property with posix separator stripped 

275 from the left hand side of the path. 

276 

277 Always unquotes. 

278 """ 

279 p = self._pathLib(self.path) 

280 relToRoot = str(p.relative_to(p.root)) 

281 if self.dirLike and not relToRoot.endswith("/"): 281 ↛ 282line 281 didn't jump to line 282, because the condition on line 281 was never true

282 relToRoot += "/" 

283 return urllib.parse.unquote(relToRoot) 

284 

285 @property 

286 def is_root(self) -> bool: 

287 """Return whether this URI points to the root of the network location. 

288 

289 This means that the path components refers to the top level. 

290 """ 

291 relpath = self.relativeToPathRoot 

292 if relpath == "./": 

293 return True 

294 return False 

295 

296 @property 

297 def fragment(self) -> str: 

298 """Return the fragment component of the URI.""" 

299 return self._uri.fragment 

300 

301 @property 

302 def params(self) -> str: 

303 """Return any parameters included in the URI.""" 

304 return self._uri.params 

305 

306 @property 

307 def query(self) -> str: 

308 """Return any query strings included in the URI.""" 

309 return self._uri.query 

310 

311 def geturl(self) -> str: 

312 """Return the URI in string form. 

313 

314 Returns 

315 ------- 

316 url : `str` 

317 String form of URI. 

318 """ 

319 return self._uri.geturl() 

320 

321 def root_uri(self) -> ButlerURI: 

322 """Return the base root URI. 

323 

324 Returns 

325 ------- 

326 uri : `ButlerURI` 

327 root URI. 

328 """ 

329 return self.replace(path="", forceDirectory=True) 

330 

331 def split(self) -> Tuple[ButlerURI, str]: 

332 """Split URI into head and tail. 

333 

334 Returns 

335 ------- 

336 head: `ButlerURI` 

337 Everything leading up to tail, expanded and normalized as per 

338 ButlerURI rules. 

339 tail : `str` 

340 Last `self.path` component. Tail will be empty if path ends on a 

341 separator. Tail will never contain separators. It will be 

342 unquoted. 

343 

344 Notes 

345 ----- 

346 Equivalent to `os.path.split()` where head preserves the URI 

347 components. 

348 """ 

349 head, tail = self._pathModule.split(self.path) 

350 headuri = self._uri._replace(path=head) 

351 

352 # The file part should never include quoted metacharacters 

353 tail = urllib.parse.unquote(tail) 

354 

355 # Schemeless is special in that it can be a relative path 

356 # We need to ensure that it stays that way. All other URIs will 

357 # be absolute already. 

358 forceAbsolute = self._pathModule.isabs(self.path) 

359 return ButlerURI(headuri, forceDirectory=True, forceAbsolute=forceAbsolute), tail 

360 

361 def basename(self) -> str: 

362 """Return the base name, last element of path, of the URI. 

363 

364 Returns 

365 ------- 

366 tail : `str` 

367 Last part of the path attribute. Trail will be empty if path ends 

368 on a separator. 

369 

370 Notes 

371 ----- 

372 If URI ends on a slash returns an empty string. This is the second 

373 element returned by `split()`. 

374 

375 Equivalent of `os.path.basename()``. 

376 """ 

377 return self.split()[1] 

378 

379 def dirname(self) -> ButlerURI: 

380 """Return the directory component of the path as a new `ButlerURI`. 

381 

382 Returns 

383 ------- 

384 head : `ButlerURI` 

385 Everything except the tail of path attribute, expanded and 

386 normalized as per ButlerURI rules. 

387 

388 Notes 

389 ----- 

390 Equivalent of `os.path.dirname()`. 

391 """ 

392 return self.split()[0] 

393 

394 def parent(self) -> ButlerURI: 

395 """Return a `ButlerURI` of the parent directory. 

396 

397 Returns 

398 ------- 

399 head : `ButlerURI` 

400 Everything except the tail of path attribute, expanded and 

401 normalized as per `ButlerURI` rules. 

402 

403 Notes 

404 ----- 

405 For a file-like URI this will be the same as calling `dirname()`. 

406 """ 

407 # When self is file-like, return self.dirname() 

408 if not self.dirLike: 

409 return self.dirname() 

410 # When self is dir-like, return its parent directory, 

411 # regardless of the presence of a trailing separator 

412 originalPath = self._pathLib(self.path) 

413 parentPath = originalPath.parent 

414 return self.replace(path=str(parentPath), forceDirectory=True) 

415 

416 def replace(self, forceDirectory: bool = False, isTemporary: bool = False, **kwargs: Any) -> ButlerURI: 

417 """Return new `ButlerURI` with specified components replaced. 

418 

419 Parameters 

420 ---------- 

421 forceDirectory : `bool`, optional 

422 Parameter passed to ButlerURI constructor to force this 

423 new URI to be dir-like. 

424 isTemporary : `bool`, optional 

425 Indicate that the resulting URI is temporary resource. 

426 **kwargs 

427 Components of a `urllib.parse.ParseResult` that should be 

428 modified for the newly-created `ButlerURI`. 

429 

430 Returns 

431 ------- 

432 new : `ButlerURI` 

433 New `ButlerURI` object with updated values. 

434 

435 Notes 

436 ----- 

437 Does not, for now, allow a change in URI scheme. 

438 """ 

439 # Disallow a change in scheme 

440 if "scheme" in kwargs: 440 ↛ 441line 440 didn't jump to line 441, because the condition on line 440 was never true

441 raise ValueError(f"Can not use replace() method to change URI scheme for {self}") 

442 return self.__class__(self._uri._replace(**kwargs), forceDirectory=forceDirectory, 

443 isTemporary=isTemporary) 

444 

445 def updatedFile(self, newfile: str) -> ButlerURI: 

446 """Return new URI with an updated final component of the path. 

447 

448 Parameters 

449 ---------- 

450 newfile : `str` 

451 File name with no path component. 

452 

453 Returns 

454 ------- 

455 updated : `ButlerURI` 

456 

457 Notes 

458 ----- 

459 Forces the ButlerURI.dirLike attribute to be false. The new file path 

460 will be quoted if necessary. 

461 """ 

462 if self.quotePaths: 

463 newfile = urllib.parse.quote(newfile) 

464 dir, _ = self._pathModule.split(self.path) 

465 newpath = self._pathModule.join(dir, newfile) 

466 

467 updated = self.replace(path=newpath) 

468 updated.dirLike = False 

469 return updated 

470 

471 def updatedExtension(self, ext: Optional[str]) -> ButlerURI: 

472 """Return a new `ButlerURI` with updated file extension. 

473 

474 All file extensions are replaced. 

475 

476 Parameters 

477 ---------- 

478 ext : `str` or `None` 

479 New extension. If an empty string is given any extension will 

480 be removed. If `None` is given there will be no change. 

481 

482 Returns 

483 ------- 

484 updated : `ButlerURI` 

485 URI with the specified extension. Can return itself if 

486 no extension was specified. 

487 """ 

488 if ext is None: 

489 return self 

490 

491 # Get the extension 

492 current = self.getExtension() 

493 

494 # Nothing to do if the extension already matches 

495 if current == ext: 

496 return self 

497 

498 # Remove the current extension from the path 

499 # .fits.gz counts as one extension do not use os.path.splitext 

500 path = self.path 

501 if current: 

502 path = path[:-len(current)] 

503 

504 # Ensure that we have a leading "." on file extension (and we do not 

505 # try to modify the empty string) 

506 if ext and not ext.startswith("."): 

507 ext = "." + ext 

508 

509 return self.replace(path=path + ext) 

510 

511 def getExtension(self) -> str: 

512 """Return the file extension(s) associated with this URI path. 

513 

514 Returns 

515 ------- 

516 ext : `str` 

517 The file extension (including the ``.``). Can be empty string 

518 if there is no file extension. Usually returns only the last 

519 file extension unless there is a special extension modifier 

520 indicating file compression, in which case the combined 

521 extension (e.g. ``.fits.gz``) will be returned. 

522 """ 

523 special = {".gz", ".bz2", ".xz", ".fz"} 

524 

525 # Get the file part of the path so as not to be confused by 

526 # "." in directory names. 

527 basename = self.basename() 

528 extensions = self._pathLib(basename).suffixes 

529 

530 if not extensions: 530 ↛ 531line 530 didn't jump to line 531, because the condition on line 530 was never true

531 return "" 

532 

533 ext = extensions.pop() 

534 

535 # Multiple extensions, decide whether to include the final two 

536 if extensions and ext in special: 536 ↛ 537line 536 didn't jump to line 537, because the condition on line 536 was never true

537 ext = f"{extensions[-1]}{ext}" 

538 

539 return ext 

540 

541 def join(self, path: Union[str, ButlerURI], isTemporary: bool = False) -> ButlerURI: 

542 """Return new `ButlerURI` with additional path components. 

543 

544 Parameters 

545 ---------- 

546 path : `str`, `ButlerURI` 

547 Additional file components to append to the current URI. Assumed 

548 to include a file at the end. Will be quoted depending on the 

549 associated URI scheme. If the path looks like a URI with a scheme 

550 referring to an absolute location, it will be returned 

551 directly (matching the behavior of `os.path.join()`). It can 

552 also be a `ButlerURI`. 

553 isTemporary : `bool`, optional 

554 Indicate that the resulting URI represents a temporary resource. 

555 

556 Returns 

557 ------- 

558 new : `ButlerURI` 

559 New URI with any file at the end replaced with the new path 

560 components. 

561 

562 Notes 

563 ----- 

564 Schemeless URIs assume local path separator but all other URIs assume 

565 POSIX separator if the supplied path has directory structure. It 

566 may be this never becomes a problem but datastore templates assume 

567 POSIX separator is being used. 

568 

569 If an absolute `ButlerURI` is given for ``path`` is is assumed that 

570 this should be returned directly. Giving a ``path`` of an absolute 

571 scheme-less URI is not allowed for safety reasons as it may indicate 

572 a mistake in the calling code. 

573 

574 Raises 

575 ------ 

576 ValueError 

577 Raised if the ``path`` is an absolute scheme-less URI. In that 

578 situation it is unclear whether the intent is to return a 

579 ``file`` URI or it was a mistake and a relative scheme-less URI 

580 was meant. 

581 """ 

582 # If we have a full URI in path we will use it directly 

583 # but without forcing to absolute so that we can trap the 

584 # expected option of relative path. 

585 path_uri = ButlerURI(path, forceAbsolute=False) 

586 if path_uri.scheme: 586 ↛ 589line 586 didn't jump to line 589, because the condition on line 586 was never true

587 # Check for scheme so can distinguish explicit URIs from 

588 # absolute scheme-less URIs. 

589 return path_uri 

590 

591 if path_uri.isabs(): 591 ↛ 593line 591 didn't jump to line 593, because the condition on line 591 was never true

592 # Absolute scheme-less path. 

593 raise ValueError(f"Can not join absolute scheme-less {path_uri!r} to another URI.") 

594 

595 # If this was originally a ButlerURI extract the unquoted path from it. 

596 # Otherwise we use the string we were given to allow "#" to appear 

597 # in the filename if given as a plain string. 

598 if not isinstance(path, str): 598 ↛ 599line 598 didn't jump to line 599, because the condition on line 598 was never true

599 path = path_uri.unquoted_path 

600 

601 new = self.dirname() # By definition a directory URI 

602 

603 # new should be asked about quoting, not self, since dirname can 

604 # change the URI scheme for schemeless -> file 

605 if new.quotePaths: 605 ↛ 608line 605 didn't jump to line 608, because the condition on line 605 was never false

606 path = urllib.parse.quote(path) 

607 

608 newpath = self._pathModule.normpath(self._pathModule.join(new.path, path)) 

609 

610 # normpath can strip trailing / so we force directory if the supplied 

611 # path ended with a / 

612 return new.replace(path=newpath, forceDirectory=path.endswith(self._pathModule.sep), 

613 isTemporary=isTemporary) 

614 

615 def relative_to(self, other: ButlerURI) -> Optional[str]: 

616 """Return the relative path from this URI to the other URI. 

617 

618 Parameters 

619 ---------- 

620 other : `ButlerURI` 

621 URI to use to calculate the relative path. Must be a parent 

622 of this URI. 

623 

624 Returns 

625 ------- 

626 subpath : `str` 

627 The sub path of this URI relative to the supplied other URI. 

628 Returns `None` if there is no parent child relationship. 

629 Scheme and netloc must match. 

630 """ 

631 # Scheme-less absolute other is treated as if it's a file scheme. 

632 # Scheme-less relative other can only return non-None if self 

633 # is also scheme-less relative and that is handled specifically 

634 # in a subclass. 

635 if not other.scheme and other.isabs(): 

636 other = other.abspath() 

637 

638 # Scheme-less self is handled elsewhere. 

639 if self.scheme != other.scheme or self.netloc != other.netloc: 

640 return None 

641 

642 enclosed_path = self._pathLib(self.relativeToPathRoot) 

643 parent_path = other.relativeToPathRoot 

644 subpath: Optional[str] 

645 try: 

646 subpath = str(enclosed_path.relative_to(parent_path)) 

647 except ValueError: 

648 subpath = None 

649 else: 

650 subpath = urllib.parse.unquote(subpath) 

651 return subpath 

652 

653 def exists(self) -> bool: 

654 """Indicate that the resource is available. 

655 

656 Returns 

657 ------- 

658 exists : `bool` 

659 `True` if the resource exists. 

660 """ 

661 raise NotImplementedError() 

662 

663 @classmethod 

664 def mexists(cls, uris: Iterable[ButlerURI]) -> Dict[ButlerURI, bool]: 

665 """Check for existence of multiple URIs at once. 

666 

667 Parameters 

668 ---------- 

669 uris : iterable of `ButlerURI` 

670 The URIs to test. 

671 

672 Returns 

673 ------- 

674 existence : `dict` of [`ButlerURI`, `bool`] 

675 Mapping of original URI to boolean indicating existence. 

676 """ 

677 exists_executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) 

678 future_exists = {exists_executor.submit(uri.exists): uri for uri in uris} 

679 

680 results: Dict[ButlerURI, bool] = {} 

681 for future in concurrent.futures.as_completed(future_exists): 

682 uri = future_exists[future] 

683 try: 

684 exists = future.result() 

685 except Exception: 

686 exists = False 

687 results[uri] = exists 

688 return results 

689 

690 def remove(self) -> None: 

691 """Remove the resource.""" 

692 raise NotImplementedError() 

693 

694 def isabs(self) -> bool: 

695 """Indicate that the resource is fully specified. 

696 

697 For non-schemeless URIs this is always true. 

698 

699 Returns 

700 ------- 

701 isabs : `bool` 

702 `True` in all cases except schemeless URI. 

703 """ 

704 return True 

705 

706 def abspath(self) -> ButlerURI: 

707 """Return URI using an absolute path. 

708 

709 Returns 

710 ------- 

711 abs : `ButlerURI` 

712 Absolute URI. For non-schemeless URIs this always returns itself. 

713 Schemeless URIs are upgraded to file URIs. 

714 """ 

715 return self 

716 

717 def _as_local(self) -> Tuple[str, bool]: 

718 """Return the location of the (possibly remote) resource as local file. 

719 

720 This is a helper function for `as_local` context manager. 

721 

722 Returns 

723 ------- 

724 path : `str` 

725 If this is a remote resource, it will be a copy of the resource 

726 on the local file system, probably in a temporary directory. 

727 For a local resource this should be the actual path to the 

728 resource. 

729 is_temporary : `bool` 

730 Indicates if the local path is a temporary file or not. 

731 """ 

732 raise NotImplementedError() 

733 

734 @contextlib.contextmanager 

735 def as_local(self) -> Iterator[ButlerURI]: 

736 """Return the location of the (possibly remote) resource as local file. 

737 

738 Yields 

739 ------ 

740 local : `ButlerURI` 

741 If this is a remote resource, it will be a copy of the resource 

742 on the local file system, probably in a temporary directory. 

743 For a local resource this should be the actual path to the 

744 resource. 

745 

746 Notes 

747 ----- 

748 The context manager will automatically delete any local temporary 

749 file. 

750 

751 Examples 

752 -------- 

753 Should be used as a context manager: 

754 

755 .. code-block:: py 

756 

757 with uri.as_local() as local: 

758 ospath = local.ospath 

759 """ 

760 local_src, is_temporary = self._as_local() 

761 local_uri = ButlerURI(local_src, isTemporary=is_temporary) 

762 

763 try: 

764 yield local_uri 

765 finally: 

766 # The caller might have relocated the temporary file 

767 if is_temporary and local_uri.exists(): 

768 local_uri.remove() 

769 

770 @classmethod 

771 @contextlib.contextmanager 

772 def temporary_uri(cls, prefix: Optional[ButlerURI] = None, 

773 suffix: Optional[str] = None) -> Iterator[ButlerURI]: 

774 """Create a temporary URI. 

775 

776 Parameters 

777 ---------- 

778 prefix : `ButlerURI`, optional 

779 Prefix to use. Without this the path will be formed as a local 

780 file URI in a temporary directory. Ensuring that the prefix 

781 location exists is the responsibility of the caller. 

782 suffix : `str`, optional 

783 A file suffix to be used. The ``.`` should be included in this 

784 suffix. 

785 

786 Yields 

787 ------ 

788 uri : `ButlerURI` 

789 The temporary URI. Will be removed when the context is completed. 

790 """ 

791 use_tempdir = False 

792 if prefix is None: 

793 prefix = ButlerURI(tempfile.mkdtemp(), forceDirectory=True, isTemporary=True) 

794 # Record that we need to delete this directory. Can not rely 

795 # on isTemporary flag since an external prefix may have that 

796 # set as well. 

797 use_tempdir = True 

798 

799 # Need to create a randomized file name. For consistency do not 

800 # use mkstemp for local and something else for remote. Additionally 

801 # this method does not create the file to prevent name clashes. 

802 characters = "abcdefghijklmnopqrstuvwxyz0123456789_" 

803 rng = Random() 

804 tempname = "".join(rng.choice(characters) for _ in range(16)) 

805 if suffix: 

806 tempname += suffix 

807 temporary_uri = prefix.join(tempname, isTemporary=True) 

808 

809 try: 

810 yield temporary_uri 

811 finally: 

812 if use_tempdir: 

813 shutil.rmtree(prefix.ospath, ignore_errors=True) 

814 else: 

815 try: 

816 # It's okay if this does not work because the user removed 

817 # the file. 

818 temporary_uri.remove() 

819 except FileNotFoundError: 

820 pass 

821 

822 def read(self, size: int = -1) -> bytes: 

823 """Open the resource and return the contents in bytes. 

824 

825 Parameters 

826 ---------- 

827 size : `int`, optional 

828 The number of bytes to read. Negative or omitted indicates 

829 that all data should be read. 

830 """ 

831 raise NotImplementedError() 

832 

833 def write(self, data: bytes, overwrite: bool = True) -> None: 

834 """Write the supplied bytes to the new resource. 

835 

836 Parameters 

837 ---------- 

838 data : `bytes` 

839 The bytes to write to the resource. The entire contents of the 

840 resource will be replaced. 

841 overwrite : `bool`, optional 

842 If `True` the resource will be overwritten if it exists. Otherwise 

843 the write will fail. 

844 """ 

845 raise NotImplementedError() 

846 

847 def mkdir(self) -> None: 

848 """For a dir-like URI, create the directory resource if needed.""" 

849 raise NotImplementedError() 

850 

851 def isdir(self) -> bool: 

852 """Return True if this URI looks like a directory, else False.""" 

853 return self.dirLike 

854 

855 def size(self) -> int: 

856 """For non-dir-like URI, return the size of the resource. 

857 

858 Returns 

859 ------- 

860 sz : `int` 

861 The size in bytes of the resource associated with this URI. 

862 Returns 0 if dir-like. 

863 """ 

864 raise NotImplementedError() 

865 

866 def __str__(self) -> str: 

867 """Convert the URI to its native string form.""" 

868 return self.geturl() 

869 

870 def __repr__(self) -> str: 

871 """Return string representation suitable for evaluation.""" 

872 return f'ButlerURI("{self.geturl()}")' 

873 

874 def __eq__(self, other: Any) -> bool: 

875 """Compare supplied object with this `ButlerURI`.""" 

876 if not isinstance(other, ButlerURI): 

877 return NotImplemented 

878 return self.geturl() == other.geturl() 

879 

880 def __hash__(self) -> int: 

881 """Return hash of this object.""" 

882 return hash(str(self)) 

883 

884 def __copy__(self) -> ButlerURI: 

885 """Copy constructor. 

886 

887 Object is immutable so copy can return itself. 

888 """ 

889 # Implement here because the __new__ method confuses things 

890 return self 

891 

def __deepcopy__(self, memo: Any) -> ButlerURI:
    """Support `copy.deepcopy`.

    Instances are treated as immutable, so the very same object is
    returned instead of a duplicate. The ``memo`` argument is not used.
    """
    # Defined explicitly because the __new__ method confuses the default
    # copy machinery.
    return self

899 

def __getnewargs__(self) -> Tuple:
    """Return the arguments needed to recreate this object when pickling.

    Returns
    -------
    args : `tuple`
        One-element tuple holding the string form of this object.
    """
    as_text = str(self)
    return (as_text,)

903 

@classmethod
def _fixDirectorySep(cls, parsed: urllib.parse.ParseResult,
                     forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
    """Make sure a directory-like path finishes with a path separator.

    Parameters
    ----------
    parsed : `~urllib.parse.ParseResult`
        Result of parsing a URI with `urllib.parse`.
    forceDirectory : `bool`, optional
        If `True` the URI is treated as a directory and a trailing
        separator is appended when one is missing. Otherwise the URI is
        interpreted exactly as given.

    Returns
    -------
    modified : `~urllib.parse.ParseResult`
        The parse result, with a separator appended to the path when
        ``forceDirectory`` requested it and none was present. Otherwise
        the object that was passed in.
    dirLike : `bool`
        `True` if the path already ended with a separator or
        ``forceDirectory`` is `True`. Otherwise `False`.
    """
    # The separator comes from the path module configured on the class.
    separator = cls._pathModule.sep

    if parsed.path.endswith(separator):
        # Already explicitly a directory; nothing needs to be changed.
        return parsed, True

    if forceDirectory:
        # Caller says this is a directory, so add the missing separator.
        return parsed._replace(path=parsed.path + separator), True

    # Neither forced nor terminated by a separator: not dir-like.
    return parsed, False

942 

@classmethod
def _fixupPathUri(cls, parsed: urllib.parse.ParseResult, root: Optional[Union[str, ButlerURI]] = None,
                  forceAbsolute: bool = False,
                  forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
    """Fix up the supplied parsed URI.

    This implementation only delegates to ``_fixDirectorySep`` to handle
    the trailing path separator.

    Parameters
    ----------
    parsed : `~urllib.parse.ParseResult`
        Result of parsing a URI with `urllib.parse`.
    root : `str` or `ButlerURI`, ignored
        Not used by this implementation.
    forceAbsolute : `bool`, ignored
        Not used by this implementation.
    forceDirectory : `bool`, optional
        If `True` the URI is forced to end with a separator, otherwise
        the URI is interpreted as given.

    Returns
    -------
    modified : `~urllib.parse.ParseResult`
        The possibly updated parse result.
    dirLike : `bool`
        `True` if the parsed URI has a trailing separator or
        ``forceDirectory`` is `True`. Otherwise `False`.
    """
    # root and forceAbsolute are accepted only for interface
    # compatibility; just the directory separator is handled here.
    fixed = cls._fixDirectorySep(parsed, forceDirectory)
    return fixed

986 

def transfer_from(self, src: ButlerURI, transfer: str,
                  overwrite: bool = False,
                  transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
    """Transfer a resource from ``src`` to the location of this URI.

    Parameters
    ----------
    src : `ButlerURI`
        Source URI.
    transfer : `str`
        Transfer mode. Common options are copy, link, symlink, hardlink
        and relsymlink, but not every URI scheme supports every mode.
    overwrite : `bool`, optional
        Whether an existing file may be overwritten. Defaults to `False`.
    transaction : `DatastoreTransaction`, optional
        Transaction object that may (depending on implementation) be
        used to roll back a transfer on error.

    Raises
    ------
    NotImplementedError
        Always raised by this implementation, with a message naming the
        scheme of this URI.

    Notes
    -----
    The destination URI matters more than the source URI because the
    transfer modes are relevant at the destination (apart from "move",
    which also deletes the source).

    Local file to local file is the fundamental case. Every other scheme
    is expected to support "copy" to and from a local file. The "link"
    modes tend to be specific to local file systems.

    "move" is a "copy" followed by deletion of the source, so whether it
    works depends on the source URI. Undoing a move during transaction
    rollback is expected to be problematic when a remote resource is
    involved.
    """
    message = f"No transfer modes supported by URI scheme {self.scheme}"
    raise NotImplementedError(message)

1024 

def walk(self, file_filter: Optional[Union[str, re.Pattern]] = None) -> Iterator[Union[List,
                                                                                       Tuple[ButlerURI,
                                                                                             List[str],
                                                                                             List[str]]]]:
    """Walk a directory tree, reporting the files and directories found.

    Parameters
    ----------
    file_filter : `str` or `re.Pattern`, optional
        Regular expression used to filter the files that are reported.

    Yields
    ------
    dirpath : `ButlerURI`
        The directory currently being examined.
    dirnames : `list` of `str`
        Names of the subdirectories found in ``dirpath``.
    filenames : `list` of `str`
        Names of the files found in ``dirpath``.

    Raises
    ------
    NotImplementedError
        Always raised by this implementation.
    """
    raise NotImplementedError()

1046 

@classmethod
def findFileResources(cls, candidates: Iterable[Union[str, ButlerURI]],
                      file_filter: Optional[str] = None,
                      grouped: bool = False) -> Iterator[Union[ButlerURI, Iterator[ButlerURI]]]:
    """Get all the files from a list of values.

    Parameters
    ----------
    candidates : iterable [`str` or `ButlerURI`]
        The files to return and directories in which to look for files to
        return.
    file_filter : `str`, optional
        The regex to use when searching for files within directories.
        By default returns all the found files.
    grouped : `bool`, optional
        If `True` the results will be grouped by directory and each
        yielded value will be an iterator over URIs. If `False` each
        URI will be returned separately.

    Yields
    ------
    found_file : `ButlerURI`
        The passed-in URIs and URIs found in passed-in directories.
        If grouping is enabled, each of the yielded values will be an
        iterator yielding members of the group. Files given explicitly
        will be returned as a single group at the end.

    Notes
    -----
    A candidate is treated as a directory when its ``isdir()`` method
    returns `True`. Directories are walked and every file reported by
    ``walk()`` is yielded in turn. Any other candidate is yielded as is:
    immediately when ``grouped`` is `False`, or as part of the final
    group when ``grouped`` is `True`.

    Each group is bound to its own directory at the time it is yielded,
    so groups may be collected first and consumed later.
    """
    fileRegex = None if file_filter is None else re.compile(file_filter)

    # Explicitly given files, only accumulated when grouping is requested.
    singles = []

    # Find all the files of interest
    for location in candidates:
        uri = ButlerURI(location)

        if not uri.isdir():
            if grouped:
                singles.append(uri)
            else:
                yield uri
            continue

        for found in uri.walk(fileRegex):
            if not found:
                # This means the uri does not exist and by
                # convention we ignore it
                continue
            root, _dirs, files = found
            if not files:
                continue
            if grouped:
                # Use map() rather than a generator expression: the bound
                # method root.join is captured right now, whereas a
                # generator expression would look up ``root`` lazily and
                # see the value from a later iteration if the caller
                # advanced this generator before consuming the group.
                yield map(root.join, files)
            else:
                for name in files:
                    yield root.join(name)

    # Finally, return any explicitly given files in one group
    if grouped and singles:
        yield iter(singles)