Coverage for python/lsst/daf/butler/core/_butlerUri/_butlerUri.py: 48%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import contextlib
25import concurrent.futures
26import urllib.parse
27import posixpath
28import copy
29import logging
30import re
31import shutil
32import tempfile
33import os
35from random import Random
36from pathlib import Path, PurePath, PurePosixPath
38__all__ = ('ButlerURI',)
40from typing import (
41 TYPE_CHECKING,
42 Any,
43 Iterable,
44 Iterator,
45 List,
46 Dict,
47 Optional,
48 Tuple,
49 Type,
50 Union,
51)
53from .utils import NoTransaction
55if TYPE_CHECKING: 55 ↛ 56line 55 didn't jump to line 56, because the condition on line 55 was never true
56 from ..datastore import DatastoreTransaction
59log = logging.getLogger(__name__)
61# Regex for looking for URI escapes
62ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")
64# Precomputed escaped hash
65ESCAPED_HASH = urllib.parse.quote("#")
67# Maximum number of worker threads for parallelized operations.
68# If greater than 10, be aware that this number has to be consistent
69# with connection pool sizing (for example in urllib3).
70MAX_WORKERS = 10
73class ButlerURI:
74 """Convenience wrapper around URI parsers.
76 Provides access to URI components and can convert file
77 paths into absolute path URIs. Scheme-less URIs are treated as if
78 they are local file system paths and are converted to absolute URIs.
80 A specialist subclass is created for each supported URI scheme.
82 Parameters
83 ----------
84 uri : `str` or `urllib.parse.ParseResult`
85 URI in string form. Can be scheme-less if referring to a local
86 filesystem path.
87 root : `str` or `ButlerURI`, optional
88 When fixing up a relative path in a ``file`` scheme or if scheme-less,
89 use this as the root. Must be absolute. If `None` the current
90 working directory will be used. Can be a file URI.
91 forceAbsolute : `bool`, optional
92 If `True`, scheme-less relative URI will be converted to an absolute
93 path using a ``file`` scheme. If `False` scheme-less URI will remain
94 scheme-less and will not be updated to ``file`` or absolute path.
95 forceDirectory: `bool`, optional
96 If `True` forces the URI to end with a separator, otherwise given URI
97 is interpreted as is.
98 isTemporary : `bool`, optional
99 If `True` indicates that this URI points to a temporary resource.
100 """
102 _pathLib: Type[PurePath] = PurePosixPath
103 """Path library to use for this scheme."""
105 _pathModule = posixpath
106 """Path module to use for this scheme."""
108 transferModes: Tuple[str, ...] = ("copy", "auto", "move")
109 """Transfer modes supported by this implementation.
111 Move is special in that it is generally a copy followed by an unlink.
112 Whether that unlink works depends critically on whether the source URI
113 implements unlink. If it does not the move will be reported as a failure.
114 """
116 transferDefault: str = "copy"
117 """Default mode to use for transferring if ``auto`` is specified."""
119 quotePaths = True
120 """True if path-like elements modifying a URI should be quoted.
122 All non-schemeless URIs have to internally use quoted paths. Therefore
123 if a new file name is given (e.g. to updatedFile or join) a decision must
124 be made whether to quote it to be consistent.
125 """
127 isLocal = False
128 """If `True` this URI refers to a local file."""
130 # This is not an ABC with abstract methods because the __new__ being
131 # a factory confuses mypy such that it assumes that every constructor
132 # returns a ButlerURI and then determines that all the abstract methods
133 # are still abstract. If they are not marked abstract but just raise
134 # mypy is fine with it.
136 # mypy is confused without these
137 _uri: urllib.parse.ParseResult
138 isTemporary: bool
139 dirLike: bool
141 def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI, Path],
142 root: Optional[Union[str, ButlerURI]] = None, forceAbsolute: bool = True,
143 forceDirectory: bool = False, isTemporary: bool = False) -> ButlerURI:
144 """Create and return new specialist ButlerURI subclass."""
145 parsed: urllib.parse.ParseResult
146 dirLike: bool = False
147 subclass: Optional[Type[ButlerURI]] = None
149 if isinstance(uri, os.PathLike): 149 ↛ 150line 149 didn't jump to line 150, because the condition on line 149 was never true
150 uri = str(uri)
152 # Record if we need to post process the URI components
153 # or if the instance is already fully configured
154 if isinstance(uri, str):
155 # Since local file names can have special characters in them
156 # we need to quote them for the parser but we can unquote
157 # later. Assume that all other URI schemes are quoted.
158 # Since sometimes people write file:/a/b and not file:///a/b
159 # we should not quote in the explicit case of file:
160 if "://" not in uri and not uri.startswith("file:"):
161 if ESCAPES_RE.search(uri): 161 ↛ 162line 161 didn't jump to line 162, because the condition on line 161 was never true
162 log.warning("Possible double encoding of %s", uri)
163 else:
164 uri = urllib.parse.quote(uri)
165 # Special case hash since we must support fragments
166 # even in schemeless URIs -- although try to only replace
167 # them in file part and not directory part
168 if ESCAPED_HASH in uri: 168 ↛ 169line 168 didn't jump to line 169, because the condition on line 168 was never true
169 dirpos = uri.rfind("/")
170 # Do replacement after this /
171 uri = uri[:dirpos+1] + uri[dirpos+1:].replace(ESCAPED_HASH, "#")
173 parsed = urllib.parse.urlparse(uri)
174 elif isinstance(uri, urllib.parse.ParseResult):
175 parsed = copy.copy(uri)
176 # If we are being instantiated with a subclass, rather than
177 # ButlerURI, ensure that that subclass is used directly.
178 # This could lead to inconsistencies if this constructor
179 # is used externally outside of the ButlerURI.replace() method.
180 # ButlerS3URI(urllib.parse.urlparse("file://a/b.txt"))
181 # will be a problem.
182 # This is needed to prevent a schemeless absolute URI become
183 # a file URI unexpectedly when calling updatedFile or
184 # updatedExtension
185 if cls is not ButlerURI:
186 parsed, dirLike = cls._fixDirectorySep(parsed, forceDirectory)
187 subclass = cls
189 elif isinstance(uri, ButlerURI): 189 ↛ 194line 189 didn't jump to line 194, because the condition on line 189 was never false
190 # Since ButlerURI is immutable we can return the argument
191 # unchanged.
192 return uri
193 else:
194 raise ValueError("Supplied URI must be string, Path, "
195 f"ButlerURI, or ParseResult but got '{uri!r}'")
197 if subclass is None:
198 # Work out the subclass from the URI scheme
199 if not parsed.scheme:
200 from .schemeless import ButlerSchemelessURI
201 subclass = ButlerSchemelessURI
202 elif parsed.scheme == "file": 202 ↛ 203line 202 didn't jump to line 203, because the condition on line 202 was never true
203 from .file import ButlerFileURI
204 subclass = ButlerFileURI
205 elif parsed.scheme == "s3": 205 ↛ 206line 205 didn't jump to line 206, because the condition on line 205 was never true
206 from .s3 import ButlerS3URI
207 subclass = ButlerS3URI
208 elif parsed.scheme.startswith("http"): 208 ↛ 209line 208 didn't jump to line 209, because the condition on line 208 was never true
209 from .http import ButlerHttpURI
210 subclass = ButlerHttpURI
211 elif parsed.scheme == "resource": 211 ↛ 215line 211 didn't jump to line 215, because the condition on line 211 was never false
212 # Rules for scheme names disallow pkg_resource
213 from .packageresource import ButlerPackageResourceURI
214 subclass = ButlerPackageResourceURI
215 elif parsed.scheme == "mem":
216 # in-memory datastore object
217 from .mem import ButlerInMemoryURI
218 subclass = ButlerInMemoryURI
219 else:
220 raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'"
221 " in {parsed.geturl()}")
223 parsed, dirLike = subclass._fixupPathUri(parsed, root=root,
224 forceAbsolute=forceAbsolute,
225 forceDirectory=forceDirectory)
227 # It is possible for the class to change from schemeless
228 # to file so handle that
229 if parsed.scheme == "file": 229 ↛ 230line 229 didn't jump to line 230, because the condition on line 229 was never true
230 from .file import ButlerFileURI
231 subclass = ButlerFileURI
233 # Now create an instance of the correct subclass and set the
234 # attributes directly
235 self = object.__new__(subclass)
236 self._uri = parsed
237 self.dirLike = dirLike
238 self.isTemporary = isTemporary
239 return self
241 @property
242 def scheme(self) -> str:
243 """Return the URI scheme.
245 Notes
246 -----
247 (``://`` is not part of the scheme).
248 """
249 return self._uri.scheme
251 @property
252 def netloc(self) -> str:
253 """Return the URI network location."""
254 return self._uri.netloc
256 @property
257 def path(self) -> str:
258 """Return the path component of the URI."""
259 return self._uri.path
261 @property
262 def unquoted_path(self) -> str:
263 """Return path component of the URI with any URI quoting reversed."""
264 return urllib.parse.unquote(self._uri.path)
266 @property
267 def ospath(self) -> str:
268 """Return the path component of the URI localized to current OS."""
269 raise AttributeError(f"Non-file URI ({self}) has no local OS path.")
271 @property
272 def relativeToPathRoot(self) -> str:
273 """Return path relative to network location.
275 Effectively, this is the path property with posix separator stripped
276 from the left hand side of the path.
278 Always unquotes.
279 """
280 p = self._pathLib(self.path)
281 relToRoot = str(p.relative_to(p.root))
282 if self.dirLike and not relToRoot.endswith("/"): 282 ↛ 283line 282 didn't jump to line 283, because the condition on line 282 was never true
283 relToRoot += "/"
284 return urllib.parse.unquote(relToRoot)
286 @property
287 def is_root(self) -> bool:
288 """Return whether this URI points to the root of the network location.
290 This means that the path components refers to the top level.
291 """
292 relpath = self.relativeToPathRoot
293 if relpath == "./":
294 return True
295 return False
297 @property
298 def fragment(self) -> str:
299 """Return the fragment component of the URI."""
300 return self._uri.fragment
302 @property
303 def params(self) -> str:
304 """Return any parameters included in the URI."""
305 return self._uri.params
307 @property
308 def query(self) -> str:
309 """Return any query strings included in the URI."""
310 return self._uri.query
312 def geturl(self) -> str:
313 """Return the URI in string form.
315 Returns
316 -------
317 url : `str`
318 String form of URI.
319 """
320 return self._uri.geturl()
322 def root_uri(self) -> ButlerURI:
323 """Return the base root URI.
325 Returns
326 -------
327 uri : `ButlerURI`
328 root URI.
329 """
330 return self.replace(path="", forceDirectory=True)
332 def split(self) -> Tuple[ButlerURI, str]:
333 """Split URI into head and tail.
335 Returns
336 -------
337 head: `ButlerURI`
338 Everything leading up to tail, expanded and normalized as per
339 ButlerURI rules.
340 tail : `str`
341 Last `self.path` component. Tail will be empty if path ends on a
342 separator. Tail will never contain separators. It will be
343 unquoted.
345 Notes
346 -----
347 Equivalent to `os.path.split()` where head preserves the URI
348 components.
349 """
350 head, tail = self._pathModule.split(self.path)
351 headuri = self._uri._replace(path=head)
353 # The file part should never include quoted metacharacters
354 tail = urllib.parse.unquote(tail)
356 # Schemeless is special in that it can be a relative path
357 # We need to ensure that it stays that way. All other URIs will
358 # be absolute already.
359 forceAbsolute = self._pathModule.isabs(self.path)
360 return ButlerURI(headuri, forceDirectory=True, forceAbsolute=forceAbsolute), tail
362 def basename(self) -> str:
363 """Return the base name, last element of path, of the URI.
365 Returns
366 -------
367 tail : `str`
368 Last part of the path attribute. Trail will be empty if path ends
369 on a separator.
371 Notes
372 -----
373 If URI ends on a slash returns an empty string. This is the second
374 element returned by `split()`.
376 Equivalent of `os.path.basename()``.
377 """
378 return self.split()[1]
380 def dirname(self) -> ButlerURI:
381 """Return the directory component of the path as a new `ButlerURI`.
383 Returns
384 -------
385 head : `ButlerURI`
386 Everything except the tail of path attribute, expanded and
387 normalized as per ButlerURI rules.
389 Notes
390 -----
391 Equivalent of `os.path.dirname()`.
392 """
393 return self.split()[0]
395 def parent(self) -> ButlerURI:
396 """Return a `ButlerURI` of the parent directory.
398 Returns
399 -------
400 head : `ButlerURI`
401 Everything except the tail of path attribute, expanded and
402 normalized as per `ButlerURI` rules.
404 Notes
405 -----
406 For a file-like URI this will be the same as calling `dirname()`.
407 """
408 # When self is file-like, return self.dirname()
409 if not self.dirLike:
410 return self.dirname()
411 # When self is dir-like, return its parent directory,
412 # regardless of the presence of a trailing separator
413 originalPath = self._pathLib(self.path)
414 parentPath = originalPath.parent
415 return self.replace(path=str(parentPath), forceDirectory=True)
417 def replace(self, forceDirectory: bool = False, isTemporary: bool = False, **kwargs: Any) -> ButlerURI:
418 """Return new `ButlerURI` with specified components replaced.
420 Parameters
421 ----------
422 forceDirectory : `bool`, optional
423 Parameter passed to ButlerURI constructor to force this
424 new URI to be dir-like.
425 isTemporary : `bool`, optional
426 Indicate that the resulting URI is temporary resource.
427 **kwargs
428 Components of a `urllib.parse.ParseResult` that should be
429 modified for the newly-created `ButlerURI`.
431 Returns
432 -------
433 new : `ButlerURI`
434 New `ButlerURI` object with updated values.
436 Notes
437 -----
438 Does not, for now, allow a change in URI scheme.
439 """
440 # Disallow a change in scheme
441 if "scheme" in kwargs: 441 ↛ 442line 441 didn't jump to line 442, because the condition on line 441 was never true
442 raise ValueError(f"Can not use replace() method to change URI scheme for {self}")
443 return self.__class__(self._uri._replace(**kwargs), forceDirectory=forceDirectory,
444 isTemporary=isTemporary)
446 def updatedFile(self, newfile: str) -> ButlerURI:
447 """Return new URI with an updated final component of the path.
449 Parameters
450 ----------
451 newfile : `str`
452 File name with no path component.
454 Returns
455 -------
456 updated : `ButlerURI`
458 Notes
459 -----
460 Forces the ButlerURI.dirLike attribute to be false. The new file path
461 will be quoted if necessary.
462 """
463 if self.quotePaths:
464 newfile = urllib.parse.quote(newfile)
465 dir, _ = self._pathModule.split(self.path)
466 newpath = self._pathModule.join(dir, newfile)
468 updated = self.replace(path=newpath)
469 updated.dirLike = False
470 return updated
472 def updatedExtension(self, ext: Optional[str]) -> ButlerURI:
473 """Return a new `ButlerURI` with updated file extension.
475 All file extensions are replaced.
477 Parameters
478 ----------
479 ext : `str` or `None`
480 New extension. If an empty string is given any extension will
481 be removed. If `None` is given there will be no change.
483 Returns
484 -------
485 updated : `ButlerURI`
486 URI with the specified extension. Can return itself if
487 no extension was specified.
488 """
489 if ext is None:
490 return self
492 # Get the extension
493 current = self.getExtension()
495 # Nothing to do if the extension already matches
496 if current == ext:
497 return self
499 # Remove the current extension from the path
500 # .fits.gz counts as one extension do not use os.path.splitext
501 path = self.path
502 if current:
503 path = path[:-len(current)]
505 # Ensure that we have a leading "." on file extension (and we do not
506 # try to modify the empty string)
507 if ext and not ext.startswith("."):
508 ext = "." + ext
510 return self.replace(path=path + ext)
512 def getExtension(self) -> str:
513 """Return the file extension(s) associated with this URI path.
515 Returns
516 -------
517 ext : `str`
518 The file extension (including the ``.``). Can be empty string
519 if there is no file extension. Usually returns only the last
520 file extension unless there is a special extension modifier
521 indicating file compression, in which case the combined
522 extension (e.g. ``.fits.gz``) will be returned.
523 """
524 special = {".gz", ".bz2", ".xz", ".fz"}
526 # Get the file part of the path so as not to be confused by
527 # "." in directory names.
528 basename = self.basename()
529 extensions = self._pathLib(basename).suffixes
531 if not extensions: 531 ↛ 532line 531 didn't jump to line 532, because the condition on line 531 was never true
532 return ""
534 ext = extensions.pop()
536 # Multiple extensions, decide whether to include the final two
537 if extensions and ext in special: 537 ↛ 538line 537 didn't jump to line 538, because the condition on line 537 was never true
538 ext = f"{extensions[-1]}{ext}"
540 return ext
542 def join(self, path: Union[str, ButlerURI], isTemporary: bool = False) -> ButlerURI:
543 """Return new `ButlerURI` with additional path components.
545 Parameters
546 ----------
547 path : `str`, `ButlerURI`
548 Additional file components to append to the current URI. Assumed
549 to include a file at the end. Will be quoted depending on the
550 associated URI scheme. If the path looks like a URI with a scheme
551 referring to an absolute location, it will be returned
552 directly (matching the behavior of `os.path.join()`). It can
553 also be a `ButlerURI`.
554 isTemporary : `bool`, optional
555 Indicate that the resulting URI represents a temporary resource.
557 Returns
558 -------
559 new : `ButlerURI`
560 New URI with any file at the end replaced with the new path
561 components.
563 Notes
564 -----
565 Schemeless URIs assume local path separator but all other URIs assume
566 POSIX separator if the supplied path has directory structure. It
567 may be this never becomes a problem but datastore templates assume
568 POSIX separator is being used.
570 If an absolute `ButlerURI` is given for ``path`` is is assumed that
571 this should be returned directly. Giving a ``path`` of an absolute
572 scheme-less URI is not allowed for safety reasons as it may indicate
573 a mistake in the calling code.
575 Raises
576 ------
577 ValueError
578 Raised if the ``path`` is an absolute scheme-less URI. In that
579 situation it is unclear whether the intent is to return a
580 ``file`` URI or it was a mistake and a relative scheme-less URI
581 was meant.
582 """
583 # If we have a full URI in path we will use it directly
584 # but without forcing to absolute so that we can trap the
585 # expected option of relative path.
586 path_uri = ButlerURI(path, forceAbsolute=False)
587 if path_uri.scheme: 587 ↛ 590line 587 didn't jump to line 590, because the condition on line 587 was never true
588 # Check for scheme so can distinguish explicit URIs from
589 # absolute scheme-less URIs.
590 return path_uri
592 if path_uri.isabs(): 592 ↛ 594line 592 didn't jump to line 594, because the condition on line 592 was never true
593 # Absolute scheme-less path.
594 raise ValueError(f"Can not join absolute scheme-less {path_uri!r} to another URI.")
596 # If this was originally a ButlerURI extract the unquoted path from it.
597 # Otherwise we use the string we were given to allow "#" to appear
598 # in the filename if given as a plain string.
599 if not isinstance(path, str): 599 ↛ 600line 599 didn't jump to line 600, because the condition on line 599 was never true
600 path = path_uri.unquoted_path
602 new = self.dirname() # By definition a directory URI
604 # new should be asked about quoting, not self, since dirname can
605 # change the URI scheme for schemeless -> file
606 if new.quotePaths: 606 ↛ 609line 606 didn't jump to line 609, because the condition on line 606 was never false
607 path = urllib.parse.quote(path)
609 newpath = self._pathModule.normpath(self._pathModule.join(new.path, path))
611 # normpath can strip trailing / so we force directory if the supplied
612 # path ended with a /
613 return new.replace(path=newpath, forceDirectory=path.endswith(self._pathModule.sep),
614 isTemporary=isTemporary)
616 def relative_to(self, other: ButlerURI) -> Optional[str]:
617 """Return the relative path from this URI to the other URI.
619 Parameters
620 ----------
621 other : `ButlerURI`
622 URI to use to calculate the relative path. Must be a parent
623 of this URI.
625 Returns
626 -------
627 subpath : `str`
628 The sub path of this URI relative to the supplied other URI.
629 Returns `None` if there is no parent child relationship.
630 Scheme and netloc must match.
631 """
632 # Scheme-less absolute other is treated as if it's a file scheme.
633 # Scheme-less relative other can only return non-None if self
634 # is also scheme-less relative and that is handled specifically
635 # in a subclass.
636 if not other.scheme and other.isabs():
637 other = other.abspath()
639 # Scheme-less self is handled elsewhere.
640 if self.scheme != other.scheme or self.netloc != other.netloc:
641 return None
643 enclosed_path = self._pathLib(self.relativeToPathRoot)
644 parent_path = other.relativeToPathRoot
645 subpath: Optional[str]
646 try:
647 subpath = str(enclosed_path.relative_to(parent_path))
648 except ValueError:
649 subpath = None
650 else:
651 subpath = urllib.parse.unquote(subpath)
652 return subpath
654 def exists(self) -> bool:
655 """Indicate that the resource is available.
657 Returns
658 -------
659 exists : `bool`
660 `True` if the resource exists.
661 """
662 raise NotImplementedError()
664 @classmethod
665 def mexists(cls, uris: Iterable[ButlerURI]) -> Dict[ButlerURI, bool]:
666 """Check for existence of multiple URIs at once.
668 Parameters
669 ----------
670 uris : iterable of `ButlerURI`
671 The URIs to test.
673 Returns
674 -------
675 existence : `dict` of [`ButlerURI`, `bool`]
676 Mapping of original URI to boolean indicating existence.
677 """
678 exists_executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
679 future_exists = {exists_executor.submit(uri.exists): uri for uri in uris}
681 results: Dict[ButlerURI, bool] = {}
682 for future in concurrent.futures.as_completed(future_exists):
683 uri = future_exists[future]
684 try:
685 exists = future.result()
686 except Exception:
687 exists = False
688 results[uri] = exists
689 return results
691 def remove(self) -> None:
692 """Remove the resource."""
693 raise NotImplementedError()
695 def isabs(self) -> bool:
696 """Indicate that the resource is fully specified.
698 For non-schemeless URIs this is always true.
700 Returns
701 -------
702 isabs : `bool`
703 `True` in all cases except schemeless URI.
704 """
705 return True
707 def abspath(self) -> ButlerURI:
708 """Return URI using an absolute path.
710 Returns
711 -------
712 abs : `ButlerURI`
713 Absolute URI. For non-schemeless URIs this always returns itself.
714 Schemeless URIs are upgraded to file URIs.
715 """
716 return self
718 def _as_local(self) -> Tuple[str, bool]:
719 """Return the location of the (possibly remote) resource as local file.
721 This is a helper function for `as_local` context manager.
723 Returns
724 -------
725 path : `str`
726 If this is a remote resource, it will be a copy of the resource
727 on the local file system, probably in a temporary directory.
728 For a local resource this should be the actual path to the
729 resource.
730 is_temporary : `bool`
731 Indicates if the local path is a temporary file or not.
732 """
733 raise NotImplementedError()
735 @contextlib.contextmanager
736 def as_local(self) -> Iterator[ButlerURI]:
737 """Return the location of the (possibly remote) resource as local file.
739 Yields
740 ------
741 local : `ButlerURI`
742 If this is a remote resource, it will be a copy of the resource
743 on the local file system, probably in a temporary directory.
744 For a local resource this should be the actual path to the
745 resource.
747 Notes
748 -----
749 The context manager will automatically delete any local temporary
750 file.
752 Examples
753 --------
754 Should be used as a context manager:
756 .. code-block:: py
758 with uri.as_local() as local:
759 ospath = local.ospath
760 """
761 local_src, is_temporary = self._as_local()
762 local_uri = ButlerURI(local_src, isTemporary=is_temporary)
764 try:
765 yield local_uri
766 finally:
767 # The caller might have relocated the temporary file
768 if is_temporary and local_uri.exists():
769 local_uri.remove()
771 @classmethod
772 @contextlib.contextmanager
773 def temporary_uri(cls, prefix: Optional[ButlerURI] = None,
774 suffix: Optional[str] = None) -> Iterator[ButlerURI]:
775 """Create a temporary URI.
777 Parameters
778 ----------
779 prefix : `ButlerURI`, optional
780 Prefix to use. Without this the path will be formed as a local
781 file URI in a temporary directory. Ensuring that the prefix
782 location exists is the responsibility of the caller.
783 suffix : `str`, optional
784 A file suffix to be used. The ``.`` should be included in this
785 suffix.
787 Yields
788 ------
789 uri : `ButlerURI`
790 The temporary URI. Will be removed when the context is completed.
791 """
792 use_tempdir = False
793 if prefix is None:
794 prefix = ButlerURI(tempfile.mkdtemp(), forceDirectory=True, isTemporary=True)
795 # Record that we need to delete this directory. Can not rely
796 # on isTemporary flag since an external prefix may have that
797 # set as well.
798 use_tempdir = True
800 # Need to create a randomized file name. For consistency do not
801 # use mkstemp for local and something else for remote. Additionally
802 # this method does not create the file to prevent name clashes.
803 characters = "abcdefghijklmnopqrstuvwxyz0123456789_"
804 rng = Random()
805 tempname = "".join(rng.choice(characters) for _ in range(16))
806 if suffix:
807 tempname += suffix
808 temporary_uri = prefix.join(tempname, isTemporary=True)
810 try:
811 yield temporary_uri
812 finally:
813 if use_tempdir:
814 shutil.rmtree(prefix.ospath, ignore_errors=True)
815 else:
816 try:
817 # It's okay if this does not work because the user removed
818 # the file.
819 temporary_uri.remove()
820 except FileNotFoundError:
821 pass
823 def read(self, size: int = -1) -> bytes:
824 """Open the resource and return the contents in bytes.
826 Parameters
827 ----------
828 size : `int`, optional
829 The number of bytes to read. Negative or omitted indicates
830 that all data should be read.
831 """
832 raise NotImplementedError()
834 def write(self, data: bytes, overwrite: bool = True) -> None:
835 """Write the supplied bytes to the new resource.
837 Parameters
838 ----------
839 data : `bytes`
840 The bytes to write to the resource. The entire contents of the
841 resource will be replaced.
842 overwrite : `bool`, optional
843 If `True` the resource will be overwritten if it exists. Otherwise
844 the write will fail.
845 """
846 raise NotImplementedError()
848 def mkdir(self) -> None:
849 """For a dir-like URI, create the directory resource if needed."""
850 raise NotImplementedError()
852 def isdir(self) -> bool:
853 """Return True if this URI looks like a directory, else False."""
854 return self.dirLike
856 def size(self) -> int:
857 """For non-dir-like URI, return the size of the resource.
859 Returns
860 -------
861 sz : `int`
862 The size in bytes of the resource associated with this URI.
863 Returns 0 if dir-like.
864 """
865 raise NotImplementedError()
867 def __str__(self) -> str:
868 """Convert the URI to its native string form."""
869 return self.geturl()
871 def __repr__(self) -> str:
872 """Return string representation suitable for evaluation."""
873 return f'ButlerURI("{self.geturl()}")'
875 def __eq__(self, other: Any) -> bool:
876 """Compare supplied object with this `ButlerURI`."""
877 if not isinstance(other, ButlerURI):
878 return NotImplemented
879 return self.geturl() == other.geturl()
881 def __hash__(self) -> int:
882 """Return hash of this object."""
883 return hash(str(self))
885 def __copy__(self) -> ButlerURI:
886 """Copy constructor.
888 Object is immutable so copy can return itself.
889 """
890 # Implement here because the __new__ method confuses things
891 return self
893 def __deepcopy__(self, memo: Any) -> ButlerURI:
894 """Deepcopy the object.
896 Object is immutable so copy can return itself.
897 """
898 # Implement here because the __new__ method confuses things
899 return self
901 def __getnewargs__(self) -> Tuple:
902 """Support pickling."""
903 return (str(self),)
905 @classmethod
906 def _fixDirectorySep(cls, parsed: urllib.parse.ParseResult,
907 forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
908 """Ensure that a path separator is present on directory paths.
910 Parameters
911 ----------
912 parsed : `~urllib.parse.ParseResult`
913 The result from parsing a URI using `urllib.parse`.
914 forceDirectory : `bool`, optional
915 If `True` forces the URI to end with a separator, otherwise given
916 URI is interpreted as is. Specifying that the URI is conceptually
917 equivalent to a directory can break some ambiguities when
918 interpreting the last element of a path.
920 Returns
921 -------
922 modified : `~urllib.parse.ParseResult`
923 Update result if a URI is being handled.
924 dirLike : `bool`
925 `True` if given parsed URI has a trailing separator or
926 forceDirectory is True. Otherwise `False`.
927 """
928 # assume we are not dealing with a directory like URI
929 dirLike = False
931 # Directory separator
932 sep = cls._pathModule.sep
934 # URI is dir-like if explicitly stated or if it ends on a separator
935 endsOnSep = parsed.path.endswith(sep)
936 if forceDirectory or endsOnSep:
937 dirLike = True
938 # only add the separator if it's not already there
939 if not endsOnSep: 939 ↛ 942line 939 didn't jump to line 942, because the condition on line 939 was never false
940 parsed = parsed._replace(path=parsed.path+sep)
942 return parsed, dirLike
@classmethod
def _fixupPathUri(
    cls,
    parsed: urllib.parse.ParseResult,
    root: Optional[Union[str, ButlerURI]] = None,
    forceAbsolute: bool = False,
    forceDirectory: bool = False,
) -> Tuple[urllib.parse.ParseResult, bool]:
    """Apply the generic corrections to a parsed URI.

    Parameters
    ----------
    parsed : `~urllib.parse.ParseResult`
        Components of the URI as produced by `urllib.parse`.
    root : `str` or `ButlerURI`, ignored
        Accepted for interface compatibility only; this implementation
        never reads it.
    forceAbsolute : `bool`, ignored
        Accepted for interface compatibility only; this implementation
        never reads it.
    forceDirectory : `bool`, optional
        If `True` the URI is treated as a directory and is made to end
        with a path separator. Otherwise the URI is interpreted as
        given. Declaring a URI to be directory-like removes the ambiguity
        over how to interpret the final element of the path.

    Returns
    -------
    modified : `~urllib.parse.ParseResult`
        The parse result, updated if a separator had to be appended.
    dirLike : `bool`
        `True` if the URI ends with a separator or ``forceDirectory``
        was `True`. Otherwise `False`.

    Notes
    -----
    The only correction made here is the one performed by
    ``_fixDirectorySep``; this implementation does not itself convert
    relative paths to absolute ones or normalize scheme-less paths.

    The trailing separator is significant because AWS S3 differentiates
    between keys with and without a trailing POSIX separator (i.e.
    ``/dir`` and ``/dir/``) whereas POSIX does not necessarily.
    """
    # ``root`` and ``forceAbsolute`` are deliberately unused: the sole fix-up
    # for a generic URI is the directory separator handling.
    fixed = cls._fixDirectorySep(parsed, forceDirectory)
    return fixed
def transfer_from(
    self,
    src: ButlerURI,
    transfer: str,
    overwrite: bool = False,
    transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None,
) -> None:
    """Transfer a resource from another URI to this one.

    Parameters
    ----------
    src : `ButlerURI`
        Source URI.
    transfer : `str`
        Mode to use for transferring the resource. Generically there are
        many standard options: copy, link, symlink, hardlink, relsymlink.
        Not all URIs support all modes.
    overwrite : `bool`, optional
        Allow an existing file to be overwritten. Defaults to `False`.
    transaction : `DatastoreTransaction`, optional
        A transaction object that can (depending on implementation)
        rollback transfers on error. Not guaranteed to be implemented.

    Raises
    ------
    NotImplementedError
        Always raised by this implementation, which supports no transfer
        modes. The message names this URI's scheme.

    Notes
    -----
    Conceptually this is hard to scale as the number of URI schemes
    grows. The destination URI is more important than the source URI
    since that is where all the transfer modes are relevant (with the
    complication that "move" deletes the source).

    Local file to local file is the fundamental use case, but every
    other scheme has to support "copy" to local file (with implicit
    support for "move") and copy from local file.
    All the "link" options tend to be specific to local file systems.

    "move" is a "copy" where the remote resource is deleted at the end.
    Whether this works depends on the source URI rather than the
    destination URI. Reverting a move on transaction rollback is
    expected to be problematic if a remote resource was involved.
    """
    # None of the arguments are inspected; only the scheme of this URI is
    # reported back to the caller.
    reason = f"No transfer modes supported by URI scheme {self.scheme}"
    raise NotImplementedError(reason)
def walk(
    self, file_filter: Optional[Union[str, re.Pattern]] = None
) -> Iterator[Union[List, Tuple[ButlerURI, List[str], List[str]]]]:
    """Walk the directory tree returning matching files and directories.

    Parameters
    ----------
    file_filter : `str` or `re.Pattern`, optional
        Regex used to filter the file names before they are returned.

    Yields
    ------
    dirpath : `ButlerURI`
        Current directory being examined.
    dirnames : `list` of `str`
        Names of subdirectories within dirpath.
    filenames : `list` of `str`
        Names of all the files within dirpath.

    Raises
    ------
    NotImplementedError
        Always raised by this implementation, which provides no directory
        walking.

    Notes
    -----
    The return annotation also permits a bare `list`;
    ``findFileResources`` skips any empty value it receives from this
    method, treating it as a URI that does not exist.
    """
    # This is not a generator (there is no ``yield``), so the error is
    # raised at call time rather than on first iteration.
    raise NotImplementedError
@classmethod
def findFileResources(
    cls,
    candidates: Iterable[Union[str, ButlerURI]],
    file_filter: Optional[Union[str, re.Pattern]] = None,
    grouped: bool = False,
) -> Iterator[Union[ButlerURI, Iterator[ButlerURI]]]:
    """Get all the files from a list of values.

    Parameters
    ----------
    candidates : iterable [`str` or `ButlerURI`]
        The files to return and directories in which to look for files to
        return.
    file_filter : `str` or `re.Pattern`, optional
        The regex to use when searching for files within directories.
        By default returns all the found files.
    grouped : `bool`, optional
        If `True` the results will be grouped by directory and each
        yielded value will be an iterator over URIs. If `False` each
        URI will be returned separately.

    Yields
    ------
    found_file : `ButlerURI`
        The passed-in URIs and URIs found in passed-in directories.
        If grouping is enabled, each of the yielded values will be an
        iterator yielding members of the group. Files given explicitly
        will be returned as a single group at the end.

    Notes
    -----
    Without grouping, a value that is a file is yielded as soon as it is
    encountered. If a value is a directory, all the files in the
    directory (recursively) that match the regex will be yielded in turn.

    Each group iterator is bound to its own directory when it is created,
    so groups may be collected first and consumed later, in any order.
    """
    # re.compile() accepts an already-compiled pattern and returns it as-is.
    fileRegex = None if file_filter is None else re.compile(file_filter)

    # Explicitly-given files; only populated when grouping is requested so
    # that they can be emitted together as the final group.
    singles = []

    # Find all the files of interest
    for location in candidates:
        uri = ButlerURI(location)

        if not uri.isdir():
            if grouped:
                singles.append(uri)
            else:
                yield uri
            continue

        for found in uri.walk(fileRegex):
            if not found:
                # This means the uri does not exist and by
                # convention we ignore it
                continue
            root, _, files = found
            if not files:
                continue
            if grouped:
                # map() captures the bound ``root.join`` immediately while
                # still producing URIs lazily. A generator expression would
                # look ``root`` up only when iterated, by which time this
                # loop may have rebound it to a different directory.
                yield map(root.join, files)
            else:
                for name in files:
                    yield root.join(name)

    # Finally, return any explicitly given files in one group
    if grouped and singles:
        yield iter(singles)