Coverage for python/lsst/daf/butler/core/_butlerUri/_butlerUri.py : 57%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import contextlib
25import urllib.parse
26import posixpath
27import copy
28import logging
29import re
31from pathlib import PurePath, PurePosixPath
33__all__ = ('ButlerURI',)
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 Iterator,
39 Optional,
40 Tuple,
41 Type,
42 Union,
43)
45from .utils import NoTransaction
47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true
48 from ..datastore import DatastoreTransaction
51log = logging.getLogger(__name__)
53# Regex for looking for URI escapes
54ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")
class ButlerURI:
    """Convenience wrapper around URI parsers.

    Provides access to URI components and can convert file
    paths into absolute path URIs. Scheme-less URIs are treated as if
    they are local file system paths and are converted to absolute URIs.

    A specialist subclass is created for each supported URI scheme.

    Parameters
    ----------
    uri : `str`, `urllib.parse.ParseResult` or `ButlerURI`
        URI in string form. Can be scheme-less if referring to a local
        filesystem path.
    root : `str` or `ButlerURI`, optional
        When fixing up a relative path in a ``file`` scheme or if scheme-less,
        use this as the root. Must be absolute. If `None` the current
        working directory will be used. Can be a file URI.
    forceAbsolute : `bool`, optional
        If `True`, scheme-less relative URI will be converted to an absolute
        path using a ``file`` scheme. If `False` scheme-less URI will remain
        scheme-less and will not be updated to ``file`` or absolute path.
    forceDirectory : `bool`, optional
        If `True` forces the URI to end with a separator, otherwise given URI
        is interpreted as is.
    isTemporary : `bool`, optional
        If `True` indicates that this URI points to a temporary resource.

    Raises
    ------
    ValueError
        Raised if ``uri`` is not a `str`, `urllib.parse.ParseResult` or
        `ButlerURI`.
    NotImplementedError
        Raised if the URI scheme is not one of the supported schemes.
    """

    _pathLib: Type[PurePath] = PurePosixPath
    """Path library to use for this scheme."""

    _pathModule = posixpath
    """Path module to use for this scheme."""

    transferModes: Tuple[str, ...] = ("copy", "auto", "move")
    """Transfer modes supported by this implementation.

    Move is special in that it is generally a copy followed by an unlink.
    Whether that unlink works depends critically on whether the source URI
    implements unlink. If it does not the move will be reported as a failure.
    """

    transferDefault: str = "copy"
    """Default mode to use for transferring if ``auto`` is specified."""

    quotePaths = True
    """True if path-like elements modifying a URI should be quoted.

    All non-schemeless URIs have to internally use quoted paths. Therefore
    if a new file name is given (e.g. to updateFile or join) a decision must
    be made whether to quote it to be consistent.
    """

    isLocal = False
    """If `True` this URI refers to a local file."""

    # This is not an ABC with abstract methods because the __new__ being
    # a factory confuses mypy such that it assumes that every constructor
    # returns a ButlerURI and then determines that all the abstract methods
    # are still abstract. If they are not marked abstract but just raise
    # mypy is fine with it.

    # mypy is confused without these
    _uri: urllib.parse.ParseResult
    isTemporary: bool
    dirLike: bool

    def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI],
                root: Optional[Union[str, ButlerURI]] = None, forceAbsolute: bool = True,
                forceDirectory: bool = False, isTemporary: bool = False) -> ButlerURI:
        """Create an instance of the subclass matching the URI scheme.

        The parameters are described in the class docstring. This method
        acts as a factory: the returned object is an instance of the
        scheme-specific subclass, not necessarily of ``cls``.
        """
        parsed: urllib.parse.ParseResult
        dirLike: bool = False
        subclass: Optional[Type] = None

        # Record if we need to post process the URI components
        # or if the instance is already fully configured
        if isinstance(uri, str):
            # Since local file names can have special characters in them
            # we need to quote them for the parser but we can unquote
            # later. Assume that all other URI schemes are quoted.
            # Since sometimes people write file:/a/b and not file:///a/b
            # we should not quote in the explicit case of file:
            if "://" not in uri and not uri.startswith("file:"):
                if ESCAPES_RE.search(uri):
                    log.warning("Possible double encoding of %s", uri)
                else:
                    uri = urllib.parse.quote(uri)
            parsed = urllib.parse.urlparse(uri)
        elif isinstance(uri, urllib.parse.ParseResult):
            parsed = copy.copy(uri)
        elif isinstance(uri, ButlerURI):
            parsed = copy.copy(uri._uri)
            dirLike = uri.dirLike
            # No further parsing required and we know the subclass
            subclass = type(uri)
        else:
            raise ValueError(f"Supplied URI must be string, ButlerURI, or ParseResult but got '{uri!r}'")

        if subclass is None:
            # Work out the subclass from the URI scheme. The imports are
            # done lazily, inside each branch, as in the original code.
            if not parsed.scheme:
                from .schemeless import ButlerSchemelessURI
                subclass = ButlerSchemelessURI
            elif parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI
            elif parsed.scheme == "s3":
                from .s3 import ButlerS3URI
                subclass = ButlerS3URI
            elif parsed.scheme.startswith("http"):
                from .http import ButlerHttpURI
                subclass = ButlerHttpURI
            elif parsed.scheme == "resource":
                # Rules for scheme names disallow pkg_resource
                from .packageresource import ButlerPackageResourceURI
                subclass = ButlerPackageResourceURI
            elif parsed.scheme == "mem":
                # in-memory datastore object
                from .mem import ButlerInMemoryURI
                subclass = ButlerInMemoryURI
            else:
                # Both halves of the message must be f-strings, otherwise
                # the offending URI is not interpolated into the message.
                raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'"
                                          f" in {parsed.geturl()}")

            parsed, dirLike = subclass._fixupPathUri(parsed, root=root,
                                                     forceAbsolute=forceAbsolute,
                                                     forceDirectory=forceDirectory)

            # It is possible for the class to change from schemeless
            # to file so handle that
            if parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI

        # Now create an instance of the correct subclass and set the
        # attributes directly
        self = object.__new__(subclass)
        self._uri = parsed
        self.dirLike = dirLike
        self.isTemporary = isTemporary
        return self

    @property
    def scheme(self) -> str:
        """The URI scheme (``://`` is not part of the scheme)."""
        return self._uri.scheme

    @property
    def netloc(self) -> str:
        """The URI network location."""
        return self._uri.netloc

    @property
    def path(self) -> str:
        """The path component of the URI."""
        return self._uri.path

    @property
    def unquoted_path(self) -> str:
        """The path component of the URI with any URI quoting reversed."""
        return urllib.parse.unquote(self._uri.path)

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS.

        This base implementation always raises `AttributeError`.
        """
        raise AttributeError(f"Non-file URI ({self}) has no local OS path.")

    @property
    def relativeToPathRoot(self) -> str:
        """Returns path relative to network location.

        Effectively, this is the path property with posix separator stripped
        from the left hand side of the path.

        Always unquotes.
        """
        p = self._pathLib(self.path)
        relToRoot = str(p.relative_to(p.root))
        # The path library drops any trailing separator so restore it for
        # directory-like URIs.
        if self.dirLike and not relToRoot.endswith("/"):
            relToRoot += "/"
        return urllib.parse.unquote(relToRoot)

    @property
    def is_root(self) -> bool:
        """`True` if this URI points to the root of the network location.

        This means that the path components refers to the top level.
        """
        return self.relativeToPathRoot == "./"

    @property
    def fragment(self) -> str:
        """The fragment component of the URI."""
        return self._uri.fragment

    @property
    def params(self) -> str:
        """Any parameters included in the URI."""
        return self._uri.params

    @property
    def query(self) -> str:
        """Any query strings included in the URI."""
        return self._uri.query

    def geturl(self) -> str:
        """Return the URI in string form.

        Returns
        -------
        url : `str`
            String form of URI.
        """
        return self._uri.geturl()

    def split(self) -> Tuple[ButlerURI, str]:
        """Splits URI into head and tail. Equivalent to os.path.split where
        head preserves the URI components.

        Returns
        -------
        head : `ButlerURI`
            Everything leading up to tail, expanded and normalized as per
            ButlerURI rules.
        tail : `str`
            Last `self.path` component. Tail will be empty if path ends on a
            separator. Tail will never contain separators. It will be
            unquoted.
        """
        head, tail = self._pathModule.split(self.path)
        headuri = self._uri._replace(path=head)

        # The file part should never include quoted metacharacters
        tail = urllib.parse.unquote(tail)

        # Schemeless is special in that it can be a relative path
        # We need to ensure that it stays that way. All other URIs will
        # be absolute already.
        forceAbsolute = self._pathModule.isabs(self.path)
        return ButlerURI(headuri, forceDirectory=True, forceAbsolute=forceAbsolute), tail

    def basename(self) -> str:
        """Returns the base name, last element of path, of the URI. If URI ends
        on a slash returns an empty string. This is the second element returned
        by split().

        Equivalent of os.path.basename().

        Returns
        -------
        tail : `str`
            Last part of the path attribute. Tail will be empty if path ends
            on a separator.
        """
        return self.split()[1]

    def dirname(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute.

        Equivalent of os.path.dirname()

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        return self.split()[0]

    def parent(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute, minus the last one.

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        # When self is file-like, return self.dirname()
        if not self.dirLike:
            return self.dirname()
        # When self is dir-like, return its parent directory,
        # regardless of the presence of a trailing separator
        originalPath = self._pathLib(self.path)
        parentPath = originalPath.parent
        parentURI = self._uri._replace(path=str(parentPath))

        return ButlerURI(parentURI, forceDirectory=True)

    def replace(self, **kwargs: Any) -> ButlerURI:
        """Replace components in a URI with new values and return a new
        instance.

        Parameters
        ----------
        **kwargs
            Components to replace, forwarded unchanged to the ``_replace``
            method of the underlying `urllib.parse.ParseResult`.

        Returns
        -------
        new : `ButlerURI`
            New `ButlerURI` object with updated values.
        """
        # NOTE(review): isTemporary is not forwarded to the new instance --
        # confirm that this is intended.
        return self.__class__(self._uri._replace(**kwargs))

    def updateFile(self, newfile: str) -> None:
        """Update in place the final component of the path with the supplied
        file name.

        Parameters
        ----------
        newfile : `str`
            File name with no path component.

        Notes
        -----
        Updates the URI in place.
        Updates the ButlerURI.dirLike attribute. The new file path will
        be quoted if necessary.
        """
        if self.quotePaths:
            newfile = urllib.parse.quote(newfile)
        head, _ = self._pathModule.split(self.path)
        newpath = self._pathModule.join(head, newfile)

        self.dirLike = False
        self._uri = self._uri._replace(path=newpath)

    def updateExtension(self, ext: Optional[str]) -> None:
        """Update the file extension associated with this `ButlerURI` in place.

        All file extensions are replaced.

        Parameters
        ----------
        ext : `str` or `None`
            New extension. If an empty string is given any extension will
            be removed. If `None` is given there will be no change.
        """
        if ext is None:
            return

        # Get the extension and remove it from the path if one is found
        # .fits.gz counts as one extension do not use os.path.splitext
        current = self.getExtension()
        path = self.path
        if current:
            path = path[:-len(current)]

        # Ensure that we have a leading "." on file extension (and we do not
        # try to modify the empty string)
        if ext and not ext.startswith("."):
            ext = "." + ext

        self._uri = self._uri._replace(path=path + ext)

    def getExtension(self) -> str:
        """Return the file extension(s) associated with this URI path.

        Returns
        -------
        ext : `str`
            The file extension (including the ``.``). Can be empty string
            if there is no file extension. Usually returns only the last
            file extension unless there is a special extension modifier
            indicating file compression, in which case the combined
            extension (e.g. ``.fits.gz``) will be returned.
        """
        special = {".gz", ".bz2", ".xz", ".fz"}

        extensions = self._pathLib(self.path).suffixes

        if not extensions:
            return ""

        ext = extensions.pop()

        # Multiple extensions, decide whether to include the final two
        if extensions and ext in special:
            ext = f"{extensions[-1]}{ext}"

        return ext

    def join(self, path: Union[str, ButlerURI]) -> ButlerURI:
        """Create a new `ButlerURI` with additional path components including
        a file.

        Parameters
        ----------
        path : `str`, `ButlerURI`
            Additional file components to append to the current URI. Assumed
            to include a file at the end. Will be quoted depending on the
            associated URI scheme. If the path looks like a URI with a scheme
            referring to an absolute location, it will be returned
            directly (matching the behavior of `os.path.join()`). It can
            also be a `ButlerURI`.

        Returns
        -------
        new : `ButlerURI`
            New URI with any file at the end replaced with the new path
            components.

        Notes
        -----
        Schemeless URIs assume local path separator but all other URIs assume
        POSIX separator if the supplied path has directory structure. It
        may be this never becomes a problem but datastore templates assume
        POSIX separator is being used.
        """
        # If we have a full URI in path we will use it directly
        # but without forcing to absolute so that we can trap the
        # expected option of relative path.
        path_uri = ButlerURI(path, forceAbsolute=False)
        if path_uri.scheme:
            return path_uri

        # Force back to string
        path = path_uri.path

        new = self.dirname()  # By definition a directory URI

        # new should be asked about quoting, not self, since dirname can
        # change the URI scheme for schemeless -> file
        if new.quotePaths:
            path = urllib.parse.quote(path)

        newpath = self._pathModule.normpath(self._pathModule.join(new.path, path))
        new._uri = new._uri._replace(path=newpath)
        # Declare the new URI not be dirLike unless path ended in /
        # NOTE(review): normpath strips a trailing separator, so a dirLike
        # result can have a path without one -- confirm this is acceptable.
        if not path.endswith(self._pathModule.sep):
            new.dirLike = False
        return new

    def relative_to(self, other: ButlerURI) -> Optional[str]:
        """Return the relative path from this URI to the other URI.

        Parameters
        ----------
        other : `ButlerURI`
            URI to use to calculate the relative path. Must be a parent
            of this URI.

        Returns
        -------
        subpath : `str`
            The sub path of this URI relative to the supplied other URI.
            Returns `None` if there is no parent child relationship.
            Scheme and netloc must match. The returned path is unquoted.
        """
        if self.scheme != other.scheme or self.netloc != other.netloc:
            return None

        enclosed_path = self._pathLib(self.relativeToPathRoot)
        parent_path = other.relativeToPathRoot
        try:
            # relativeToPathRoot has already reversed the URI quoting.
            # Unquoting the result a second time would corrupt names that
            # contain a literal percent escape sequence.
            return str(enclosed_path.relative_to(parent_path))
        except ValueError:
            # Raised by the path library when there is no parent/child
            # relationship.
            return None

    def exists(self) -> bool:
        """Indicate that the resource is available.

        Returns
        -------
        exists : `bool`
            `True` if the resource exists.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def remove(self) -> None:
        """Remove the resource.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def isabs(self) -> bool:
        """Indicate that the resource is fully specified.

        For non-schemeless URIs this is always true.

        Returns
        -------
        isabs : `bool`
            `True` in all cases except schemeless URI.
        """
        return True

    def _as_local(self) -> Tuple[str, bool]:
        """Return the location of the (possibly remote) resource in the
        local file system.

        This is a helper function for ``as_local`` context manager.

        Returns
        -------
        path : `str`
            If this is a remote resource, it will be a copy of the resource
            on the local file system, probably in a temporary directory.
            For a local resource this should be the actual path to the
            resource.
        is_temporary : `bool`
            Indicates if the local path is a temporary file or not.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    @contextlib.contextmanager
    def as_local(self) -> Iterator[ButlerURI]:
        """Return the location of the (possibly remote) resource in the
        local file system.

        Yields
        ------
        local : `ButlerURI`
            If this is a remote resource, it will be a copy of the resource
            on the local file system, probably in a temporary directory.
            For a local resource this should be the actual path to the
            resource.

        Notes
        -----
        The context manager will automatically delete any local temporary
        file.

        Examples
        --------
        Should be used as a context manager:

        .. code-block:: py

           with uri.as_local() as local:
               ospath = local.ospath
        """
        local_src, is_temporary = self._as_local()
        local_uri = ButlerURI(local_src, isTemporary=is_temporary)

        try:
            yield local_uri
        finally:
            # The caller might have relocated the temporary file
            if is_temporary and local_uri.exists():
                local_uri.remove()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def size(self) -> int:
        """For non-dir-like URI, return the size of the resource.

        Returns
        -------
        sz : `int`
            The size in bytes of the resource associated with this URI.
            Returns 0 if dir-like.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError()

    def __str__(self) -> str:
        return self.geturl()

    def __repr__(self) -> str:
        return f'ButlerURI("{self.geturl()}")'

    def __eq__(self, other: Any) -> bool:
        # Two URIs are equal if their string forms are equal. Returning
        # NotImplemented for foreign types lets Python try the reflected
        # comparison before falling back to its default, so ``uri == "x"``
        # still evaluates to `False`.
        # Defining __eq__ without __hash__ leaves instances unhashable.
        if not isinstance(other, ButlerURI):
            return NotImplemented
        return self.geturl() == other.geturl()

    def __copy__(self) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        # Be careful not to convert a relative schemeless URI to absolute
        # NOTE(review): isTemporary is not propagated to the copy -- confirm
        # that this is intended.
        return type(self)(str(self), forceAbsolute=self.isabs())

    def __deepcopy__(self, memo: Any) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return self.__copy__()

    def __getnewargs__(self) -> Tuple:
        # Arguments handed to __new__ when unpickling.
        return (str(self),)

    @staticmethod
    def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[Union[str, ButlerURI]] = None,
                      forceAbsolute: bool = False,
                      forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
        """Correct any issues with the supplied URI.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `str` or `ButlerURI`, ignored
            Not used by this implementation since all URIs are
            absolute except for those representing the local file system.
        forceAbsolute : `bool`, ignored.
            Not used by this implementation. URIs are generally always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is. Specifying that the URI is conceptually
            equivalent to a directory can break some ambiguities when
            interpreting the last element of a path.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator or
            forceDirectory is True. Otherwise `False`.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used. This is
        always done regardless of the ``forceAbsolute`` parameter.

        AWS S3 differentiates between keys with trailing POSIX separators (i.e
        `/dir` and `/dir/`) whereas POSIX does not necessarily.

        Scheme-less paths are normalized.
        """
        # assume we are not dealing with a directory like URI
        dirLike = False

        # URI is dir-like if explicitly stated or if it ends on a separator
        endsOnSep = parsed.path.endswith(posixpath.sep)
        if forceDirectory or endsOnSep:
            dirLike = True
            # only add the separator if it's not already there
            if not endsOnSep:
                parsed = parsed._replace(path=parsed.path+posixpath.sep)

        return parsed, dirLike

    def transfer_from(self, src: ButlerURI, transfer: str,
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the resource at ``src`` to the location of this URI.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Generically there are
            many standard options: copy, link, symlink, hardlink, relsymlink.
            Not all URIs support all modes.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            A transaction object that can (depending on implementation)
            rollback transfers on error. Not guaranteed to be implemented.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.

        Notes
        -----
        Conceptually this is hard to scale as the number of URI schemes
        grow. The destination URI is more important than the source URI
        since that is where all the transfer modes are relevant (with the
        complication that "move" deletes the source).

        Local file to local file is the fundamental use case but every
        other scheme has to support "copy" to local file (with implicit
        support for "move") and copy from local file.
        All the "link" options tend to be specific to local file systems.

        "move" is a "copy" where the remote resource is deleted at the end.
        Whether this works depends on the source URI rather than the
        destination URI. Reverting a move on transaction rollback is
        expected to be problematic if a remote resource was involved.
        """
        raise NotImplementedError(f"No transfer modes supported by URI scheme {self.scheme}")