Coverage for python/lsst/daf/butler/core/_butlerUri/_butlerUri.py : 59%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import contextlib
25import urllib
26import posixpath
27import copy
28import logging
29import re
31from pathlib import PurePath, PurePosixPath
33__all__ = ('ButlerURI',)
35from typing import (
36 TYPE_CHECKING,
37 Any,
38 Iterator,
39 Optional,
40 Tuple,
41 Type,
42 Union,
43)
45from .utils import NoTransaction
47if TYPE_CHECKING: 47 ↛ 48line 47 didn't jump to line 48, because the condition on line 47 was never true
48 from ..datastore import DatastoreTransaction
log = logging.getLogger(__name__)

# Regex for looking for URI escapes: a ``%`` followed by two characters
# drawn from ``A-F`` / ``0-9`` (lower-case hex digits are not matched).
ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")
class ButlerURI:
    """Convenience wrapper around URI parsers.

    Provides access to URI components and can convert file
    paths into absolute path URIs. Scheme-less URIs are treated as if
    they are local file system paths and are converted to absolute URIs.

    A specialist subclass is created for each supported URI scheme.

    Parameters
    ----------
    uri : `str`, `urllib.parse.ParseResult` or `ButlerURI`
        URI in string form. Can be scheme-less if referring to a local
        filesystem path. If a `ButlerURI` is supplied its parsed form,
        ``dirLike`` flag and class are reused without further processing.
    root : `str` or `ButlerURI`, optional
        When fixing up a relative path in a ``file`` scheme or if scheme-less,
        use this as the root. Must be absolute.  If `None` the current
        working directory will be used. Can be a file URI.
    forceAbsolute : `bool`, optional
        If `True`, scheme-less relative URI will be converted to an absolute
        path using a ``file`` scheme. If `False` scheme-less URI will remain
        scheme-less and will not be updated to ``file`` or absolute path.
    forceDirectory: `bool`, optional
        If `True` forces the URI to end with a separator, otherwise given URI
        is interpreted as is.
    isTemporary : `bool`, optional
        If `True` indicates that this URI points to a temporary resource.
    """

    _pathLib: Type[PurePath] = PurePosixPath
    """Path library to use for this scheme."""

    _pathModule = posixpath
    """Path module to use for this scheme."""

    transferModes: Tuple[str, ...] = ("copy", "auto", "move")
    """Transfer modes supported by this implementation.

    Move is special in that it is generally a copy followed by an unlink.
    Whether that unlink works depends critically on whether the source URI
    implements unlink. If it does not the move will be reported as a failure.
    """

    transferDefault: str = "copy"
    """Default mode to use for transferring if ``auto`` is specified."""

    quotePaths = True
    """True if path-like elements modifying a URI should be quoted.

    All non-schemeless URIs have to internally use quoted paths. Therefore
    if a new file name is given (e.g. to updateFile or join) a decision must
    be made whether to quote it to be consistent.
    """

    isLocal = False
    """If `True` this URI refers to a local file."""

    # This is not an ABC with abstract methods because the __new__ being
    # a factory confuses mypy such that it assumes that every constructor
    # returns a ButlerURI and then determines that all the abstract methods
    # are still abstract. If they are not marked abstract but just raise
    # mypy is fine with it.

    # mypy is confused without these
    _uri: urllib.parse.ParseResult
    isTemporary: bool

    def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI],
                root: Optional[Union[str, ButlerURI]] = None, forceAbsolute: bool = True,
                forceDirectory: bool = False, isTemporary: bool = False) -> ButlerURI:
        """Create an instance of the subclass matching the URI scheme.

        Raises
        ------
        ValueError
            Raised if ``uri`` is not a string, `ButlerURI` or
            `urllib.parse.ParseResult`.
        NotImplementedError
            Raised if the URI scheme is not supported.
        """
        parsed: urllib.parse.ParseResult
        dirLike: bool
        subclass: Optional[Type] = None

        # Record if we need to post process the URI components
        # or if the instance is already fully configured
        if isinstance(uri, str):
            # Since local file names can have special characters in them
            # we need to quote them for the parser but we can unquote
            # later. Assume that all other URI schemes are quoted.
            # Since sometimes people write file:/a/b and not file:///a/b
            # we should not quote in the explicit case of file:
            if "://" not in uri and not uri.startswith("file:"):
                if ESCAPES_RE.search(uri):
                    log.warning("Possible double encoding of %s", uri)
                else:
                    uri = urllib.parse.quote(uri)
            parsed = urllib.parse.urlparse(uri)
        elif isinstance(uri, urllib.parse.ParseResult):
            parsed = copy.copy(uri)
        elif isinstance(uri, ButlerURI):
            parsed = copy.copy(uri._uri)
            dirLike = uri.dirLike
            # No further parsing required and we know the subclass
            subclass = type(uri)
        else:
            raise ValueError(f"Supplied URI must be string, ButlerURI, or ParseResult but got '{uri!r}'")

        if subclass is None:
            # Work out the subclass from the URI scheme. The imports are
            # deferred because the subclasses themselves import this module.
            if not parsed.scheme:
                from .schemeless import ButlerSchemelessURI
                subclass = ButlerSchemelessURI
            elif parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI
            elif parsed.scheme == "s3":
                from .s3 import ButlerS3URI
                subclass = ButlerS3URI
            elif parsed.scheme.startswith("http"):
                from .http import ButlerHttpURI
                subclass = ButlerHttpURI
            elif parsed.scheme == "resource":
                # Rules for scheme names disallow pkg_resource
                from .packageresource import ButlerPackageResourceURI
                subclass = ButlerPackageResourceURI
            elif parsed.scheme == "mem":
                # in-memory datastore object
                from .mem import ButlerInMemoryURI
                subclass = ButlerInMemoryURI
            else:
                # Both halves of the message must be f-strings so that the
                # offending URL is actually interpolated.
                raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'"
                                          f" in {parsed.geturl()}")

            parsed, dirLike = subclass._fixupPathUri(parsed, root=root,
                                                     forceAbsolute=forceAbsolute,
                                                     forceDirectory=forceDirectory)

            # It is possible for the class to change from schemeless
            # to file so handle that
            if parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI

        # Now create an instance of the correct subclass and set the
        # attributes directly
        self = object.__new__(subclass)
        self._uri = parsed
        self.dirLike = dirLike
        self.isTemporary = isTemporary
        return self

    @property
    def scheme(self) -> str:
        """The URI scheme (``://`` is not part of the scheme)."""
        return self._uri.scheme

    @property
    def netloc(self) -> str:
        """The URI network location."""
        return self._uri.netloc

    @property
    def path(self) -> str:
        """The path component of the URI."""
        return self._uri.path

    @property
    def unquoted_path(self) -> str:
        """The path component of the URI with any URI quoting reversed."""
        return urllib.parse.unquote(self._uri.path)

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS."""
        raise AttributeError(f"Non-file URI ({self}) has no local OS path.")

    @property
    def relativeToPathRoot(self) -> str:
        """Returns path relative to network location.

        Effectively, this is the path property with posix separator stripped
        from the left hand side of the path.

        Always unquotes.
        """
        p = self._pathLib(self.path)
        relToRoot = str(p.relative_to(p.root))
        # The path library drops trailing separators so put one back for
        # directory-like URIs.
        if self.dirLike and not relToRoot.endswith("/"):
            relToRoot += "/"
        return urllib.parse.unquote(relToRoot)

    @property
    def is_root(self) -> bool:
        """`True` if this URI points to the root of the network location.

        This means that the path components refers to the top level.
        """
        # A dir-like URI whose path is just the root is reported as "./"
        # by relativeToPathRoot.
        return self.relativeToPathRoot == "./"

    @property
    def fragment(self) -> str:
        """The fragment component of the URI."""
        return self._uri.fragment

    @property
    def params(self) -> str:
        """Any parameters included in the URI."""
        return self._uri.params

    @property
    def query(self) -> str:
        """Any query strings included in the URI."""
        return self._uri.query

    def geturl(self) -> str:
        """Return the URI in string form.

        Returns
        -------
        url : `str`
            String form of URI.
        """
        return self._uri.geturl()

    def split(self) -> Tuple[ButlerURI, str]:
        """Splits URI into head and tail. Equivalent to os.path.split where
        head preserves the URI components.

        Returns
        -------
        head: `ButlerURI`
            Everything leading up to tail, expanded and normalized as per
            ButlerURI rules.
        tail : `str`
            Last `self.path` component. Tail will be empty if path ends on a
            separator. Tail will never contain separators. It will be
            unquoted.
        """
        head, tail = self._pathModule.split(self.path)
        headuri = self._uri._replace(path=head)

        # The file part should never include quoted metacharacters
        tail = urllib.parse.unquote(tail)

        # Schemeless is special in that it can be a relative path
        # We need to ensure that it stays that way. All other URIs will
        # be absolute already.
        forceAbsolute = self._pathModule.isabs(self.path)
        return ButlerURI(headuri, forceDirectory=True, forceAbsolute=forceAbsolute), tail

    def basename(self) -> str:
        """Returns the base name, last element of path, of the URI. If URI ends
        on a slash returns an empty string. This is the second element returned
        by split().

        Equivalent of os.path.basename().

        Returns
        -------
        tail : `str`
            Last part of the path attribute. Tail will be empty if path ends
            on a separator.
        """
        return self.split()[1]

    def dirname(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute.

        Equivalent of os.path.dirname()

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        return self.split()[0]

    def parent(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute, minus the last one.

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        # When self is file-like, return self.dirname()
        if not self.dirLike:
            return self.dirname()
        # When self is dir-like, return its parent directory,
        # regardless of the presence of a trailing separator
        originalPath = self._pathLib(self.path)
        parentPath = originalPath.parent
        parentURI = self._uri._replace(path=str(parentPath))

        return ButlerURI(parentURI, forceDirectory=True)

    def replace(self, **kwargs: Any) -> ButlerURI:
        """Replace components in a URI with new values and return a new
        instance.

        Parameters
        ----------
        **kwargs
            Components to replace, forwarded to the ``_replace`` method of
            the underlying `urllib.parse.ParseResult`.

        Returns
        -------
        new : `ButlerURI`
            New `ButlerURI` object with updated values.
        """
        return self.__class__(self._uri._replace(**kwargs))

    def updateFile(self, newfile: str) -> None:
        """Update in place the final component of the path with the supplied
        file name.

        Parameters
        ----------
        newfile : `str`
            File name with no path component.

        Notes
        -----
        Updates the URI in place.
        Updates the ButlerURI.dirLike attribute. The new file path will
        be quoted if necessary.
        """
        if self.quotePaths:
            newfile = urllib.parse.quote(newfile)
        directory, _ = self._pathModule.split(self.path)
        newpath = self._pathModule.join(directory, newfile)

        self.dirLike = False
        self._uri = self._uri._replace(path=newpath)

    def getExtension(self) -> str:
        """Return the file extension(s) associated with this URI path.

        Returns
        -------
        ext : `str`
            The file extension (including the ``.``). Can be empty string
            if there is no file extension. Usually returns only the last
            file extension unless there is a special extension modifier
            indicating file compression, in which case the combined
            extension (e.g. ``.fits.gz``) will be returned.
        """
        special = {".gz", ".bz2", ".xz", ".fz"}

        extensions = self._pathLib(self.path).suffixes

        if not extensions:
            return ""

        ext = extensions.pop()

        # Multiple extensions, decide whether to include the final two
        if extensions and ext in special:
            ext = f"{extensions[-1]}{ext}"

        return ext

    def join(self, path: str) -> ButlerURI:
        """Create a new `ButlerURI` with additional path components including
        a file.

        Parameters
        ----------
        path : `str`
            Additional file components to append to the current URI. Assumed
            to include a file at the end. Will be quoted depending on the
            associated URI scheme. If it ends on a separator the result is
            treated as a directory.

        Returns
        -------
        new : `ButlerURI`
            New URI with any file at the end replaced with the new path
            components.

        Notes
        -----
        Schemeless URIs assume local path separator but all other URIs assume
        POSIX separator if the supplied path has directory structure. It
        may be this never becomes a problem but datastore templates assume
        POSIX separator is being used.
        """
        new = self.dirname()  # By definition a directory URI

        # new should be asked about quoting, not self, since dirname can
        # change the URI scheme for schemeless -> file
        if new.quotePaths:
            path = urllib.parse.quote(path)

        sep = self._pathModule.sep
        endsOnSep = path.endswith(sep)

        newpath = self._pathModule.normpath(self._pathModule.join(new.path, path))
        # normpath strips a trailing separator. Put it back when a directory
        # was requested so that the path stays consistent with dirLike;
        # otherwise a later dirname()/join() would drop the new directory.
        if endsOnSep and not newpath.endswith(sep):
            newpath += sep
        new._uri = new._uri._replace(path=newpath)
        # Declare the new URI not be dirLike unless path ended in /
        if not endsOnSep:
            new.dirLike = False
        return new

    def relative_to(self, other: ButlerURI) -> Optional[str]:
        """Return the relative path from this URI to the other URI.

        Parameters
        ----------
        other : `ButlerURI`
            URI to use to calculate the relative path. Must be a parent
            of this URI.

        Returns
        -------
        subpath : `str`
            The sub path of this URI relative to the supplied other URI.
            Returns `None` if there is no parent child relationship.
            Scheme and netloc must match. The returned path is unquoted.
        """
        if self.scheme != other.scheme or self.netloc != other.netloc:
            return None

        # relativeToPathRoot always returns unquoted paths so the result
        # must not be unquoted a second time: doing so would corrupt names
        # containing a literal percent sequence (e.g. "a%41" -> "aA").
        enclosed_path = self._pathLib(self.relativeToPathRoot)
        parent_path = other.relativeToPathRoot
        try:
            return str(enclosed_path.relative_to(parent_path))
        except ValueError:
            # Not a child of other
            return None

    def exists(self) -> bool:
        """Indicate that the resource is available.

        Returns
        -------
        exists : `bool`
            `True` if the resource exists.
        """
        raise NotImplementedError()

    def remove(self) -> None:
        """Remove the resource."""
        raise NotImplementedError()

    def isabs(self) -> bool:
        """Indicate that the resource is fully specified.

        For non-schemeless URIs this is always true.

        Returns
        -------
        isabs : `bool`
            `True` in all cases except schemeless URI.
        """
        return True

    def _as_local(self) -> Tuple[str, bool]:
        """Return the location of the (possibly remote) resource in the
        local file system.

        This is a helper function for ``as_local`` context manager.

        Returns
        -------
        path : `str`
            If this is a remote resource, it will be a copy of the resource
            on the local file system, probably in a temporary directory.
            For a local resource this should be the actual path to the
            resource.
        is_temporary : `bool`
            Indicates if the local path is a temporary file or not.
        """
        raise NotImplementedError()

    @contextlib.contextmanager
    def as_local(self) -> Iterator[ButlerURI]:
        """Return the location of the (possibly remote) resource in the
        local file system.

        Yields
        ------
        local : `ButlerURI`
            If this is a remote resource, it will be a copy of the resource
            on the local file system, probably in a temporary directory.
            For a local resource this should be the actual path to the
            resource.

        Notes
        -----
        The context manager will automatically delete any local temporary
        file.

        Examples
        --------
        Should be used as a context manager:

        .. code-block:: py

           with uri.as_local() as local:
               ospath = local.ospath
        """
        local_src, is_temporary = self._as_local()
        local_uri = ButlerURI(local_src, isTemporary=is_temporary)

        try:
            yield local_uri
        finally:
            # The caller might have relocated the temporary file
            if is_temporary and local_uri.exists():
                local_uri.remove()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.
        """
        raise NotImplementedError()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.
        """
        raise NotImplementedError()

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.
        """
        raise NotImplementedError()

    def size(self) -> int:
        """For non-dir-like URI, return the size of the resource.

        Returns
        -------
        sz : `int`
            The size in bytes of the resource associated with this URI.
            Returns 0 if dir-like.
        """
        raise NotImplementedError()

    def __str__(self) -> str:
        return self.geturl()

    def __repr__(self) -> str:
        return f'ButlerURI("{self.geturl()}")'

    def __eq__(self, other: Any) -> bool:
        # Returning NotImplemented (rather than False) lets Python try the
        # reflected comparison on the other operand; ``==`` still evaluates
        # to False when neither side knows how to compare.
        if not isinstance(other, ButlerURI):
            return NotImplemented
        return self.geturl() == other.geturl()

    def __copy__(self) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return type(self)(str(self))

    def __deepcopy__(self, memo: Any) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return self.__copy__()

    def __getnewargs__(self) -> Tuple:
        # Arguments handed to __new__ when unpickling
        return (str(self),)

    @staticmethod
    def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[Union[str, ButlerURI]] = None,
                      forceAbsolute: bool = False,
                      forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
        """Correct any issues with the supplied URI.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `str` or `ButlerURI`, ignored
            Not used by this implementation since all URIs are
            absolute except for those representing the local file system.
        forceAbsolute : `bool`, ignored.
            Not used by this implementation. URIs are generally always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is. Specifying that the URI is conceptually
            equivalent to a directory can break some ambiguities when
            interpreting the last element of a path.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator or
            forceDirectory is True. Otherwise `False`.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used.  This is
        always done regardless of the ``forceAbsolute`` parameter.

        AWS S3 differentiates between keys with trailing POSIX separators (i.e
        `/dir` and `/dir/`) whereas POSIX does not necessarily.

        Scheme-less paths are normalized.
        """
        # URI is dir-like if explicitly stated or if it ends on a separator
        endsOnSep = parsed.path.endswith(posixpath.sep)
        dirLike = forceDirectory or endsOnSep

        # only add the separator if it's not already there
        if dirLike and not endsOnSep:
            parsed = parsed._replace(path=parsed.path+posixpath.sep)

        return parsed, dirLike

    def transfer_from(self, src: ButlerURI, transfer: str,
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the resource at the source URI to this URI.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Generically there are
            many standard options: copy, link, symlink, hardlink, relsymlink.
            Not all URIs support all modes.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            A transaction object that can (depending on implementation)
            rollback transfers on error.  Not guaranteed to be implemented.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.

        Notes
        -----
        Conceptually this is hard to scale as the number of URI schemes
        grow.  The destination URI is more important than the source URI
        since that is where all the transfer modes are relevant (with the
        complication that "move" deletes the source).

        Local file to local file is the fundamental use case but every
        other scheme has to support "copy" to local file (with implicit
        support for "move") and copy from local file.
        All the "link" options tend to be specific to local file systems.

        "move" is a "copy" where the remote resource is deleted at the end.
        Whether this works depends on the source URI rather than the
        destination URI.  Reverting a move on transaction rollback is
        expected to be problematic if a remote resource was involved.
        """
        raise NotImplementedError(f"No transfer modes supported by URI scheme {self.scheme}")