Coverage for python/lsst/daf/butler/core/_butlerUri/_butlerUri.py : 61%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24import urllib
25import posixpath
26import copy
27import logging
28import re
30from pathlib import PurePath, PurePosixPath
32__all__ = ('ButlerURI',)
34from typing import (
35 TYPE_CHECKING,
36 Any,
37 Optional,
38 Tuple,
39 Type,
40 Union,
41)
43from .utils import NoTransaction
45if TYPE_CHECKING: 45 ↛ 46line 45 didn't jump to line 46, because the condition on line 45 was never true
46 from ..datastore import DatastoreTransaction
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Pattern matching a percent sign followed by two characters from 0-9/A-F
# (upper case only), i.e. something that looks like an existing URI escape.
ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")
class ButlerURI:
    """Convenience wrapper around URI parsers.

    Provides access to URI components and can convert file
    paths into absolute path URIs. Scheme-less URIs are treated as if
    they are local file system paths and are converted to absolute URIs.

    A specialist subclass is created for each supported URI scheme.

    Parameters
    ----------
    uri : `str`, `urllib.parse.ParseResult` or `ButlerURI`
        URI in string form. Can be scheme-less if referring to a local
        filesystem path. An existing `ButlerURI` is duplicated as-is
        without any further parsing or fix up.
    root : `str` or `ButlerURI`, optional
        When fixing up a relative path in a ``file`` scheme or if scheme-less,
        use this as the root. Must be absolute.  If `None` the current
        working directory will be used. Can be a file URI.
    forceAbsolute : `bool`, optional
        If `True`, scheme-less relative URI will be converted to an absolute
        path using a ``file`` scheme. If `False` scheme-less URI will remain
        scheme-less and will not be updated to ``file`` or absolute path.
    forceDirectory : `bool`, optional
        If `True` forces the URI to end with a separator, otherwise given URI
        is interpreted as is.

    Raises
    ------
    ValueError
        Raised if ``uri`` is not a string, `ButlerURI`, or
        `urllib.parse.ParseResult`.
    NotImplementedError
        Raised if the URI scheme is not one of the supported schemes.
    """

    _pathLib: Type[PurePath] = PurePosixPath
    """Path library to use for this scheme."""

    _pathModule = posixpath
    """Path module to use for this scheme."""

    transferModes: Tuple[str, ...] = ("copy", "auto", "move")
    """Transfer modes supported by this implementation.

    Move is special in that it is generally a copy followed by an unlink.
    Whether that unlink works depends critically on whether the source URI
    implements unlink. If it does not the move will be reported as a failure.
    """

    transferDefault: str = "copy"
    """Default mode to use for transferring if ``auto`` is specified."""

    quotePaths = True
    """True if path-like elements modifying a URI should be quoted.

    All non-schemeless URIs have to internally use quoted paths. Therefore
    if a new file name is given (e.g. to updateFile or join) a decision must
    be made whether to quote it to be consistent.
    """

    # This is not an ABC with abstract methods because the __new__ being
    # a factory confuses mypy such that it assumes that every constructor
    # returns a ButlerURI and then determines that all the abstract methods
    # are still abstract. If they are not marked abstract but just raise
    # mypy is fine with it.

    # mypy is confused without this
    _uri: urllib.parse.ParseResult

    def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI],
                root: Optional[Union[str, ButlerURI]] = None, forceAbsolute: bool = True,
                forceDirectory: bool = False) -> ButlerURI:
        parsed: urllib.parse.ParseResult
        dirLike: bool
        subclass: Optional[Type] = None

        # Record if we need to post process the URI components
        # or if the instance is already fully configured
        if isinstance(uri, str):
            # Since local file names can have special characters in them
            # we need to quote them for the parser but we can unquote
            # later. Assume that all other URI schemes are quoted.
            # Since sometimes people write file:/a/b and not file:///a/b
            # we should not quote in the explicit case of file:
            if "://" not in uri and not uri.startswith("file:"):
                if ESCAPES_RE.search(uri):
                    log.warning("Possible double encoding of %s", uri)
                else:
                    uri = urllib.parse.quote(uri)
            parsed = urllib.parse.urlparse(uri)
        elif isinstance(uri, urllib.parse.ParseResult):
            parsed = copy.copy(uri)
        elif isinstance(uri, ButlerURI):
            parsed = copy.copy(uri._uri)
            dirLike = uri.dirLike
            # No further parsing required and we know the subclass
            subclass = type(uri)
        else:
            raise ValueError(f"Supplied URI must be string, ButlerURI, or ParseResult but got '{uri!r}'")

        if subclass is None:
            # Work out the subclass from the URI scheme
            if not parsed.scheme:
                from .schemeless import ButlerSchemelessURI
                subclass = ButlerSchemelessURI
            elif parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI
            elif parsed.scheme == "s3":
                from .s3 import ButlerS3URI
                subclass = ButlerS3URI
            elif parsed.scheme.startswith("http"):
                from .http import ButlerHttpURI
                subclass = ButlerHttpURI
            elif parsed.scheme == "resource":
                # Rules for scheme names disallow pkg_resource
                from .packageresource import ButlerPackageResourceURI
                subclass = ButlerPackageResourceURI
            elif parsed.scheme == "mem":
                # in-memory datastore object
                from .mem import ButlerInMemoryURI
                subclass = ButlerInMemoryURI
            else:
                # Both halves must be f-strings, otherwise the URI
                # placeholder is reported literally.
                raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'"
                                          f" in {parsed.geturl()}")

            parsed, dirLike = subclass._fixupPathUri(parsed, root=root,
                                                     forceAbsolute=forceAbsolute,
                                                     forceDirectory=forceDirectory)

            # It is possible for the class to change from schemeless
            # to file so handle that
            if parsed.scheme == "file":
                from .file import ButlerFileURI
                subclass = ButlerFileURI

        # Now create an instance of the correct subclass and set the
        # attributes directly
        self = object.__new__(subclass)
        self._uri = parsed
        self.dirLike = dirLike
        return self

    @property
    def scheme(self) -> str:
        """The URI scheme (``://`` is not part of the scheme)."""
        return self._uri.scheme

    @property
    def netloc(self) -> str:
        """The URI network location."""
        return self._uri.netloc

    @property
    def path(self) -> str:
        """The path component of the URI."""
        return self._uri.path

    @property
    def unquoted_path(self) -> str:
        """The path component of the URI with any URI quoting reversed."""
        return urllib.parse.unquote(self._uri.path)

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS."""
        raise AttributeError(f"Non-file URI ({self}) has no local OS path.")

    @property
    def relativeToPathRoot(self) -> str:
        """Returns path relative to network location.

        Effectively, this is the path property with posix separator stripped
        from the left hand side of the path.

        Always unquotes.
        """
        p = self._pathLib(self.path)
        relToRoot = str(p.relative_to(p.root))
        # The path library drops any trailing separator so restore it for
        # directory-like URIs.
        if self.dirLike and not relToRoot.endswith("/"):
            relToRoot += "/"
        return urllib.parse.unquote(relToRoot)

    @property
    def is_root(self) -> bool:
        """`True` if this URI points to the root of the network location.

        This means that the path components refers to the top level.
        """
        return self.relativeToPathRoot == "./"

    @property
    def fragment(self) -> str:
        """The fragment component of the URI."""
        return self._uri.fragment

    @property
    def params(self) -> str:
        """Any parameters included in the URI."""
        return self._uri.params

    @property
    def query(self) -> str:
        """Any query strings included in the URI."""
        return self._uri.query

    def geturl(self) -> str:
        """Return the URI in string form.

        Returns
        -------
        url : `str`
            String form of URI.
        """
        return self._uri.geturl()

    def split(self) -> Tuple[ButlerURI, str]:
        """Splits URI into head and tail. Equivalent to os.path.split where
        head preserves the URI components.

        Returns
        -------
        head: `ButlerURI`
            Everything leading up to tail, expanded and normalized as per
            ButlerURI rules.
        tail : `str`
            Last `self.path` component. Tail will be empty if path ends on a
            separator. Tail will never contain separators. It will be
            unquoted.
        """
        head, tail = self._pathModule.split(self.path)
        headuri = self._uri._replace(path=head)

        # The file part should never include quoted metacharacters
        tail = urllib.parse.unquote(tail)

        # Schemeless is special in that it can be a relative path
        # We need to ensure that it stays that way. All other URIs will
        # be absolute already.
        forceAbsolute = self._pathModule.isabs(self.path)
        return ButlerURI(headuri, forceDirectory=True, forceAbsolute=forceAbsolute), tail

    def basename(self) -> str:
        """Returns the base name, last element of path, of the URI. If URI ends
        on a slash returns an empty string. This is the second element returned
        by split().

        Equivalent of os.path.basename().

        Returns
        -------
        tail : `str`
            Last part of the path attribute. Tail will be empty if path ends
            on a separator.
        """
        return self.split()[1]

    def dirname(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute.

        Equivalent of os.path.dirname()

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        return self.split()[0]

    def parent(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute, minus the last one.

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        # When self is file-like, return self.dirname()
        if not self.dirLike:
            return self.dirname()
        # When self is dir-like, return its parent directory,
        # regardless of the presence of a trailing separator
        originalPath = self._pathLib(self.path)
        parentPath = originalPath.parent
        parentURI = self._uri._replace(path=str(parentPath))

        return ButlerURI(parentURI, forceDirectory=True)

    def replace(self, **kwargs: Any) -> ButlerURI:
        """Replace components in a URI with new values and return a new
        instance.

        Parameters
        ----------
        **kwargs
            Components to replace, passed directly to the ``_replace``
            method of the underlying `urllib.parse.ParseResult`.

        Returns
        -------
        new : `ButlerURI`
            New `ButlerURI` object with updated values.
        """
        return self.__class__(self._uri._replace(**kwargs))

    def updateFile(self, newfile: str) -> None:
        """Update in place the final component of the path with the supplied
        file name.

        Parameters
        ----------
        newfile : `str`
            File name with no path component.

        Notes
        -----
        Updates the URI in place.
        Updates the ButlerURI.dirLike attribute. The new file path will
        be quoted if necessary.
        """
        if self.quotePaths:
            newfile = urllib.parse.quote(newfile)
        directory, _ = self._pathModule.split(self.path)
        newpath = self._pathModule.join(directory, newfile)

        self.dirLike = False
        self._uri = self._uri._replace(path=newpath)

    def getExtension(self) -> str:
        """Return the file extension(s) associated with this URI path.

        Returns
        -------
        ext : `str`
            The file extension (including the ``.``). Can be empty string
            if there is no file extension. Usually returns only the last
            file extension unless there is a special extension modifier
            indicating file compression, in which case the combined
            extension (e.g. ``.fits.gz``) will be returned.
        """
        special = {".gz", ".bz2", ".xz", ".fz"}

        extensions = self._pathLib(self.path).suffixes

        if not extensions:
            return ""

        ext = extensions.pop()

        # Multiple extensions, decide whether to include the final two
        if extensions and ext in special:
            ext = f"{extensions[-1]}{ext}"

        return ext

    def join(self, path: str) -> ButlerURI:
        """Create a new `ButlerURI` with additional path components including
        a file.

        Parameters
        ----------
        path : `str`
            Additional file components to append to the current URI. Assumed
            to include a file at the end. Will be quoted depending on the
            associated URI scheme.

        Returns
        -------
        new : `ButlerURI`
            New URI with any file at the end replaced with the new path
            components.

        Notes
        -----
        Schemeless URIs assume local path separator but all other URIs assume
        POSIX separator if the supplied path has directory structure. It
        may be this never becomes a problem but datastore templates assume
        POSIX separator is being used.
        """
        new = self.dirname()  # By definition a directory URI

        # new should be asked about quoting, not self, since dirname can
        # change the URI scheme for schemeless -> file
        if new.quotePaths:
            path = urllib.parse.quote(path)

        newpath = self._pathModule.normpath(self._pathModule.join(new.path, path))
        new._uri = new._uri._replace(path=newpath)
        # Declare the new URI not be dirLike unless path ended in /
        if not path.endswith(self._pathModule.sep):
            new.dirLike = False
        return new

    def relative_to(self, other: ButlerURI) -> Optional[str]:
        """Return the relative path from this URI to the other URI.

        Parameters
        ----------
        other : `ButlerURI`
            URI to use to calculate the relative path. Must be a parent
            of this URI.

        Returns
        -------
        subpath : `str`
            The sub path of this URI relative to the supplied other URI.
            Returns `None` if there is no parent child relationship.
            Scheme and netloc must match. The path is unquoted.
        """
        if self.scheme != other.scheme or self.netloc != other.netloc:
            return None

        enclosed_path = self._pathLib(self.relativeToPathRoot)
        parent_path = other.relativeToPathRoot
        try:
            # relativeToPathRoot has already reversed the URI quoting.
            # Unquoting a second time would corrupt names that contain a
            # literal percent escape (e.g. "a%20b" would become "a b").
            return str(enclosed_path.relative_to(parent_path))
        except ValueError:
            # Not a child of the other URI.
            return None

    def exists(self) -> bool:
        """Indicate that the resource is available.

        Returns
        -------
        exists : `bool`
            `True` if the resource exists.
        """
        raise NotImplementedError()

    def remove(self) -> None:
        """Remove the resource."""
        raise NotImplementedError()

    def isabs(self) -> bool:
        """Indicate that the resource is fully specified.

        For non-schemeless URIs this is always true.

        Returns
        -------
        isabs : `bool`
            `True` in all cases except schemeless URI.
        """
        return True

    def as_local(self) -> Tuple[str, bool]:
        """Return the location of the (possibly remote) resource in the
        local file system.

        Returns
        -------
        path : `str`
            If this is a remote resource, it will be a copy of the resource
            on the local file system, probably in a temporary directory.
            For a local resource this should be the actual path to the
            resource.
        is_temporary : `bool`
            Indicates if the local path is a temporary file or not.
        """
        raise NotImplementedError()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.
        """
        raise NotImplementedError()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.
        """
        raise NotImplementedError()

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.
        """
        raise NotImplementedError()

    def size(self) -> int:
        """For non-dir-like URI, return the size of the resource.

        Returns
        -------
        sz : `int`
            The size in bytes of the resource associated with this URI.
            Returns 0 if dir-like.
        """
        raise NotImplementedError()

    def __str__(self) -> str:
        return self.geturl()

    def __repr__(self) -> str:
        return f'ButlerURI("{self.geturl()}")'

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, ButlerURI):
            return False
        return self.geturl() == other.geturl()

    def __copy__(self) -> ButlerURI:
        # Implement here because the __new__ method confuses things.
        # Passing the instance itself makes __new__ duplicate the parsed
        # URI, the dirLike flag and the subclass directly. Going via the
        # string form would re-parse and re-fix-up the URI with the default
        # constructor options, which need not give back the same URI (e.g.
        # a relative scheme-less path).
        return type(self)(self)

    def __deepcopy__(self, memo: Any) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return self.__copy__()

    def __getnewargs__(self) -> Tuple:
        return (str(self),)

    @staticmethod
    def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[Union[str, ButlerURI]] = None,
                      forceAbsolute: bool = False,
                      forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
        """Correct any issues with the supplied URI.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `str` or `ButlerURI`, ignored
            Not used by this implementation since all URIs are
            absolute except for those representing the local file system.
        forceAbsolute : `bool`, ignored.
            Not used by this implementation. URIs are generally always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is. Specifying that the URI is conceptually
            equivalent to a directory can break some ambiguities when
            interpreting the last element of a path.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator or
            forceDirectory is True. Otherwise `False`.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used. This is
        always done regardless of the ``forceAbsolute`` parameter.

        AWS S3 differentiates between keys with trailing POSIX separators (i.e
        `/dir` and `/dir/`) whereas POSIX does not necessarily.

        Scheme-less paths are normalized.
        """
        # assume we are not dealing with a directory like URI
        dirLike = False

        # URI is dir-like if explicitly stated or if it ends on a separator
        endsOnSep = parsed.path.endswith(posixpath.sep)
        if forceDirectory or endsOnSep:
            dirLike = True
            # only add the separator if it's not already there
            if not endsOnSep:
                parsed = parsed._replace(path=parsed.path+posixpath.sep)

        return parsed, dirLike

    def transfer_from(self, src: ButlerURI, transfer: str,
                      overwrite: bool = False,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the resource at the source URI to the location given by
        this URI.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Generically there are
            many standard options: copy, link, symlink, hardlink, relsymlink.
            Not all URIs support all modes.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `DatastoreTransaction`, optional
            A transaction object that can (depending on implementation)
            rollback transfers on error.  Not guaranteed to be implemented.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.

        Notes
        -----
        Conceptually this is hard to scale as the number of URI schemes
        grow.  The destination URI is more important than the source URI
        since that is where all the transfer modes are relevant (with the
        complication that "move" deletes the source).

        Local file to local file is the fundamental use case but every
        other scheme has to support "copy" to local file (with implicit
        support for "move") and copy from local file.
        All the "link" options tend to be specific to local file systems.

        "move" is a "copy" where the remote resource is deleted at the end.
        Whether this works depends on the source URI rather than the
        destination URI.  Reverting a move on transaction rollback is
        expected to be problematic if a remote resource was involved.
        """
        raise NotImplementedError(f"No transfer modes supported by URI scheme {self.scheme}")