Coverage for python/lsst/daf/butler/core/_butlerUri.py : 19%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("ButlerURI",)
26import contextlib
27import os
28import os.path
29import shutil
30import urllib
31import pkg_resources
32import posixpath
33from pathlib import Path, PurePath, PurePosixPath
34import requests
35import tempfile
36import copy
37import logging
38import re
40from typing import (
41 TYPE_CHECKING,
42 Any,
43 Callable,
44 cast,
45 Iterator,
46 Optional,
47 Tuple,
48 Type,
49 Union,
50)
52from .utils import safeMakeDir
54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true
55 try:
56 import boto3
57 except ImportError:
58 pass
59 from .datastore import DatastoreTransaction
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# True when the host OS uses the POSIX path separator ("/").
IS_POSIX = posixpath.sep == os.sep

# Root component of the resolved current directory on this operating system.
OS_ROOT_PATH = Path().resolve().root

# Matches an upper-case percent-escape (e.g. "%20"); used to spot strings
# that look as if they have already been URI-quoted.
ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")
def os2posix(ospath: str) -> str:
    """Translate a path written with the local separator into POSIX form.

    Parameters
    ----------
    ospath : `str`
        Path using the local path separator.

    Returns
    -------
    posix : `str`
        The same path expressed with the POSIX path separator. On a POSIX
        host the input is returned unchanged.
    """
    if not IS_POSIX:
        converted = PurePath(ospath).as_posix()

        # PurePath drops a trailing separator, which loses the hint that
        # the path refers to a directory; restore it when it was present.
        had_trailing_sep = ospath.endswith(os.sep)
        if had_trailing_sep and not converted.endswith(posixpath.sep):
            converted = converted + posixpath.sep

        return converted

    return ospath
def posix2os(posix: Union[PurePath, str]) -> str:
    """Convert a POSIX path description to a local path description.

    Parameters
    ----------
    posix : `str`, `PurePath`
        Path using the POSIX path separator.

    Returns
    -------
    ospath : `str`
        Path using OS path separator. On a POSIX host this is simply
        ``str(posix)``.
    """
    if IS_POSIX:
        return str(posix)

    posixPath = PurePosixPath(posix)
    paths = list(posixPath.parts)

    # An empty path or "." has no components at all; there is nothing to
    # translate and indexing/joining an empty list would raise.
    if not paths:
        return str(posix)

    # Have to convert the root directory after splitting
    if paths[0] == posixPath.root:
        paths[0] = OS_ROOT_PATH

    # Trailing "/" is stripped so we need to add back an empty path
    # for consistency
    if str(posix).endswith(posixpath.sep):
        paths.append("")

    return os.path.join(*paths)
class NoTransaction:
    """Stand-in for `DatastoreTransaction` that performs no bookkeeping.

    Lets calling code use the same ``with transaction.undoWith(...)``
    pattern whether or not a real transaction was supplied.
    """

    def __init__(self) -> None:
        """Nothing to initialise."""

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
        """Context manager that ignores every argument and registers no
        undo action; the managed body simply runs.
        """
        yield
class ButlerURI:
    """Convenience wrapper around URI parsers.

    Provides access to URI components and can convert file
    paths into absolute path URIs. Scheme-less URIs are treated as if
    they are local file system paths and are converted to absolute URIs.

    A specialist subclass is created for each supported URI scheme.

    Parameters
    ----------
    uri : `str`, `urllib.parse.ParseResult` or `ButlerURI`
        URI in string form. Can be scheme-less if referring to a local
        filesystem path.
    root : `str`, optional
        When fixing up a relative path in a ``file`` scheme or if scheme-less,
        use this as the root. Must be absolute. If `None` the current
        working directory will be used.
    forceAbsolute : `bool`, optional
        If `True`, scheme-less relative URI will be converted to an absolute
        path using a ``file`` scheme. If `False` scheme-less URI will remain
        scheme-less and will not be updated to ``file`` or absolute path.
    forceDirectory: `bool`, optional
        If `True` forces the URI to end with a separator, otherwise given URI
        is interpreted as is.
    """

    _pathLib: Type[PurePath] = PurePosixPath
    """Path library to use for this scheme."""

    _pathModule = posixpath
    """Path module to use for this scheme."""

    transferModes: Tuple[str, ...] = ("copy", "auto", "move")
    """Transfer modes supported by this implementation.

    Move is special in that it is generally a copy followed by an unlink.
    Whether that unlink works depends critically on whether the source URI
    implements unlink. If it does not the move will be reported as a failure.
    """

    transferDefault: str = "copy"
    """Default mode to use for transferring if ``auto`` is specified."""

    quotePaths = True
    """True if path-like elements modifying a URI should be quoted.

    All non-schemeless URIs have to internally use quoted paths. Therefore
    if a new file name is given (e.g. to updateFile or join) a decision must
    be made whether to quote it to be consistent.
    """

    # This is not an ABC with abstract methods because the __new__ being
    # a factory confuses mypy such that it assumes that every constructor
    # returns a ButlerURI and then determines that all the abstract methods
    # are still abstract. If they are not marked abstract but just raise
    # mypy is fine with it.

    # mypy is confused without these; both are set directly in __new__.
    _uri: urllib.parse.ParseResult
    dirLike: bool

    def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI],
                root: Optional[str] = None, forceAbsolute: bool = True,
                forceDirectory: bool = False) -> ButlerURI:
        parsed: urllib.parse.ParseResult
        dirLike: bool
        subclass: Optional[Type] = None

        # Record if we need to post process the URI components
        # or if the instance is already fully configured
        if isinstance(uri, str):
            # Since local file names can have special characters in them
            # we need to quote them for the parser but we can unquote
            # later. Assume that all other URI schemes are quoted.
            # Since sometimes people write file:/a/b and not file:///a/b
            # we should not quote in the explicit case of file:
            if "://" not in uri and not uri.startswith("file:"):
                if ESCAPES_RE.search(uri):
                    log.warning("Possible double encoding of %s", uri)
                else:
                    uri = urllib.parse.quote(uri)
            parsed = urllib.parse.urlparse(uri)
        elif isinstance(uri, urllib.parse.ParseResult):
            parsed = copy.copy(uri)
        elif isinstance(uri, ButlerURI):
            parsed = copy.copy(uri._uri)
            dirLike = uri.dirLike
            # No further parsing required and we know the subclass
            subclass = type(uri)
        else:
            raise ValueError(f"Supplied URI must be string, ButlerURI, or ParseResult but got '{uri!r}'")

        if subclass is None:
            # Work out the subclass from the URI scheme
            if not parsed.scheme:
                subclass = ButlerSchemelessURI
            elif parsed.scheme == "file":
                subclass = ButlerFileURI
            elif parsed.scheme == "s3":
                subclass = ButlerS3URI
            elif parsed.scheme.startswith("http"):
                subclass = ButlerHttpURI
            elif parsed.scheme == "resource":
                # Rules for scheme names disallow pkg_resource
                subclass = ButlerPackageResourceURI
            elif parsed.scheme == "mem":
                # in-memory datastore object
                subclass = ButlerInMemoryURI
            else:
                # Both halves must be f-strings so the offending URI is
                # actually interpolated into the message.
                raise NotImplementedError(f"No URI support for scheme: '{parsed.scheme}'"
                                          f" in {parsed.geturl()}")

            parsed, dirLike = subclass._fixupPathUri(parsed, root=root,
                                                     forceAbsolute=forceAbsolute,
                                                     forceDirectory=forceDirectory)

            # It is possible for the class to change from schemeless
            # to file so handle that
            if parsed.scheme == "file":
                subclass = ButlerFileURI

        # Now create an instance of the correct subclass and set the
        # attributes directly
        self = object.__new__(subclass)
        self._uri = parsed
        self.dirLike = dirLike
        return self

    @property
    def scheme(self) -> str:
        """The URI scheme (``://`` is not part of the scheme)."""
        return self._uri.scheme

    @property
    def netloc(self) -> str:
        """The URI network location."""
        return self._uri.netloc

    @property
    def path(self) -> str:
        """The path component of the URI."""
        return self._uri.path

    @property
    def unquoted_path(self) -> str:
        """The path component of the URI with any URI quoting reversed."""
        return urllib.parse.unquote(self._uri.path)

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS.

        The base implementation always raises `AttributeError`; subclasses
        that map onto the local file system override it.
        """
        raise AttributeError(f"Non-file URI ({self}) has no local OS path.")

    @property
    def relativeToPathRoot(self) -> str:
        """Returns path relative to network location.

        Effectively, this is the path property with posix separator stripped
        from the left hand side of the path.

        Always unquotes.
        """
        p = self._pathLib(self.path)
        relToRoot = str(p.relative_to(p.root))
        # The path library strips a trailing separator so put it back for
        # directory-like URIs.
        if self.dirLike and not relToRoot.endswith("/"):
            relToRoot += "/"
        return urllib.parse.unquote(relToRoot)

    @property
    def fragment(self) -> str:
        """The fragment component of the URI."""
        return self._uri.fragment

    @property
    def params(self) -> str:
        """Any parameters included in the URI."""
        return self._uri.params

    @property
    def query(self) -> str:
        """Any query strings included in the URI."""
        return self._uri.query

    def geturl(self) -> str:
        """Return the URI in string form.

        Returns
        -------
        url : `str`
            String form of URI.
        """
        return self._uri.geturl()

    def split(self) -> Tuple[ButlerURI, str]:
        """Splits URI into head and tail. Equivalent to os.path.split where
        head preserves the URI components.

        Returns
        -------
        head: `ButlerURI`
            Everything leading up to tail, expanded and normalized as per
            ButlerURI rules.
        tail : `str`
            Last `self.path` component. Tail will be empty if path ends on a
            separator. Tail will never contain separators. It will be
            unquoted.
        """
        head, tail = self._pathModule.split(self.path)
        headuri = self._uri._replace(path=head)

        # The file part should never include quoted metacharacters
        tail = urllib.parse.unquote(tail)

        # Schemeless is special in that it can be a relative path
        # We need to ensure that it stays that way. All other URIs will
        # be absolute already.
        forceAbsolute = self._pathModule.isabs(self.path)
        return ButlerURI(headuri, forceDirectory=True, forceAbsolute=forceAbsolute), tail

    def basename(self) -> str:
        """Returns the base name, last element of path, of the URI. If URI ends
        on a slash returns an empty string. This is the second element returned
        by split().

        Equivalent of os.path.basename().

        Returns
        -------
        tail : `str`
            Last part of the path attribute. Tail will be empty if path ends
            on a separator.
        """
        return self.split()[1]

    def dirname(self) -> ButlerURI:
        """Returns a ButlerURI containing all the directories of the path
        attribute.

        Equivalent of os.path.dirname()

        Returns
        -------
        head : `ButlerURI`
            Everything except the tail of path attribute, expanded and
            normalized as per ButlerURI rules.
        """
        return self.split()[0]

    def replace(self, **kwargs: Any) -> ButlerURI:
        """Replace components in a URI with new values and return a new
        instance.

        Parameters
        ----------
        **kwargs
            Components to replace, forwarded unchanged to
            `urllib.parse.ParseResult._replace`.

        Returns
        -------
        new : `ButlerURI`
            New `ButlerURI` object with updated values.
        """
        return self.__class__(self._uri._replace(**kwargs))

    def updateFile(self, newfile: str) -> None:
        """Update in place the final component of the path with the supplied
        file name.

        Parameters
        ----------
        newfile : `str`
            File name with no path component.

        Notes
        -----
        Updates the URI in place.
        Updates the ButlerURI.dirLike attribute. The new file path will
        be quoted if necessary.
        """
        if self.quotePaths:
            newfile = urllib.parse.quote(newfile)
        directory, _ = self._pathModule.split(self.path)
        newpath = self._pathModule.join(directory, newfile)

        self.dirLike = False
        self._uri = self._uri._replace(path=newpath)

    def getExtension(self) -> str:
        """Return the file extension(s) associated with this URI path.

        Returns
        -------
        ext : `str`
            The file extension (including the ``.``). Can be empty string
            if there is no file extension. Will return all file extensions
            as a single extension such that ``file.fits.gz`` will return
            a value of ``.fits.gz``.
        """
        extensions = self._pathLib(self.path).suffixes
        return "".join(extensions)

    def join(self, path: str) -> ButlerURI:
        """Create a new `ButlerURI` with additional path components including
        a file.

        Parameters
        ----------
        path : `str`
            Additional file components to append to the current URI. Assumed
            to include a file at the end. Will be quoted depending on the
            associated URI scheme.

        Returns
        -------
        new : `ButlerURI`
            New URI with any file at the end replaced with the new path
            components.

        Notes
        -----
        Schemeless URIs assume local path separator but all other URIs assume
        POSIX separator if the supplied path has directory structure. It
        may be this never becomes a problem but datastore templates assume
        POSIX separator is being used.
        """
        new = self.dirname()  # By definition a directory URI

        # new should be asked about quoting, not self, since dirname can
        # change the URI scheme for schemeless -> file
        if new.quotePaths:
            path = urllib.parse.quote(path)

        newpath = self._pathModule.normpath(self._pathModule.join(new.path, path))
        new._uri = new._uri._replace(path=newpath)
        # Declare the new URI not be dirLike unless path ended in /
        if not path.endswith(self._pathModule.sep):
            new.dirLike = False
        return new

    def relative_to(self, other: ButlerURI) -> Optional[str]:
        """Return the relative path from this URI to the other URI.

        Parameters
        ----------
        other : `ButlerURI`
            URI to use to calculate the relative path. Must be a parent
            of this URI.

        Returns
        -------
        subpath : `str`
            The sub path of this URI relative to the supplied other URI.
            Returns `None` if there is no parent child relationship.
            Scheme and netloc must match.
        """
        if self.scheme != other.scheme or self.netloc != other.netloc:
            return None

        enclosed_path = self._pathLib(self.relativeToPathRoot)
        parent_path = other.relativeToPathRoot
        subpath: Optional[str]
        try:
            subpath = str(enclosed_path.relative_to(parent_path))
        except ValueError:
            # The path library raises when parent_path is not a prefix.
            subpath = None
        else:
            subpath = urllib.parse.unquote(subpath)
        return subpath

    def exists(self) -> bool:
        """Indicate that the resource is available.

        Returns
        -------
        exists : `bool`
            `True` if the resource exists.
        """
        raise NotImplementedError()

    def remove(self) -> None:
        """Remove the resource."""
        raise NotImplementedError()

    def isabs(self) -> bool:
        """Indicate that the resource is fully specified.

        For non-schemeless URIs this is always true.

        Returns
        -------
        isabs : `bool`
            `True` in all cases except schemeless URI.
        """
        return True

    def as_local(self) -> Tuple[str, bool]:
        """Return the location of the (possibly remote) resource in the
        local file system.

        Returns
        -------
        path : `str`
            If this is a remote resource, it will be a copy of the resource
            on the local file system, probably in a temporary directory.
            For a local resource this should be the actual path to the
            resource.
        is_temporary : `bool`
            Indicates if the local path is a temporary file or not.
        """
        raise NotImplementedError()

    def read(self, size: int = -1) -> bytes:
        """Open the resource and return the contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.
        """
        raise NotImplementedError()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the new resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource. The entire contents of the
            resource will be replaced.
        overwrite : `bool`, optional
            If `True` the resource will be overwritten if it exists. Otherwise
            the write will fail.
        """
        raise NotImplementedError()

    def mkdir(self) -> None:
        """For a dir-like URI, create the directory resource if it does not
        already exist.
        """
        raise NotImplementedError()

    def __str__(self) -> str:
        return self.geturl()

    def __repr__(self) -> str:
        return f'ButlerURI("{self.geturl()}")'

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, ButlerURI):
            return False
        return self.geturl() == other.geturl()

    def __hash__(self) -> int:
        # Defining __eq__ alone makes a class unhashable. Hash the same
        # string that __eq__ compares so equal URIs hash equally. Note that
        # updateFile mutates the URI in place and therefore changes the hash.
        return hash(self.geturl())

    def __copy__(self) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return type(self)(str(self))

    def __deepcopy__(self, memo: Any) -> ButlerURI:
        # Implement here because the __new__ method confuses things
        return self.__copy__()

    def __getnewargs__(self) -> Tuple:
        return (str(self),)

    @staticmethod
    def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[str] = None,
                      forceAbsolute: bool = False,
                      forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
        """Correct any issues with the supplied URI.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `str`, ignored
            Not used by this implementation since all URIs are
            absolute except for those representing the local file system.
        forceAbsolute : `bool`, ignored.
            Not used by this implementation. URIs are generally always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is. Specifying that the URI is conceptually
            equivalent to a directory can break some ambiguities when
            interpreting the last element of a path.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator or
            forceDirectory is True. Otherwise `False`.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used. This is
        always done regardless of the ``forceAbsolute`` parameter.

        AWS S3 differentiates between keys with trailing POSIX separators (i.e
        `/dir` and `/dir/`) whereas POSIX does not necessarily.

        Scheme-less paths are normalized.
        """
        # assume we are not dealing with a directory like URI
        dirLike = False

        # URI is dir-like if explicitly stated or if it ends on a separator
        endsOnSep = parsed.path.endswith(posixpath.sep)
        if forceDirectory or endsOnSep:
            dirLike = True
            # only add the separator if it's not already there
            if not endsOnSep:
                parsed = parsed._replace(path=parsed.path+posixpath.sep)

        return parsed, dirLike

    def transfer_from(self, src: ButlerURI, transfer: str,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a new location.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Generically there are
            many standard options: copy, link, symlink, hardlink, relsymlink.
            Not all URIs support all modes.
        transaction : `DatastoreTransaction`, optional
            A transaction object that can (depending on implementation)
            rollback transfers on error. Not guaranteed to be implemented.

        Notes
        -----
        Conceptually this is hard to scale as the number of URI schemes
        grow. The destination URI is more important than the source URI
        since that is where all the transfer modes are relevant (with the
        complication that "move" deletes the source).

        Local file to local file is the fundamental use case but every
        other scheme has to support "copy" to local file (with implicit
        support for "move") and copy from local file.
        All the "link" options tend to be specific to local file systems.

        "move" is a "copy" where the remote resource is deleted at the end.
        Whether this works depends on the source URI rather than the
        destination URI. Reverting a move on transaction rollback is
        expected to be problematic if a remote resource was involved.
        """
        raise NotImplementedError(f"No transfer modes supported by URI scheme {self.scheme}")
class ButlerFileURI(ButlerURI):
    """URI for explicit ``file`` scheme."""

    transferModes = ("copy", "link", "symlink", "hardlink", "relsymlink", "auto", "move")
    transferDefault: str = "link"

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS.

        Will unquote URI path since a formal URI must include the quoting.
        """
        return urllib.parse.unquote(posix2os(self._uri.path))

    def exists(self) -> bool:
        """Indicate that the resource is available.

        Returns
        -------
        exists : `bool`
            `True` if the local path exists.
        """
        # Uses os.path.exists so if there is a soft link that points
        # to a file that no longer exists this will return False
        return os.path.exists(self.ospath)

    def remove(self) -> None:
        """Remove the resource."""
        os.remove(self.ospath)

    def as_local(self) -> Tuple[str, bool]:
        """Return the local path of the file.

        Returns
        -------
        path : `str`
            The local path to this file.
        temporary : `bool`
            Always returns `False` (this is not a temporary file).
        """
        return self.ospath, False

    def _force_to_file(self) -> ButlerFileURI:
        """Force a schemeless URI to a file URI and returns a new URI.

        Returns
        -------
        file : `ButlerFileURI`
            A copy of the URI using file scheme. If already a file scheme
            the copy will be identical.

        Raises
        ------
        ValueError
            Raised if this URI is schemeless and relative path and so can
            not be forced to file absolute path without context.
        """
        # This is always a file scheme so always return copy
        return copy.copy(self)

    def relative_to(self, other: ButlerURI) -> Optional[str]:
        """Return the relative path from this URI to the other URI.

        Parameters
        ----------
        other : `ButlerURI`
            URI to use to calculate the relative path. Must be a parent
            of this URI.

        Returns
        -------
        subpath : `str`
            The sub path of this URI relative to the supplied other URI.
            Returns `None` if there is no parent child relationship.
            Scheme and netloc must match but for file URIs schemeless
            is also used. If this URI is a relative URI but the other is
            absolute, it is assumed to be in the parent completely unless it
            starts with ".." (in which case the path is combined and tested).
            If both URIs are relative, the relative paths are compared
            for commonality.

        Notes
        -----
        By definition a relative path will be relative to the enclosing
        absolute parent URI. It will be returned unchanged if it does not
        use a parent directory specification.
        """
        # We know self is a file so check the other. Anything other than
        # file or schemeless means by definition these have no paths in common
        if other.scheme and other.scheme != "file":
            return None

        # for case where both URIs are relative use the normal logic
        # where a/b/c.txt and a/b/ returns c.txt.
        if not self.isabs() and not other.isabs():
            return super().relative_to(other)

        # if we have a relative path convert it to absolute
        # relative to the supplied parent. This is solely to handle
        # the case where the relative path includes ".." but somehow
        # then goes back inside the directory of the parent
        if not self.isabs():
            childUri = other.join(self.path)
            return childUri.relative_to(other)

        # By this point if the schemes are identical we can use the
        # base class implementation.
        if self.scheme == other.scheme:
            return super().relative_to(other)

        # if one is schemeless and the other is not the base implementation
        # will fail so we need to fix that -- they are both absolute so
        # forcing to file is fine.
        # Use a cast to convince mypy that other has to be a ButlerFileURI
        # in order to get to this part of the code.
        return self._force_to_file().relative_to(cast(ButlerFileURI, other)._force_to_file())

    def read(self, size: int = -1) -> bytes:
        """Open the file and return its contents in bytes.

        Parameters
        ----------
        size : `int`, optional
            The number of bytes to read. Negative or omitted indicates
            that all data should be read.
        """
        with open(self.ospath, "rb") as fh:
            return fh.read(size)

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the file, creating the parent
        directory first if it does not exist.

        Parameters
        ----------
        data : `bytes`
            The bytes to write to the resource.
        overwrite : `bool`, optional
            If `True` the file is opened with mode ``wb``; otherwise with
            the exclusive-creation mode ``xb``, which fails if the file
            already exists.
        """
        directory = os.path.dirname(self.ospath)
        if not os.path.exists(directory):
            safeMakeDir(directory)
        mode = "wb" if overwrite else "xb"
        with open(self.ospath, mode) as f:
            f.write(data)

    def mkdir(self) -> None:
        """Create the directory if it does not already exist.

        Raises
        ------
        FileExistsError
            Raised if the path exists but is not a directory.
        """
        if not os.path.exists(self.ospath):
            safeMakeDir(self.ospath)
        elif not os.path.isdir(self.ospath):
            raise FileExistsError(f"URI {self} exists but is not a directory!")

    def transfer_from(self, src: ButlerURI, transfer: str,
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to a local file.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, link, symlink, hardlink, relsymlink, auto, move.
        transaction : `DatastoreTransaction`, optional
            If a transaction is provided, undo actions will be registered.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not in ``transferModes``.
        FileNotFoundError
            Raised if the local form of the source does not exist.
        FileExistsError
            Raised if the destination already exists.
        RuntimeError
            Raised if a link-based mode is requested for a source that is
            only available as a temporary local copy.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks can be slow for remote resources so only do
        # them if the message will actually be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        # We do not have to special case ButlerFileURI here because
        # as_local handles that.
        local_src, is_temporary = src.as_local()

        try:
            # Default transfer mode depends on whether we have a temporary
            # file or not.
            if transfer == "auto":
                transfer = self.transferDefault if not is_temporary else "copy"

            # Follow soft links
            local_src = os.path.realpath(os.path.normpath(local_src))

            if not os.path.exists(local_src):
                raise FileNotFoundError(f"Source URI {src} does not exist")

            # All the modes involving linking use "link" somewhere
            if "link" in transfer and is_temporary:
                raise RuntimeError("Can not use local file system transfer mode"
                                   f" {transfer} for remote resource ({src})")

            # For temporary files we can own them
            requested_transfer = transfer
            if is_temporary and transfer == "copy":
                transfer = "move"

            # The output location should not exist
            if self.exists():
                raise FileExistsError(f"Destination path '{self}' already exists. Transfer "
                                      f"from {src} cannot be completed.")

            # Make the path absolute (but don't follow links since that
            # would possibly cause us to end up in the wrong place if the
            # file existed already as a soft link)
            newFullPath = os.path.abspath(self.ospath)
            outputDir = os.path.dirname(newFullPath)
            if not os.path.isdir(outputDir):
                # Must create the directory -- this can not be rolled back
                # since another transfer running concurrently may
                # be relying on this existing.
                safeMakeDir(outputDir)

            if transaction is None:
                # Use a no-op transaction to reduce code duplication
                transaction = NoTransaction()

            if transfer == "move":
                with transaction.undoWith(f"move from {local_src}", shutil.move, newFullPath, local_src):
                    shutil.move(local_src, newFullPath)
            elif transfer == "copy":
                with transaction.undoWith(f"copy from {local_src}", os.remove, newFullPath):
                    shutil.copy(local_src, newFullPath)
            elif transfer == "link":
                # Try hard link and if that fails use a symlink
                with transaction.undoWith(f"link to {local_src}", os.remove, newFullPath):
                    try:
                        os.link(local_src, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(local_src, newFullPath)
            elif transfer == "hardlink":
                with transaction.undoWith(f"hardlink to {local_src}", os.remove, newFullPath):
                    os.link(local_src, newFullPath)
            elif transfer == "symlink":
                # Read through existing symlinks
                with transaction.undoWith(f"symlink to {local_src}", os.remove, newFullPath):
                    os.symlink(local_src, newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot = os.path.dirname(newFullPath)
                relPath = os.path.relpath(local_src, newFullPathRoot)
                with transaction.undoWith(f"relsymlink to {local_src}", os.remove, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))

            # This was an explicit move requested from a remote resource
            # try to remove that resource. We check is_temporary because
            # the local file would have been moved by shutil.move already.
            if requested_transfer == "move" and is_temporary:
                # Transactions do not work here
                src.remove()
        finally:
            # A temporary local copy belongs to us. On success it has
            # normally been moved already; on failure it must not be left
            # behind.
            if is_temporary and os.path.exists(local_src):
                os.remove(local_src)

    @staticmethod
    def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[str] = None,
                      forceAbsolute: bool = False,
                      forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
        """Fix up relative paths in URI instances.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `str`, optional
            Path to use as root when converting relative to absolute.
            If `None`, it will be the current working directory. This
            is a local file system path, not a URI. It is only used if
            a file-scheme is used incorrectly with a relative path.
        forceAbsolute : `bool`, ignored
            Has no effect for this subclass. ``file`` URIs are always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator,
            forceDirectory is True, or the path is an existing directory.
            Otherwise `False`.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used. This is
        always done regardless of the ``forceAbsolute`` parameter.
        """
        # file URI implies POSIX path separators so split as POSIX,
        # then join as os, and convert to abspath. Do not handle
        # home directories since "file" scheme is explicitly documented
        # to not do tilde expansion.
        sep = posixpath.sep
        endsOnSep = parsed.path.endswith(sep)

        # For local file system we can explicitly check to see if this
        # really is a directory. The URI might point to a location that
        # does not exist yet but all that matters is if it is a directory
        # then we make sure use that fact. No need to do the check if
        # we are already being told. The check is made on the URI path
        # exactly as parsed.
        if not forceDirectory and posixpath.isdir(parsed.path):
            forceDirectory = True

        # Directory-like if told so, if it is a directory on disk, or if the
        # caller wrote an explicit trailing separator.
        dirLike = forceDirectory or endsOnSep

        # For an absolute path all we need to do is check if we need
        # to force the directory separator
        if posixpath.isabs(parsed.path):
            if dirLike and not endsOnSep:
                parsed = parsed._replace(path=parsed.path+sep)
            return copy.copy(parsed), dirLike

        # Relative path so must fix it to be compliant with the standard
        if root is None:
            root = os.path.abspath(os.path.curdir)

        newpath = posixpath.normpath(posixpath.join(os2posix(root), parsed.path))

        # normpath strips trailing "/" so put it back if necessary, taking
        # care not to double it when the result is already the root.
        if dirLike and not newpath.endswith(sep):
            newpath += sep

        # ParseResult is a NamedTuple so _replace is standard API
        parsed = parsed._replace(path=newpath)

        if parsed.params or parsed.query:
            log.warning("Additional items unexpectedly encountered in file URI: %s", parsed.geturl())

        return parsed, dirLike
class ButlerS3URI(ButlerURI):
    """S3 URI.

    The network location of the URI is used as the bucket name and the
    path relative to the URI root is used as the object key.
    """

    @property
    def client(self) -> boto3.client:
        """Client object to address remote resource."""
        # Defer import for circular dependencies
        from .s3utils import getS3Client
        return getS3Client()

    def exists(self) -> bool:
        """Check that the S3 resource exists.

        Returns
        -------
        exists : `bool`
            Existence flag reported by ``s3CheckFileExists``.
        """
        # s3utils itself imports ButlerURI so defer this import
        from .s3utils import s3CheckFileExists
        exists, _ = s3CheckFileExists(self, client=self.client)
        return exists

    def remove(self) -> None:
        """Remove the resource."""

        # https://github.com/boto/boto3/issues/507 - there is no
        # way of knowing if the file was actually deleted except
        # for checking all the keys again, response is HTTP 204 OK
        # response all the time
        self.client.delete_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource.

        Parameters
        ----------
        size : `int`, optional
            Number of bytes to read from the start of the object. Values
            that are not positive result in the whole object being read.

        Returns
        -------
        data : `bytes`
            The bytes that were read.

        Raises
        ------
        FileNotFoundError
            Raised if the key or the bucket does not exist.
        """
        args = {}
        if size > 0:
            # HTTP byte ranges are inclusive at both ends
            args["Range"] = f"bytes=0-{size-1}"
        try:
            response = self.client.get_object(Bucket=self.netloc,
                                              Key=self.relativeToPathRoot,
                                              **args)
        except (self.client.exceptions.NoSuchKey, self.client.exceptions.NoSuchBucket) as err:
            raise FileNotFoundError(f"No such resource: {self}") from err
        stream = response["Body"]
        try:
            return stream.read()
        finally:
            # Always release the body, even if the read failed part way
            stream.close()

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the resource.

        Parameters
        ----------
        data : `bytes`
            The bytes to store.
        overwrite : `bool`, optional
            If `False` an existing resource is never replaced.

        Raises
        ------
        FileExistsError
            Raised if the resource exists and ``overwrite`` is `False`.
        """
        if not overwrite:
            if self.exists():
                raise FileExistsError(f"Remote resource {self} exists and overwrite has been disabled")
        self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot,
                               Body=data)

    def mkdir(self) -> None:
        """Create a directory-like key in the bucket.

        Raises
        ------
        ValueError
            Raised if the bucket does not exist or if this URI is not
            directory-like.
        """
        # Defer import for circular dependencies
        from .s3utils import bucketExists
        if not bucketExists(self.netloc):
            raise ValueError(f"Bucket {self.netloc} does not exist for {self}!")

        if not self.dirLike:
            raise ValueError(f"Can not create a 'directory' for file-like URI {self}")

        # don't create S3 key when root is at the top-level of an Bucket
        if self.path != "/":
            self.client.put_object(Bucket=self.netloc, Key=self.relativeToPathRoot)

    def as_local(self) -> Tuple[str, bool]:
        """Download object from S3 and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
            self.client.download_fileobj(self.netloc, self.relativeToPathRoot, tmpFile)
        return tmpFile.name, True

    def transfer_from(self, src: ButlerURI, transfer: str = "copy",
                      transaction: Optional[Union[DatastoreTransaction, NoTransaction]] = None) -> None:
        """Transfer the current resource to an S3 bucket.

        Parameters
        ----------
        src : `ButlerURI`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Must be one of the
            values in ``transferModes``. ``auto`` is replaced by
            ``transferDefault`` and ``move`` removes the source once it
            has been transferred.
        transaction : `DatastoreTransaction`, optional
            Currently unused.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not supported.
        FileExistsError
            Raised if the destination already exists.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks can need a round trip to a remote store so only
        # make them when the message is actually going to be emitted.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                      src, src.exists(), self, self.exists(), transfer)

        if self.exists():
            raise FileExistsError(f"Destination path '{self}' already exists.")

        if transfer == "auto":
            transfer = self.transferDefault

        if isinstance(src, type(self)):
            # Looks like an S3 remote uri so we can use direct copy
            # note that boto3.resource.meta.copy is cleverer than the low
            # level copy_object
            copy_source = {
                "Bucket": src.netloc,
                "Key": src.relativeToPathRoot,
            }
            self.client.copy_object(CopySource=copy_source, Bucket=self.netloc, Key=self.relativeToPathRoot)
        else:
            # Use local file and upload it
            local_src, is_temporary = src.as_local()

            # resource.meta.upload_file seems like the right thing
            # but we have a low level client
            try:
                with open(local_src, "rb") as fh:
                    self.client.put_object(Bucket=self.netloc,
                                           Key=self.relativeToPathRoot, Body=fh)
            finally:
                # Do not leave the temporary download behind if the
                # upload fails.
                if is_temporary:
                    os.remove(local_src)

        # This was an explicit move requested from a remote resource
        # try to remove that resource
        if transfer == "move":
            # Transactions do not work here
            src.remove()
class ButlerPackageResourceURI(ButlerURI):
    """URI for a resource shipped inside a Python package.

    An example is ``resource://lsst.daf.butler/configs/file.yaml``: the
    network location names the Python package and the path identifies the
    resource within that package.
    """

    def exists(self) -> bool:
        """Report whether the package resource can be found.

        Returns
        -------
        exists : `bool`
            Result of `pkg_resources.resource_exists` for this package
            and resource name.
        """
        package = self.netloc
        resource = self.relativeToPathRoot
        return pkg_resources.resource_exists(package, resource)

    def read(self, size: int = -1) -> bytes:
        """Read bytes from the package resource.

        Parameters
        ----------
        size : `int`, optional
            Passed directly to the ``read`` method of the resource stream.

        Returns
        -------
        data : `bytes`
            The bytes returned by the resource stream.
        """
        package = self.netloc
        resource = self.relativeToPathRoot
        stream = pkg_resources.resource_stream(package, resource)
        with stream as fh:
            content = fh.read(size)
        return content
class ButlerHttpURI(ButlerURI):
    """General HTTP(S) resource."""

    def exists(self) -> bool:
        """Check that a remote HTTP resource exists.

        Returns
        -------
        exists : `bool`
            `True` if a ``HEAD`` request for the URL returns status 200.
        """
        header = requests.head(self.geturl())
        return header.status_code == 200

    def as_local(self) -> Tuple[str, bool]:
        """Download object over HTTP and place in temporary directory.

        Returns
        -------
        path : `str`
            Path to local temporary file.
        temporary : `bool`
            Always returns `True`. This is always a temporary file.

        Raises
        ------
        FileNotFoundError
            Raised if the server does not respond with status 200.
        """
        r = requests.get(self.geturl(), stream=True)
        try:
            if r.status_code != 200:
                raise FileNotFoundError(f"Unable to download resource {self}; status code: {r.status_code}")
            with tempfile.NamedTemporaryFile(suffix=self.getExtension(), delete=False) as tmpFile:
                # The default chunk size of iter_content is a single byte
                # so ask for something sensible.
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    tmpFile.write(chunk)
        finally:
            # A streamed response holds on to its connection until it is
            # fully consumed or explicitly closed.
            r.close()
        return tmpFile.name, True

    def read(self, size: int = -1) -> bytes:
        """Read the contents of the resource.

        Parameters
        ----------
        size : `int`, optional
            Number of bytes to request as the first chunk of the response.
            Values that are not positive return the full content.

        Returns
        -------
        data : `bytes`
            The bytes that were read. Empty if ``size`` is positive and
            the response has no content.

        Raises
        ------
        FileNotFoundError
            Raised if the server does not respond with status 200.
        """
        # Only stream if a limited number of bytes was requested
        stream = size > 0
        r = requests.get(self.geturl(), stream=stream)
        try:
            if r.status_code != 200:
                # Match as_local() rather than returning an error page
                # as if it were the requested data.
                raise FileNotFoundError(f"Unable to read resource {self}; status code: {r.status_code}")
            if not stream:
                return r.content
            # An empty body yields no chunks at all
            return next(r.iter_content(chunk_size=size), b"")
        finally:
            r.close()
class ButlerInMemoryURI(ButlerURI):
    """Internal in-memory datastore URI (`mem://`).

    Not used for any real purpose other than indicating that the dataset
    is in memory.
    """

    def exists(self) -> bool:
        """Test for existence and always return `True`."""
        return True

    def as_local(self) -> Tuple[str, bool]:
        """Always fail since there is no local file for this URI.

        Raises
        ------
        RuntimeError
            Always raised.
        """
        raise RuntimeError(f"Do not know how to retrieve data for URI '{self}'")
class ButlerSchemelessURI(ButlerFileURI):
    """URI without a scheme that refers to the local file system."""

    _pathLib = PurePath
    _pathModule = os.path
    quotePaths = False

    @property
    def ospath(self) -> str:
        """Path component of the URI in the form used by the current OS.

        For a scheme-less URI this is the path itself.
        """
        return self.path

    def isabs(self) -> bool:
        """Indicate whether the resource is fully specified.

        Returns
        -------
        isabs : `bool`
            `True` if the local path is absolute, `False` otherwise.
        """
        local_path = self.ospath
        return os.path.isabs(local_path)

    def _force_to_file(self) -> ButlerFileURI:
        """Return a new URI using the ``file`` scheme for this path.

        The path is converted to POSIX form and URI-quoted.

        Returns
        -------
        file : `ButlerFileURI`
            New URI with ``file`` scheme referring to the same path.

        Raises
        ------
        RuntimeError
            Raised if this URI is a relative path, since it can not be
            turned into an absolute file URI without more context.
        """
        if not self.isabs():
            raise RuntimeError(f"Internal error: Can not force {self} to absolute file URI")
        quoted_path = urllib.parse.quote(os2posix(self.path))
        file_uri = self._uri._replace(scheme="file", path=quoted_path)
        # mypy really wants a ButlerFileURI to be returned here
        return ButlerURI(file_uri, forceDirectory=self.dirLike)  # type: ignore

    @staticmethod
    def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[str] = None,
                      forceAbsolute: bool = False,
                      forceDirectory: bool = False) -> Tuple[urllib.parse.ParseResult, bool]:
        """Normalize a scheme-less local path and work out if it is a
        directory.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `str`, optional
            Local file system path (not a URI) used as the root when a
            relative path is made absolute. If `None` the current working
            directory is used.
        forceAbsolute : `bool`, optional
            If `True` a relative path is joined to ``root`` to make it
            absolute; the scheme is left unchanged in that case. If
            `False` a relative path stays relative.
        forceDirectory : `bool`, optional
            If `True` the path is treated as a directory and will end
            with a separator.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Updated version of the parsed URI.
        dirLike : `bool`
            `True` if the path is to be treated as a directory, `False`
            otherwise.

        Notes
        -----
        A path that is already absolute (after tilde expansion) is always
        converted to a ``file`` URI with a quoted POSIX path. All paths
        are normalized.
        """
        if root is None:
            root = os.path.abspath(os.path.curdir)

        # The path was quoted in the constructor so undo that before
        # treating it as a local path, which can include a tilde.
        expanded = os.path.expanduser(urllib.parse.unquote(parsed.path))

        is_directory = False
        becomes_file_uri = os.path.isabs(expanded)

        # Work with the path in OS form until the very end
        if becomes_file_uri:
            local_path = os.path.normpath(expanded)
        elif forceAbsolute:
            # Made absolute but the scheme is not changed
            local_path = os.path.normpath(os.path.join(root, expanded))
        else:
            local_path = os.path.normpath(expanded)
            # An empty path normalizes to "." which is a directory
            is_directory = expanded == ""

        # If the caller has not already told us, ask the file system.
        # A location that does not exist yet is simply not a directory.
        treat_as_directory = forceDirectory or os.path.isdir(local_path)

        # normpath removes a trailing separator, which would otherwise
        # lose the distinction between a directory and a file.
        separator_was_stripped = expanded.endswith(os.sep) and not local_path.endswith(os.sep)

        if treat_as_directory or separator_was_stripped or is_directory:
            is_directory = True
            if not local_path.endswith(os.sep):
                local_path = local_path + os.sep

        changes = {"path": local_path}
        if becomes_file_uri:
            # A file URI holds a quoted POSIX path
            changes["scheme"] = "file"
            changes["path"] = urllib.parse.quote(os2posix(local_path))

        # ParseResult is a NamedTuple so _replace is standard API
        updated = parsed._replace(**changes)

        if any((updated.params, updated.fragment, updated.query)):
            log.warning("Additional items unexpectedly encountered in schemeless URI: %s", updated.geturl())

        return updated, is_directory