Coverage for python/lsst/daf/butler/datastores/posixDatastore.py : 86%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""POSIX datastore."""
26__all__ = ("PosixDatastore", )
28import hashlib
29import logging
30import os
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 ClassVar,
35 Optional,
36 Type,
37 Union
38)
40from .fileLikeDatastore import FileLikeDatastore
41from lsst.daf.butler.core.utils import safeMakeDir
42from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter, DatasetRef
44if TYPE_CHECKING: 44 ↛ 45line 44 didn't jump to line 45, because the condition on line 44 was never true
45 from .fileLikeDatastore import DatastoreFileGetInformation
46 from lsst.daf.butler import DatastoreConfig, Location
47 from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager
49log = logging.getLogger(__name__)
52class PosixDatastore(FileLikeDatastore):
53 """Basic POSIX filesystem backed Datastore.
55 Parameters
56 ----------
57 config : `DatastoreConfig` or `str`
58 Configuration. A string should refer to the name of the config file.
59 bridgeManager : `DatastoreRegistryBridgeManager`
60 Object that manages the interface between `Registry` and datastores.
61 butlerRoot : `str`, optional
62 New datastore root to use to override the configuration value.
64 Raises
65 ------
66 ValueError
67 If root location does not exist and ``create`` is `False` in the
68 configuration.
70 Notes
71 -----
72 PosixDatastore supports all transfer modes for file-based ingest:
73 `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
74 and `None` (no transfer).
75 """
77 defaultConfigFile: ClassVar[Optional[str]] = "datastores/posixDatastore.yaml"
78 """Path to configuration defaults. Accessed within the ``config`` resource
79 or relative to a search path. Can be None if no defaults specified.
80 """
82 def __init__(self, config: Union[DatastoreConfig, str],
83 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None):
84 super().__init__(config, bridgeManager, butlerRoot)
86 # Check that root is a valid URI for this datastore
87 root = ButlerURI(self.root, forceDirectory=True)
88 if root.scheme and root.scheme != "file": 88 ↛ 89line 88 didn't jump to line 89, because the condition on line 88 was never true
89 raise ValueError(f"Root location must only be a file URI not {self.root}")
91 self.root = root.ospath
92 if not os.path.isdir(self.root):
93 if "create" not in self.config or not self.config["create"]: 93 ↛ 94line 93 didn't jump to line 94, because the condition on line 93 was never true
94 raise ValueError(f"No valid root at: {self.root}")
95 safeMakeDir(self.root)
97 def _artifact_exists(self, location: Location) -> bool:
98 """Check that an artifact exists in this datastore at the specified
99 location.
101 Parameters
102 ----------
103 location : `Location`
104 Expected location of the artifact associated with this datastore.
106 Returns
107 -------
108 exists : `bool`
109 True if the location can be found, false otherwise.
110 """
111 return os.path.exists(location.path)
113 def _delete_artifact(self, location: Location) -> None:
114 """Delete the artifact from the datastore.
116 Parameters
117 ----------
118 location : `Location`
119 Location of the artifact associated with this datastore.
120 """
121 os.remove(location.path)
123 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
124 ref: DatasetRef, isComponent: bool = False) -> Any:
125 location = getInfo.location
127 # Too expensive to recalculate the checksum on fetch
128 # but we can check size and existence
129 if not os.path.exists(location.path):
130 raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
131 " expected location of {}".format(ref.id, location.path))
132 stat = os.stat(location.path)
133 size = stat.st_size
134 storedFileInfo = getInfo.info
135 if size != storedFileInfo.file_size: 135 ↛ 136line 135 didn't jump to line 136, because the condition on line 135 was never true
136 raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
137 " match recorded size of {}".format(location.path, size,
138 storedFileInfo.file_size))
140 formatter = getInfo.formatter
141 try:
142 log.debug("Reading %s from location %s with formatter %s",
143 f"component {getInfo.component}" if isComponent else "",
144 location.uri, type(formatter).__name__)
145 result = formatter.read(component=getInfo.component if isComponent else None)
146 except Exception as e:
147 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
148 f" ({ref.datasetType.name} from {location.path}): {e}") from e
150 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
151 isComponent=isComponent)
153 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
154 # Inherit docstring
156 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
158 storageDir = os.path.dirname(location.path)
159 if not os.path.isdir(storageDir):
160 # Never try to remove this after creating it since there might
161 # be a butler ingest process running concurrently that will
162 # already think this directory exists.
163 safeMakeDir(storageDir)
165 # Write the file
166 predictedFullPath = os.path.join(self.root, formatter.predictPath())
168 if os.path.exists(predictedFullPath): 168 ↛ 175line 168 didn't jump to line 175, because the condition on line 168 was never true
169 # Assume that by this point if registry thinks the file should
170 # not exist then the file should not exist and therefore we can
171 # overwrite it. This can happen if a put was interrupted by
172 # an external interrupt. The only time this could be problematic is
173 # if the file template is incomplete and multiple dataset refs
174 # result in identical filenames.
175 log.warning("Object %s exists in datastore for ref %s", location.uri, ref)
177 def _removeFileExists(path: str) -> None:
178 """Remove a file and do not complain if it is not there.
180 This is important since a formatter might fail before the file
181 is written and we should not confuse people by writing spurious
182 error messages to the log.
183 """
184 try:
185 os.remove(path)
186 except FileNotFoundError:
187 pass
189 if self._transaction is None: 189 ↛ 190line 189 didn't jump to line 190, because the condition on line 189 was never true
190 raise RuntimeError("Attempting to write dataset without transaction enabled")
192 formatter_exception = None
193 with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
194 try:
195 path = formatter.write(inMemoryDataset)
196 log.debug("Wrote file to %s", path)
197 except Exception as e:
198 formatter_exception = e
200 if formatter_exception:
201 raise formatter_exception
203 assert predictedFullPath == os.path.join(self.root, path)
205 return self._extractIngestInfo(path, ref, formatter=formatter)
207 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
208 # Docstring inherited from base class
209 if transfer != "auto":
210 return transfer
212 # See if the paths are within the datastore or not
213 inside = [self._pathInStore(d.path) is not None for d in datasets]
215 if all(inside):
216 transfer = None
217 elif not any(inside): 217 ↛ 220line 217 didn't jump to line 220, because the condition on line 217 was never false
218 transfer = "link"
219 else:
220 raise ValueError("Some datasets are inside the datastore and some are outside."
221 " Please use an explicit transfer mode and not 'auto'.")
223 return transfer
225 def _pathInStore(self, path: str) -> Optional[str]:
226 """Return path relative to datastore root
228 Parameters
229 ----------
230 path : `str`
231 Path to dataset. Can be absolute path. Returns path in datastore
232 or raises an exception if the path it outside.
234 Returns
235 -------
236 inStore : `str`
237 Path relative to datastore root. Returns `None` if the file is
238 outside the root.
239 """
240 pathUri = ButlerURI(path, forceAbsolute=False)
241 rootUri = ButlerURI(self.root, forceDirectory=True, forceAbsolute=True)
242 return pathUri.relative_to(rootUri)
244 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
245 # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
246 fullPath = os.path.normpath(os.path.join(self.root, path))
247 if not os.path.exists(fullPath):
248 raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
249 f"are assumed to be relative to self.root unless they are absolute.")
250 if transfer is None:
251 # Can not reuse path var because of typing
252 pathx = self._pathInStore(path)
253 if pathx is None:
254 raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
255 path = pathx
256 return path
258 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
259 formatter: Union[Formatter, Type[Formatter]],
260 transfer: Optional[str] = None) -> StoredFileInfo:
261 # Docstring inherited from FileLikeDatastore._extractIngestInfo.
262 if self._transaction is None: 262 ↛ 263line 262 didn't jump to line 263, because the condition on line 262 was never true
263 raise RuntimeError("Ingest called without transaction enabled")
265 # Calculate the full path to the source
266 srcUri = ButlerURI(path, root=self.root, forceAbsolute=True)
267 if transfer is None:
268 # File should exist already
269 rootUri = ButlerURI(self.root, forceDirectory=True)
270 pathInStore = srcUri.relative_to(rootUri)
271 if pathInStore is None: 271 ↛ 272line 271 didn't jump to line 272, because the condition on line 271 was never true
272 raise RuntimeError(f"Unexpectedly learned that {srcUri} is not within datastore {rootUri}")
273 if not rootUri.exists(): 273 ↛ 274line 273 didn't jump to line 274, because the condition on line 273 was never true
274 raise RuntimeError(f"Unexpectedly discovered that {srcUri} does not exist inside datastore"
275 f" {rootUri}")
276 path = pathInStore
277 fullPath = srcUri.ospath
278 elif transfer is not None: 278 ↛ 287line 278 didn't jump to line 287, because the condition on line 278 was never false
279 # Work out the name we want this ingested file to have
280 # inside the datastore
281 location = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
282 path = location.pathInStore
283 fullPath = location.path
284 targetUri = ButlerURI(location.uri)
285 targetUri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)
287 checksum = self.computeChecksum(fullPath) if self.useChecksum else None
288 stat = os.stat(fullPath)
289 size = stat.st_size
290 return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
291 component=ref.datasetType.component(),
292 file_size=size, checksum=checksum)
294 @staticmethod
295 def computeChecksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
296 """Compute the checksum of the supplied file.
298 Parameters
299 ----------
300 filename : `str`
301 Name of file to calculate checksum from.
302 algorithm : `str`, optional
303 Name of algorithm to use. Must be one of the algorithms supported
304 by :py:class`hashlib`.
305 block_size : `int`
306 Number of bytes to read from file at one time.
308 Returns
309 -------
310 hexdigest : `str`
311 Hex digest of the file.
312 """
313 if algorithm not in hashlib.algorithms_guaranteed: 313 ↛ 314line 313 didn't jump to line 314, because the condition on line 313 was never true
314 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
316 hasher = hashlib.new(algorithm)
318 with open(filename, "rb") as f:
319 for chunk in iter(lambda: f.read(block_size), b""):
320 hasher.update(chunk)
322 return hasher.hexdigest()