Coverage for python/lsst/daf/butler/datastores/posixDatastore.py : 89%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore", )

import hashlib
import logging
import os
import shutil
from typing import TYPE_CHECKING, Iterable, Optional, Type

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetRef

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, and `None` (no transfer).
    """

    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must only be a file URI not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            return False
        return os.path.exists(location.path)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for Dataset {ref.id}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        info = self._extractIngestInfo(path, ref, formatter=formatter)
        self._register_datasets([(ref, info)])
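
    # Illustrative sketch (assumption, not from this module): a caller such
    # as ``ChainedDatastore`` can treat a rejected dataset type as non-fatal
    # by catching the exception documented above.
    #
    #     try:
    #         datastore.put(in_memory_dataset, ref)
    #     except DatasetTypeNotSupportedError:
    #         pass  # this datastore is configured to reject this dataset type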

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    f"are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            if os.path.isabs(path):
                absRoot = os.path.abspath(self.root)
                if os.path.commonpath([absRoot, path]) != absRoot:
                    raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
                return os.path.relpath(path, absRoot)
            elif path.startswith(os.path.pardir):
                raise RuntimeError(f"'{path}' is outside repository root '{self.root}.'")
        return path
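
    # Behaviour sketch for the path standardization above (paths are
    # illustrative assumptions): with ``self.root = "/repo"`` and
    # ``transfer=None``, an existing absolute path "/repo/a/b.fits" is
    # returned as the root-relative "a/b.fits", while "/elsewhere/b.fits"
    # raises RuntimeError because it is not inside the repository root.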

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                with self._transaction.undoWith("mkdir", os.rmdir, storageDir):
                    safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(fullPath, newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)
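
    # Sketch of the transfer behaviour above (paths and values are
    # illustrative assumptions): with ``transfer="symlink"`` the original
    # file stays where it is and a symlink is created at the templated
    # location inside ``self.root``; with ``transfer="move"`` the file itself
    # is relocated into the datastore. In both cases the returned
    # ``StoredFileInfo`` records the new root-relative path.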

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

        if not os.path.exists(location.path):
            raise FileNotFoundError(f"No such file: {location.uri}")

        if self._can_remove_dataset_artifact(ref):
            # Only reference to this path so we can remove it
            os.remove(location.path)

        # Remove rows from registries
        self._remove_from_registry(ref)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:class:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()
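
    # Illustrative check (hypothetical names): comparing a freshly computed
    # digest against the checksum recorded for a dataset.
    #
    #     digest = PosixDatastore.computeChecksum("/repo/a/b.fits", algorithm="blake2b")
    #     assert digest == stored_file_info.checksum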

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            location, storedFileInfo = self._get_dataset_location_info(ref)
            if location is None:
                raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")
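
    # Illustrative usage sketch (variable names are assumptions): exporting
    # dataset locations without transferring any files.
    #
    #     for file_dataset in datastore.export(refs, transfer=None):
    #         log.info("Exported %s via %s", file_dataset.path, file_dataset.formatter)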