Coverage for python/lsst/daf/butler/datastores/posixDatastore.py : 91%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore", )

import hashlib
import logging
import os
import shutil
from typing import TYPE_CHECKING, Iterable, Optional, Type

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetRef

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
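
    Examples
    --------
    A hypothetical sketch of file-based ingest with one of the transfer
    modes listed above; ``dataset`` is assumed to be a pre-built
    `FileDataset` and is not constructed here::

        datastore.ingest(dataset, transfer="symlink")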
68 """
70 defaultConfigFile = "datastores/posixDatastore.yaml"
71 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
72 absolute path. Can be None if no defaults specified.
73 """
75 def __init__(self, config, registry, butlerRoot=None):
76 super().__init__(config, registry, butlerRoot)
78 # Check that root is a valid URI for this datastore
79 root = ButlerURI(self.root)
80 if root.scheme and root.scheme != "file": 80 ↛ 81line 80 didn't jump to line 81, because the condition on line 80 was never true
81 raise ValueError(f"Root location must only be a file URI not {self.root}")
83 self.root = root.path
84 if not os.path.isdir(self.root):
85 if "create" not in self.config or not self.config["create"]: 85 ↛ 86line 85 didn't jump to line 86, because the condition on line 85 was never true
86 raise ValueError(f"No valid root at: {self.root}")
87 safeMakeDir(self.root)

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
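
        Examples
        --------
        A hypothetical sketch; ``ref`` is assumed to come from the
        registry::

            if datastore.exists(ref):
                data = datastore.get(ref)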
101 """
102 location, _ = self._get_dataset_location_info(ref)
103 if location is None:
104 return False
105 return os.path.exists(location.path)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
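
        Examples
        --------
        A hypothetical sketch of retrieving a subset of a dataset; the
        ``parameters`` keys depend entirely on the `StorageClass` involved
        and ``bbox`` is illustrative only::

            subset = datastore.get(ref, parameters={"bbox": bbox})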
131 """
132 getInfo = self._prepare_for_get(ref, parameters)
133 location = getInfo.location
135 # Too expensive to recalculate the checksum on fetch
136 # but we can check size and existence
137 if not os.path.exists(location.path):
138 raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
139 " expected location of {}".format(ref.id, location.path))
140 stat = os.stat(location.path)
141 size = stat.st_size
142 storedFileInfo = getInfo.info
143 if size != storedFileInfo.file_size: 143 ↛ 144line 143 didn't jump to line 144, because the condition on line 143 was never true
144 raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
145 " match recorded size of {}".format(location.path, size,
146 storedFileInfo.file_size))
148 formatter = getInfo.formatter
149 try:
150 result = formatter.read(component=getInfo.component)
151 except Exception as e:
152 raise ValueError(f"Failure from formatter '{formatter.name()}' for Dataset {ref.id}") from e
154 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
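
        Examples
        --------
        A hypothetical sketch; ``catalog`` stands in for any object
        consistent with the `StorageClass` of ``ref``::

            datastore.put(catalog, ref)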
181 """
182 location, formatter = self._prepare_for_put(inMemoryDataset, ref)
184 storageDir = os.path.dirname(location.path)
185 if not os.path.isdir(storageDir):
186 # Never try to remove this after creating it since there might
187 # be a butler ingest process running concurrently that will
188 # already think this directory exists.
189 safeMakeDir(storageDir)
191 # Write the file
192 predictedFullPath = os.path.join(self.root, formatter.predictPath())
194 if os.path.exists(predictedFullPath):
195 raise FileExistsError(f"Cannot write file for ref {ref} as "
196 f"output file {predictedFullPath} already exists")
198 def _removeFileExists(path):
199 """Remove a file and do not complain if it is not there.
201 This is important since a formatter might fail before the file
202 is written and we should not confuse people by writing spurious
203 error messages to the log.
204 """
205 try:
206 os.remove(path)
207 except FileNotFoundError:
208 pass
210 formatter_exception = None
211 with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
212 try:
213 path = formatter.write(inMemoryDataset)
214 log.debug("Wrote file to %s", path)
215 except Exception as e:
216 formatter_exception = e
218 if formatter_exception:
219 raise formatter_exception
221 assert predictedFullPath == os.path.join(self.root, path)
223 info = self._extractIngestInfo(path, ref, formatter=formatter)
224 self._register_datasets([(ref, info)])

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to the dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to the datastore root. Returns `None` if the file
            is outside the root.
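
        Examples
        --------
        A hypothetical sketch, assuming a datastore root of ``/repo``::

            self._pathInStore("/repo/a/b.fits")      # -> "a/b.fits"
            self._pathInStore("/elsewhere/b.fits")   # -> None
            self._pathInStore("a/b.fits")            # -> "a/b.fits"
            self._pathInStore("../b.fits")           # -> None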
258 """
259 if os.path.isabs(path):
260 absRoot = os.path.abspath(self.root)
261 if os.path.commonpath([absRoot, path]) != absRoot: 261 ↛ 263line 261 didn't jump to line 263, because the condition on line 261 was never false
262 return None
263 return os.path.relpath(path, absRoot)
264 elif path.startswith(os.path.pardir):
265 return None
266 return path

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    f"are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Keep the original path for the error message; _pathInStore
            # returns None when the file is outside the datastore root.
            relPath = self._pathInStore(path)
            if relPath is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = relPath
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

        if not os.path.exists(location.path):
            raise FileNotFoundError(f"No such file: {location.uri}")

        if self._can_remove_dataset_artifact(ref):
            # Only reference to this path so we can remove it
            os.remove(location.path)

        # Remove rows from registries
        self._remove_from_registry(ref)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
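
        Examples
        --------
        A hypothetical sketch; the file path is illustrative only::

            digest = PosixDatastore.computeChecksum("/repo/a/b.fits",
                                                    algorithm="md5")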
395 """
396 if algorithm not in hashlib.algorithms_guaranteed: 396 ↛ 397line 396 didn't jump to line 397, because the condition on line 396 was never true
397 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))
399 hasher = hashlib.new(algorithm)
401 with open(filename, "rb") as f:
402 for chunk in iter(lambda: f.read(block_size), b""):
403 hasher.update(chunk)
405 return hasher.hexdigest()

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            location, storedFileInfo = self._get_dataset_location_info(ref)
            if location is None:
                raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")