Coverage for python/lsst/daf/butler/datastores/posixDatastore.py : 90%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore", )

import hashlib
import logging
import os
import shutil
from typing import TYPE_CHECKING, Iterable, Optional, Type

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter

if TYPE_CHECKING:  # coverage: condition was never true at run time
    from lsst.daf.butler import DatasetRef

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
    """

    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be `None` if no defaults are specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":  # coverage: condition was never true
            raise ValueError(f"Root location must be a file URI, not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:  # coverage: condition was never true
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)
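
    # Illustrative sketch (not part of the original module): constructing a
    # datastore against a new root relies on ``create`` being true in the
    # configuration; the ``registry`` object here is an assumed prerequisite.
    #
    #     config = DatastoreConfig("datastores/posixDatastore.yaml")
    #     datastore = PosixDatastore(config, registry, butlerRoot="/tmp/repo")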

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            return False
        return os.path.exists(location.path)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:  # coverage: condition was never true
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for Dataset {ref.id}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)
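
    # Illustrative sketch (not part of the original module): a guarded read,
    # where ``datastore`` and ``ref`` are assumed to come from a configured
    # repository and the ``parameters`` keys are hypothetical, depending on
    # the dataset's StorageClass:
    #
    #     if datastore.exists(ref):
    #         subimage = datastore.get(ref, parameters={"bbox": bbox})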

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        info = self._extractIngestInfo(path, ref, formatter=formatter)
        self._register_datasets([(ref, info)])
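
    # Illustrative sketch (not part of the original module): ``put`` is
    # transactional, so a formatter failure unwinds the partially written
    # file; on success the dataset can be read back immediately:
    #
    #     datastore.put(inMemoryDataset, ref)
    #     assert datastore.exists(ref)
    #     roundTripped = datastore.get(ref)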

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):  # coverage: condition was never false
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer
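
    # Illustrative sketch (not part of the original module): how "auto"
    # resolves for two hypothetical FileDataset objects, one with a path
    # inside the datastore root and one outside it:
    #
    #     datastore._overrideTransferMode(inside_ds, transfer="auto")   # -> None
    #     datastore._overrideTransferMode(outside_ds, transfer="auto")  # -> "link"
    #     datastore._overrideTransferMode(inside_ds, outside_ds,
    #                                     transfer="auto")               # raises ValueError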

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to a dataset. Can be absolute or relative to the
            datastore root.

        Returns
        -------
        inStore : `str`
            Path relative to the datastore root, or `None` if the file is
            outside the root.
        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:  # coverage: condition was never false
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path
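
    # Illustrative sketch (not part of the original module): with an assumed
    # datastore root of ``/repo``:
    #
    #     datastore._pathInStore("a/b.fits")           # -> "a/b.fits"
    #     datastore._pathInStore("/repo/a/b.fits")     # -> "a/b.fits"
    #     datastore._pathInStore("/elsewhere/b.fits")  # -> None
    #     datastore._pathInStore("../outside/b.fits")  # -> None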

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    f"are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Keep the original path for the error message; _pathInStore
            # returns None when the file is outside the datastore root.
            relPath = self._pathInStore(path)
            if relPath is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = relPath
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    # Try hard link and if that fails use a symlink
                    try:
                        os.link(fullPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(fullPath, newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":  # coverage: condition was never false
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

        if not os.path.exists(location.path):  # coverage: condition was never true
            raise FileNotFoundError(f"No such file: {location.uri}")

        if self._can_remove_dataset_artifact(ref):
            # Only reference to this path so we can remove it
            os.remove(location.path)

        # Remove rows from registries
        self._remove_from_registry(ref)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
        """
        if algorithm not in hashlib.algorithms_guaranteed:  # coverage: condition was never true
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()
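
    # Illustrative sketch (not part of the original module): the default
    # algorithm is blake2b, and any hashlib-guaranteed algorithm may be
    # named; the file path here is hypothetical:
    #
    #     digest = PosixDatastore.computeChecksum("data.fits")
    #     sha = PosixDatastore.computeChecksum("data.fits", algorithm="sha256")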

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            location, storedFileInfo = self._get_dataset_location_info(ref)
            if location is None:  # coverage: condition was never true
                raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")
            if transfer is None:  # coverage: condition was never false
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")
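
    # Illustrative sketch (not part of the original module): exporting without
    # transfer yields FileDataset records pointing at in-store paths, e.g. for
    # writing an export manifest:
    #
    #     for fileDataset in datastore.export(refs, transfer=None):
    #         print(fileDataset.path, fileDataset.formatter)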