Coverage for python/lsst/daf/butler/datastores/posixDatastore.py: 91%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

22"""POSIX datastore."""
24from __future__ import annotations
26__all__ = ("PosixDatastore", )
28import hashlib
29import logging
30import os
31import shutil
32from typing import TYPE_CHECKING, Iterable, Optional, Type
34from .fileLikeDatastore import FileLikeDatastore
35from lsst.daf.butler.core.safeFileIo import safeMakeDir
36from lsst.daf.butler.core.utils import transactional
37from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter
39if TYPE_CHECKING: 39 ↛ 40line 39 didn't jump to line 40, because the condition on line 39 was never true
40 from lsst.daf.butler import DatasetRef
42log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem-backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If the root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"link"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
    """

    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must only be a file URI not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)
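
    # Illustrative sketch, not original code: fetching a dataset, optionally
    # with StorageClass-specific parameters. The "slice" key is hypothetical;
    # valid keys depend on the dataset's storage class.
    #
    #     inMemoryDataset = store.get(ref)
    #     subset = store.get(ref, parameters={"slice": mySlice})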

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        info = self._extractIngestInfo(path, ref, formatter=formatter)
        self._register_datasets([(ref, info)])
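
    # Illustrative sketch, not original code: guarding a put the way a
    # `ChainedDatastore` caller would, so a datastore configured to reject
    # this dataset type does not abort the chain. The import path for
    # DatasetTypeNotSupportedError is an assumption here.
    #
    #     from lsst.daf.butler import DatasetTypeNotSupportedError
    #     try:
    #         store.put(inMemoryDataset, ref)
    #     except DatasetTypeNotSupportedError:
    #         pass  # another datastore in the chain may accept it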

    def _overrideTransferMode(self, *datasets: FileDataset,
                              transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer
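
    # Worked example of the "auto" resolution above, with hypothetical paths
    # and the datastore root at "/repo":
    #
    #     every d.path inside "/repo"  -> transfer = None (use files in place)
    #     no d.path inside "/repo"     -> transfer = "link"
    #     a mixture of the two         -> ValueError; pick an explicit mode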

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to datastore root. Returns `None` if the file is
            outside the root.
        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path
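
    # Worked example with hypothetical paths and root = "/repo":
    #
    #     _pathInStore("/repo/a/b.fits")     -> "a/b.fits"
    #     _pathInStore("/elsewhere/b.fits")  -> None (outside the root)
    #     _pathInStore("a/b.fits")           -> "a/b.fits" (already relative)
    #     _pathInStore("../b.fits")          -> None (escapes the root)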

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    f"are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try a hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to use as the relative root;
                # a full file path confuses os.path.relpath into adding
                # an extra "../".
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
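                # Worked example with hypothetical paths: for
                # fullPath = "/data/raw/file.fits" and
                # newFullPath = "/repo/a/b/file.fits", relpath against the
                # directory "/repo/a/b" gives "../../../data/raw/file.fits";
                # relpath against the full file path would add one spurious
                # extra "../".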
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()
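
    # Illustrative sketch, not original code: checksumming a file with the
    # default blake2b algorithm ("/tmp/example.fits" is a made-up path).
    #
    #     digest = PosixDatastore.computeChecksum("/tmp/example.fits")
    #     md5sum = PosixDatastore.computeChecksum("/tmp/example.fits",
    #                                             algorithm="md5")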

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            location, storedFileInfo = self._get_dataset_location_info(ref)
            if location is None:
                raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")
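
    # Illustrative sketch, not original code: with transfer=None, export
    # yields one FileDataset per ref that points at the artifact in place.
    #
    #     for fileDataset in store.export(refs, transfer=None):
    #         print(fileDataset.path, fileDataset.formatter)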