Coverage for python/lsst/daf/butler/datastores/posixDatastore.py : 92%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""POSIX datastore."""
24from __future__ import annotations
26__all__ = ("PosixDatastore", )
28import hashlib
29import logging
30import os
31import shutil
32from typing import Iterable, Optional, Type
34from .fileLikeDatastore import FileLikeDatastore
35from lsst.daf.butler.core.safeFileIo import safeMakeDir
36from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter, DatasetRef
38log = logging.getLogger(__name__)
class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
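
    Examples
    --------
    A minimal construction sketch; the config path and the ``bridgeManager``
    (obtained from a `Registry`) are illustrative, not prescriptive::

        from lsst.daf.butler import DatastoreConfig

        config = DatastoreConfig("datastores/posixDatastore.yaml")
        datastore = PosixDatastore(config, bridgeManager, butlerRoot="/data/repo")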
64 """
    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """
    def __init__(self, config, bridgeManager, butlerRoot=None):
        super().__init__(config, bridgeManager, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":  # coverage: branch never taken
            raise ValueError(f"Root location must only be a file URI not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:  # coverage: branch never taken
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)
    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)
    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)
    def _read_artifact_into_memory(self, getInfo, ref, isComponent=False):
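        # Docstring inherited from FileLikeDatastore._read_artifact_into_memory.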
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:  # coverage: branch never taken
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)
    def _write_in_memory_to_artifact(self, inMemoryDataset, ref):
        # Inherit docstring

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass
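
        # Run the write inside a transaction whose undo action removes any
        # partially written file; a formatter failure is captured and
        # re-raised once the context manager has exited.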
        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        return self._extractIngestInfo(path, ref, formatter=formatter)
    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):  # coverage: condition never false
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer
    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to datastore root. Returns `None` if the file is
            outside the root.
        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:  # coverage: condition never false
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    "are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Retain the original path for the error message; _pathInStore
            # returns None for paths outside the root.
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
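
            # Transfer the existing file into the datastore, registering an
            # undo action with the current transaction for each mode.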
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":  # coverage: condition never false
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)
    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by the `hashlib` module.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
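
        Examples
        --------
        Illustrative only; the file path is hypothetical::

            checksum = PosixDatastore.computeChecksum("/data/repo/file.fits",
                                                      algorithm="sha256")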
318 """
        if algorithm not in hashlib.algorithms_guaranteed:  # coverage: branch never taken
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()
    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            fileLocations = self._get_dataset_locations_info(ref)
            if not fileLocations:  # coverage: branch never taken
                raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
            # For now we cannot export disassembled datasets
            if len(fileLocations) > 1:
                raise NotImplementedError(f"Cannot export disassembled datasets such as {ref}")
            location, storedFileInfo = fileLocations[0]
            if transfer is None:  # coverage: condition never false
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")