Coverage for python/lsst/daf/butler/datastores/posixDatastore.py : 91%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore", )

import hashlib
import logging
import os
import shutil
from typing import TYPE_CHECKING, Iterable, Optional, Type

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetRef

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"link"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer). The `"auto"` mode resolves to `None` when the
    files are already inside the datastore root and to `"link"` when they
    are all outside it.
    """

    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """
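
    # Ingest usage sketch (illustrative only: ``datastore`` and ``ref`` are
    # hypothetical, and `ingest` itself is provided by a base class, which
    # drives the transfer modes listed in the Notes above):
    #
    #     datastore.ingest(FileDataset(path="/tmp/raw.fits", refs=[ref]),
    #                      transfer="symlink")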

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must only be a file URI not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)
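
    # Construction sketch (illustrative; datastores are normally created for
    # you by `Butler` from its configuration rather than instantiated
    # directly, and the root path here is hypothetical):
    #
    #     datastore = PosixDatastore("datastores/posixDatastore.yaml",
    #                                registry, butlerRoot="/data/repo")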

    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError(f"Dataset with Id {ref.id} does not seem to exist at"
                                    f" expected location of {location.path}")
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError(f"Integrity failure in Datastore. Size of file {location.path}"
                               f" ({size}) does not match recorded size of"
                               f" {storedFileInfo.file_size}")

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=getInfo.component is not None)
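
    # `get` usage sketch (illustrative; ``ref`` is a `DatasetRef` obtained
    # through the registry, and which parameters are accepted depends on the
    # dataset's `StorageClass`):
    #
    #     exposure = datastore.get(ref)
    #     cutout = datastore.get(ref, parameters={"bbox": bbox})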

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        info = self._extractIngestInfo(path, ref, formatter=formatter)
        self._register_datasets([(ref, info)])
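
    # `put` usage sketch (illustrative; `Butler.put` is the usual public
    # entry point). Because `put` is `@transactional`, a failure after the
    # formatter has written the file triggers the registered undo, which
    # removes the partial artifact:
    #
    #     datastore.put(exposure, ref)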

    def _overrideTransferMode(self, *datasets: FileDataset,
                              transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer
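
    # How "auto" resolves (summary of the logic above):
    #
    #     # every path outside the root  -> "link"
    #     # every path inside the root   -> None (ingest in place)
    #     # a mixture of the two         -> ValueError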

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to a dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str` or `None`
            Path relative to the datastore root, or `None` if the file is
            outside the root.
        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path
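
    # Behavior sketch, assuming a datastore root of "/data/repo":
    #
    #     _pathInStore("/data/repo/a/b.fits")  # -> "a/b.fits"
    #     _pathInStore("/elsewhere/b.fits")    # -> None
    #     _pathInStore("../b.fits")            # -> None (escapes the root)
    #     _pathInStore("a/b.fits")             # -> "a/b.fits"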

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    f"are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Use a new name for the relative form so the error message
            # below can still report the original path.
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try a hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give as the relative root;
                # a full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError(f"Transfer type '{transfer}' not supported.")
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)
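
    # Sketch of the "relsymlink" path arithmetic, with illustrative values:
    #
    #     fullPath        = "/data/other/raw.fits"
    #     newFullPath     = "/data/repo/raw/r1/raw.fits"
    #     newFullPathRoot = "/data/repo/raw/r1"
    #     os.path.relpath(fullPath, newFullPathRoot)
    #     # -> "../../../other/raw.fits", the target written by os.symlink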

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()
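
    # `computeChecksum` usage sketch (file path illustrative); the file is
    # read in ``block_size`` chunks so large files are never fully buffered:
    #
    #     digest = PosixDatastore.computeChecksum("/data/repo/a/b.fits")
    #     sha = PosixDatastore.computeChecksum("/data/repo/a/b.fits",
    #                                          algorithm="sha256")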

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            location, storedFileInfo = self._get_dataset_location_info(ref)
            if location is None:
                raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")
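
    # `export` usage sketch (``refs`` illustrative); with ``transfer=None``
    # the generator only reports where each artifact lives in the store:
    #
    #     for fileDataset in datastore.export(refs, transfer=None):
    #         print(fileDataset.path, fileDataset.formatter)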