# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""POSIX datastore."""

__all__ = ("PosixDatastore",)

import hashlib
import logging
import os
import shutil
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
    Iterable,
    Optional,
    Type,
    Union,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.utils import safeMakeDir
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter, DatasetRef

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig, Location
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If the root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"link"`, `"symlink"`, `"hardlink"`, `"relsymlink"`,
    and `None` (no transfer).
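
    Examples
    --------
    A minimal construction sketch (the config file name, ``bridgeManager``
    object, and root path are illustrative; a datastore is normally
    constructed for you by a `Butler` rather than instantiated directly):

    >>> datastore = PosixDatastore("datastores/posixDatastore.yaml",  # doctest: +SKIP
    ...                            bridgeManager, butlerRoot="/my/repo")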

    """

    defaultConfigFile: ClassVar[Optional[str]] = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must be a file URI, not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        # Inherit docstring

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path: str) -> None:
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        if self._transaction is None:
            raise RuntimeError("Attempting to write dataset without transaction enabled")

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e
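                # Re-raised below, once the undo context manager has exited.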

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        return self._extractIngestInfo(path, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
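        # "auto" resolves to `None` (no transfer) when every path is already
        # inside the datastore root, to "link" when every path is outside,
        # and is an error for a mixture.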

        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to datastore root. Returns `None` if the file is
            outside the root.
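
        Examples
        --------
        Illustrative behaviour, assuming ``self.root`` is ``"/repo"``
        (hypothetical paths; paths outside the root yield `None`):

        >>> datastore._pathInStore("/repo/a/b.fits")  # doctest: +SKIP
        'a/b.fits'
        >>> datastore._pathInStore("/elsewhere/x.fits")  # doctest: +SKIP
        >>> datastore._pathInStore("../outside.fits")  # doctest: +SKIP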

        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    "are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Cannot reuse the path variable because of typing
            pathx = self._pathInStore(path)
            if pathx is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathx
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        if self._transaction is None:
            raise RuntimeError("Ingest called without transaction enabled")

        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
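            # Each transfer mode registers an undo action with the current
            # transaction so that a failed ingest can restore the original
            # state (move the file back, or remove the new file or link).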

            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        checksum = self.computeChecksum(fullPath) if self.useChecksum else None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by `hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
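
        Examples
        --------
        A minimal sketch (hypothetical file name):

        >>> digest = PosixDatastore.computeChecksum("data.fits")  # doctest: +SKIP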

        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)
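        # Stream the file in fixed-size chunks so that large files can be
        # hashed without reading them entirely into memory.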

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            fileLocations = self._get_dataset_locations_info(ref)
            if not fileLocations:
                raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
            # For now we cannot export disassembled datasets
            if len(fileLocations) > 1:
                raise NotImplementedError(f"Cannot export disassembled datasets such as {ref}")
            location, storedFileInfo = fileLocations[0]
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")
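

# A minimal usage sketch for export (hypothetical ``datastore`` and ``refs``;
# only ``transfer=None`` is supported by this method at present):
#
#     for dataset in datastore.export(refs, transfer=None):
#         print(dataset.path, dataset.formatter)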