# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore",)

import hashlib
import logging
import os
import shutil
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
    Iterable,
    Optional,
    Type,
    Union,
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.utils import safeMakeDir
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter, DatasetRef

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig, Location
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
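
    Examples
    --------
    A minimal construction sketch, not taken from this module: the config
    name matches ``defaultConfigFile`` below, while ``bridgeManager`` and
    the repository path are placeholders supplied by the caller::

        datastore = PosixDatastore("datastores/posixDatastore.yaml",
                                   bridgeManager, butlerRoot="/path/to/repo")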

    """

    defaultConfigFile: ClassVar[Optional[str]] = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be `None` if no defaults are specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        # Check that root is a valid URI for this datastore.
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must be a 'file' URI, not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        # Docstring inherited from FileLikeDatastore.
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch,
        # but we can check size and existence.
        if not os.path.exists(location.path):
            raise FileNotFoundError(f"Dataset with Id {ref.id} does not seem to exist at"
                                    f" expected location of {location.path}")
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError(f"Integrity failure in Datastore. Size of file {location.path}"
                               f" ({size}) does not match recorded size of"
                               f" {storedFileInfo.file_size}")

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore.

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file.
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path: str) -> None:
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        if self._transaction is None:
            raise RuntimeError("Attempting to write dataset without transaction enabled")

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e
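
        # Re-raise any formatter failure here, outside the undoWith block,
        # so the exception does not propagate through the transaction
        # context manager itself.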
        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        return self._extractIngestInfo(path, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not.
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to the dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str` or `None`
            Path relative to the datastore root, or `None` if the file is
            outside the root.
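
        Examples
        --------
        Illustrative behavior only, assuming a datastore root of ``/repo``::

            >>> store._pathInStore("/repo/a/b.fits")
            'a/b.fits'
            >>> store._pathInStore("/elsewhere/b.fits") is None
            True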

        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest"
                                    " are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Cannot reuse the path variable directly because of typing:
            # _pathInStore returns Optional[str].
            pathx = self._pathInStore(path)
            if pathx is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathx
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        if self._transaction is None:
            raise RuntimeError("Ingest called without transaction enabled")

        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            # Work out the name we want this ingested file to have
            # inside the datastore.
            location = self._calculate_ingested_datastore_name(ButlerURI(fullPath), ref, formatter)

            newPath = location.pathInStore
            newFullPath = location.path
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
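
            # Each transfer branch below registers an undo action with the
            # current transaction so the operation can be reversed if the
            # transaction is rolled back.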
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try a hard link and if that fails use a symlink.
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks.
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks.
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path.
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give to relative root;
                # a full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError(f"Transfer type '{transfer}' not supported.")
            path = newPath
            fullPath = newFullPath
        checksum = self.computeChecksum(fullPath) if self.useChecksum else None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from the file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
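
        Examples
        --------
        A minimal sketch with a hypothetical file path::

            >>> digest = PosixDatastore.computeChecksum("/tmp/example.fits")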

        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")

        hasher = hashlib.new(algorithm)

        # Read the file in fixed-size chunks to avoid loading it all
        # into memory.
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
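        # Note that this is a generator: datasets are exported lazily as
        # the caller iterates over the yielded FileDataset objects.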
        for ref in refs:
            fileLocations = self._get_dataset_locations_info(ref)
            if not fileLocations:
                raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
            # For now we cannot export disassembled datasets.
            if len(fileLocations) > 1:
                raise NotImplementedError(f"Cannot export disassembled datasets such as {ref}")
            location, storedFileInfo = fileLocations[0]
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")