Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""POSIX datastore.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("PosixDatastore", ) 

27 

28import hashlib 

29import logging 

30import os 

31import shutil 

32from typing import Iterable, Optional, Type 

33 

34from .fileLikeDatastore import FileLikeDatastore 

35from lsst.daf.butler.core.safeFileIo import safeMakeDir 

36from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter, DatasetRef 

37 

38log = logging.getLogger(__name__) 

39 

40 

class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
    """

    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, bridgeManager, butlerRoot=None):
        super().__init__(config, bridgeManager, butlerRoot)

        # Check that root is a valid URI for this datastore; only plain
        # filesystem paths or explicit file: URIs are acceptable.
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must only be a file URI not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            True if the location can be found, false otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def _read_artifact_into_memory(self, getInfo, ref, isComponent=False):
        # Docstring inherited from FileLikeDatastore.
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence.
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            # Wrap any formatter failure so the dataset identity is reported;
            # chain the original exception for debugging.
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset, ref):
        # Inherit docstring

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        # Capture the formatter failure so the transaction context can
        # register its undo action before the exception propagates.
        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        return self._extractIngestInfo(path, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            # Files already in the datastore need no transfer.
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return path relative to datastore root.

        Parameters
        ----------
        path : `str`
            Path to dataset. Can be absolute path.

        Returns
        -------
        inStore : `str` or `None`
            Path relative to datastore root. Returns `None` if the file is
            outside the root.
        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            # A relative path that escapes the root (starts with "..").
            return None
        return path

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    f"are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Bind to a new name so the error message below can report the
            # original path rather than None.
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            # Work out the destination location inside the datastore from
            # the file template for this dataset type.
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:class`hashlib`.
        block_size : `int`
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.

        Raises
        ------
        NameError
            If the requested algorithm is not supported by :py:mod:`hashlib`.
        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        # Stream the file in fixed-size chunks so large files do not need
        # to be read into memory all at once.
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            fileLocations = self._get_dataset_locations_info(ref)
            if not fileLocations:
                raise FileNotFoundError(f"Could not retrieve dataset {ref}.")
            # For now we can not export disassembled datasets
            if len(fileLocations) > 1:
                raise NotImplementedError(f"Can not export disassembled datasets such as {ref}")
            location, storedFileInfo = fileLocations[0]
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")