# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore",)

import hashlib
import logging
import os
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
    Optional,
    Type,
    Union,
)

from lsst.daf.butler import ButlerURI, DatasetRef, FileDataset, Formatter, StoredFileInfo
from lsst.daf.butler.core.utils import safeMakeDir

from .fileLikeDatastore import FileLikeDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import DatastoreConfig, Location
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

    from .fileLikeDatastore import DatastoreFileGetInformation

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If the root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    ``"move"``, ``"copy"``, ``"symlink"``, ``"hardlink"``, ``"relsymlink"``
    and `None` (no transfer).
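
    As a hedged sketch (not taken from this module's tests): assuming an
    existing ``datastore`` instance and a `FileDataset` named ``dataset``
    that points at an external file, an ingest with an explicit transfer
    mode would look like::

        datastore.ingest(dataset, transfer="copy")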

    """

    defaultConfigFile: ClassVar[Optional[str]] = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be `None` if no defaults are specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root, forceDirectory=True)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must be a file URI, not {self.root}")

        self.root = root.ospath
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)
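
    # A minimal construction sketch (hypothetical values; in practice the
    # ``bridgeManager`` is supplied by the Butler/Registry machinery rather
    # than built by hand):
    #
    #     datastore = PosixDatastore("datastores/posixDatastore.yaml",
    #                                bridgeManager, butlerRoot="/data/repo")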

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch,
        # but we can check size and existence.
        if not os.path.exists(location.path):
            raise FileNotFoundError(f"Dataset with Id {ref.id} does not seem to exist at"
                                    f" expected location of {location.path}")
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError(f"Integrity failure in Datastore. Size of file {location.path}"
                               f" ({size}) does not match recorded size of"
                               f" {storedFileInfo.file_size}")

        formatter = getInfo.formatter
        try:
            log.debug("Reading %s from location %s with formatter %s",
                      f"component {getInfo.component}" if isComponent else "dataset",
                      location.uri, type(formatter).__name__)
            result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        # Inherit docstring

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path: str) -> None:
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        if self._transaction is None:
            raise RuntimeError("Attempting to write dataset without transaction enabled")

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        return self._extractIngestInfo(path, ref, formatter=formatter)
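
    # The ``undoWith`` block above registers ``_removeFileExists`` as the
    # undo action: if the surrounding butler transaction is rolled back after
    # the formatter has written (or partially written) the file, the artifact
    # is removed again, keeping the datastore consistent with the registry.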

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer
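
    # Illustrative resolution of ``transfer="auto"`` (paths hypothetical):
    # if every dataset path already lies inside the datastore root, `None`
    # (ingest in place) is chosen; if none do, "link" is chosen; a mixture
    # raises `ValueError`.
    #
    #     datastore._overrideTransferMode(
    #         FileDataset(path="/elsewhere/a.fits", refs=[ref]),
    #         transfer="auto")  # -> "link"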

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to the dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to the datastore root, or `None` if the file is
            outside the root.
        """
        pathUri = ButlerURI(path, forceAbsolute=False)
        rootUri = ButlerURI(self.root, forceDirectory=True, forceAbsolute=True)
        return pathUri.relative_to(rootUri)
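
    # For example (hypothetical values): with a datastore root of
    # "/data/repo", ``_pathInStore("/data/repo/a/b.fits")`` would return
    # "a/b.fits", while ``_pathInStore("/elsewhere/c.fits")`` would return
    # `None`.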

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest"
                                    " are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Cannot reuse the path variable here because of typing.
            pathx = self._pathInStore(path)
            if pathx is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathx
        return path

    def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        if self._transaction is None:
            raise RuntimeError("Ingest called without transaction enabled")

        # Calculate the full path to the source
        srcUri = ButlerURI(path, root=self.root, forceAbsolute=True)
        if transfer is None:
            # File should exist already
            rootUri = ButlerURI(self.root, forceDirectory=True)
            pathInStore = srcUri.relative_to(rootUri)
            if pathInStore is None:
                raise RuntimeError(f"Unexpectedly learned that {srcUri} is not within datastore {rootUri}")
            if not srcUri.exists():
                raise RuntimeError(f"Unexpectedly discovered that {srcUri} does not exist inside datastore"
                                   f" {rootUri}")
            path = pathInStore
            fullPath = srcUri.ospath
        else:
            # Work out the name we want this ingested file to have
            # inside the datastore
            location = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
            path = location.pathInStore
            fullPath = location.path
            targetUri = ButlerURI(location.uri)
            targetUri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)

        checksum = self.computeChecksum(fullPath) if self.useChecksum else None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
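
        Examples
        --------
        A minimal sketch; ``"data.fits"`` stands in for any existing file::

            digest = PosixDatastore.computeChecksum("data.fits")
            md5digest = PosixDatastore.computeChecksum("data.fits",
                                                       algorithm="md5")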

        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            # Read the file in fixed-size chunks to keep memory use bounded.
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()
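

# A hedged end-to-end sketch (repository path, dataset type and data ID are
# illustrative, not part of this module). A repository whose datastore
# configuration selects PosixDatastore is normally exercised through the
# Butler rather than directly:
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/data/repo", run="demo")
#     butler.put(obj, "rawexp", dataId)    # stores via _write_in_memory_to_artifact
#     obj2 = butler.get("rawexp", dataId)  # loads via _read_artifact_into_memory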