
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""POSIX datastore."""

__all__ = ("PosixDatastore",)

import hashlib
import logging
import os
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
    Optional,
    Type,
    Union,
)

from lsst.daf.butler import ButlerURI, DatasetRef, FileDataset, Formatter, StoredFileInfo
from lsst.daf.butler.core.utils import safeMakeDir

from .fileLikeDatastore import FileLikeDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import DatastoreConfig, Location
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

    from .fileLikeDatastore import DatastoreFileGetInformation

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem-backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If the root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`,
    and `None` (no transfer).
    """

    defaultConfigFile: ClassVar[Optional[str]] = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be `None` if no defaults are specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        # Check that the root is a valid URI for this datastore.
        root = ButlerURI(self.root, forceDirectory=True)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must be a file URI, not {self.root}")

        self.root = root.ospath
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)
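
    # A minimal construction sketch (illustrative only): the config file name
    # below is this class's default, but ``bridgeManager`` is assumed to come
    # from an existing `Registry` and is not defined in this module.
    #
    #     datastore = PosixDatastore("datastores/posixDatastore.yaml",
    #                                bridgeManager, butlerRoot="/tmp/butler_root")
    #
    # With ``create: true`` in the configuration, a missing root directory is
    # created via `safeMakeDir` rather than raising `ValueError`.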

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch,
        # but we can check size and existence.
        if not os.path.exists(location.path):
            raise FileNotFoundError(f"Dataset with Id {ref.id} does not seem to exist at"
                                    f" expected location of {location.path}")
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError(f"Integrity failure in Datastore. Size of file {location.path}"
                               f" ({size}) does not match recorded size of"
                               f" {storedFileInfo.file_size}")

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        # Inherit docstring.

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file.
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path: str) -> None:
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        if self._transaction is None:
            raise RuntimeError("Attempting to write dataset without transaction enabled")

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        return self._extractIngestInfo(path, ref, formatter=formatter)
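
    # The write above is wrapped in the datastore transaction so that a
    # rolled-back ``put`` leaves no artifact behind. The pattern in isolation
    # (a sketch; ``transaction`` stands in for ``self._transaction``):
    #
    #     with transaction.undoWith("write", _removeFileExists, predictedFullPath):
    #         path = formatter.write(inMemoryDataset)
    #
    # On rollback, the registered undo callable ``_removeFileExists`` is
    # invoked with ``predictedFullPath`` to delete any partially-written file.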

    def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not.
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer
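
    # How "auto" resolves in practice (a sketch with illustrative paths,
    # assuming the datastore root is ``/repo``):
    #
    #     inside = [FileDataset(path="/repo/data/raw.fits", refs=ref)]
    #     datastore._overrideTransferMode(*inside, transfer="auto")    # -> None
    #
    #     outside = [FileDataset(path="/elsewhere/raw.fits", refs=ref)]
    #     datastore._overrideTransferMode(*outside, transfer="auto")   # -> "link"
    #
    # Mixing inside and outside paths raises `ValueError`.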

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to the dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to the datastore root, or `None` if the file is
            outside the root.
        """
        pathUri = ButlerURI(path, forceAbsolute=False)
        rootUri = ButlerURI(self.root, forceDirectory=True, forceAbsolute=True)
        return pathUri.relative_to(rootUri)
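
    # The two outcomes, sketched (assuming the datastore root is ``/repo``):
    #
    #     self._pathInStore("/repo/a/b.fits")    # -> "a/b.fits"
    #     self._pathInStore("/other/b.fits")     # -> None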

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest"
                                    " are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Cannot reuse the path variable because of typing.
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path

    def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        if self._transaction is None:
            raise RuntimeError("Ingest called without transaction enabled")

        # Calculate the full path to the source.
        srcUri = ButlerURI(path, root=self.root, forceAbsolute=True)
        if transfer is None:
            # File should exist already.
            rootUri = ButlerURI(self.root, forceDirectory=True)
            pathInStore = srcUri.relative_to(rootUri)
            if pathInStore is None:
                raise RuntimeError(f"Unexpectedly learned that {srcUri} is not within datastore {rootUri}")
            if not srcUri.exists():
                raise RuntimeError(f"Unexpectedly discovered that {srcUri} does not exist inside datastore"
                                   f" {rootUri}")
            path = pathInStore
            fullPath = srcUri.ospath
        else:
            # Work out the name we want this ingested file to have
            # inside the datastore.
            location = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
            path = location.pathInStore
            fullPath = location.path
            targetUri = ButlerURI(location.uri)
            targetUri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)

        checksum = self.computeChecksum(fullPath) if self.useChecksum else None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from the file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()
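

# A quick checksum sketch (illustrative; the file name is hypothetical):
#
#     digest = PosixDatastore.computeChecksum("/tmp/example.fits",
#                                             algorithm="blake2b", block_size=8192)
#
# Reading in ``block_size`` chunks keeps memory use flat for large files, and
# `hashlib.algorithms_guaranteed` restricts the choice to algorithms that are
# available on every platform.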