# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore", )

import hashlib
import logging
import os
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
    Optional,
    Type,
    Union
)

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.utils import safeMakeDir
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter, DatasetRef

if TYPE_CHECKING:
    from .fileLikeDatastore import DatastoreFileGetInformation
    from lsst.daf.butler import DatastoreConfig, Location
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If the root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
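
    Examples
    --------
    Datastores are normally constructed for you as part of a `Butler`
    repository rather than directly. As a rough sketch only, with
    hypothetical argument values and assuming a ``bridgeManager`` is
    already in hand::

        datastore = PosixDatastore("datastores/posixDatastore.yaml",
                                   bridgeManager, butlerRoot="/data/repo")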

    """

    defaultConfigFile: ClassVar[Optional[str]] = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    def __init__(self, config: Union[DatastoreConfig, str],
                 bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root, forceDirectory=True)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must only be a file URI, not {self.root}")

        self.root = root.ospath
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def _artifact_exists(self, location: Location) -> bool:
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        return os.path.exists(location.path)

    def _delete_artifact(self, location: Location) -> None:
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        os.remove(location.path)

    def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef, isComponent: bool = False) -> Any:
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch,
        # but we can check size and existence.
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            log.debug("Reading %s from location %s with formatter %s",
                      f"component {getInfo.component}" if isComponent else "",
                      location.uri, type(formatter).__name__)
            result = formatter.read(component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                             f" ({ref.datasetType.name} from {location.path}): {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams,
                                      isComponent=isComponent)

    def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo:
        # Inherit docstring

        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            # Assume that by this point if registry thinks the file should
            # not exist then the file should not exist and therefore we can
            # overwrite it. This can happen if a put was interrupted by
            # an external interrupt. The only time this could be problematic
            # is if the file template is incomplete and multiple dataset
            # refs result in identical filenames.
            log.warning("Object %s exists in datastore for ref %s", location.uri, ref)

        def _removeFileExists(path: str) -> None:
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        if self._transaction is None:
            raise RuntimeError("Attempting to write dataset without transaction enabled")

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        return self._extractIngestInfo(path, ref, formatter=formatter)

    def _overrideTransferMode(self, *datasets: FileDataset,
                              transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to the dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to the datastore root. Returns `None` if the file
            is outside the root.
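
        Examples
        --------
        A sketch of the expected behavior only, with hypothetical paths and
        assuming a datastore rooted at ``/data/repo``::

            datastore._pathInStore("/data/repo/a/b.fits")  # -> "a/b.fits"
            datastore._pathInStore("/elsewhere/b.fits")    # -> None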

        """
        pathUri = ButlerURI(path, forceAbsolute=False)
        rootUri = ButlerURI(self.root, forceDirectory=True, forceAbsolute=True)
        return pathUri.relative_to(rootUri)

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    "are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Can not reuse path var because of typing
            pathx = self._pathInStore(path)
            if pathx is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathx
        return path

    def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *,
                           formatter: Union[Formatter, Type[Formatter]],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        if self._transaction is None:
            raise RuntimeError("Ingest called without transaction enabled")

        # Calculate the full path to the source
        srcUri = ButlerURI(path, root=self.root, forceAbsolute=True)
        if transfer is None:
            # File should exist already
            rootUri = ButlerURI(self.root, forceDirectory=True)
            pathInStore = srcUri.relative_to(rootUri)
            if pathInStore is None:
                raise RuntimeError(f"Unexpectedly learned that {srcUri} is not within datastore {rootUri}")
            if not srcUri.exists():
                raise RuntimeError(f"Unexpectedly discovered that {srcUri} does not exist inside datastore"
                                   f" {rootUri}")
            path = pathInStore
            fullPath = srcUri.ospath
        else:
            # Work out the name we want this ingested file to have
            # inside the datastore
            location = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
            path = location.pathInStore
            fullPath = location.path
            targetUri = ButlerURI(location.uri)
            targetUri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction)

        checksum = self.computeChecksum(fullPath) if self.useChecksum else None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path,
                              storageClass=ref.datasetType.storageClass,
                              component=ref.datasetType.component(),
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
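
        Examples
        --------
        A minimal usage sketch; the filename is hypothetical and must refer
        to an existing file::

            digest = PosixDatastore.computeChecksum("example.dat",
                                                    algorithm="md5")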

        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        # Read in fixed-size chunks so that large files are never loaded
        # into memory in one go.
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()