# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore", )

import hashlib
import logging
import os
import shutil
from typing import TYPE_CHECKING, Iterable, Optional, Type

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetRef

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).
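
    Examples
    --------
    A minimal sketch of direct use; normally a datastore is constructed and
    driven by a `Butler` from its configuration, and the config file, root
    path, registry and dataset reference below are illustrative only::

        config = DatastoreConfig("datastores/posixDatastore.yaml")
        datastore = PosixDatastore(config, registry, butlerRoot="/data/repo")
        datastore.put(inMemoryDataset, ref)
        retrieved = datastore.get(ref)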

68 """ 

69 

70 defaultConfigFile = "datastores/posixDatastore.yaml" 

71 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

72 absolute path. Can be None if no defaults specified. 

73 """ 

74 

75 def __init__(self, config, registry, butlerRoot=None): 

76 super().__init__(config, registry, butlerRoot) 

77 

78 # Check that root is a valid URI for this datastore 

79 root = ButlerURI(self.root) 

80 if root.scheme and root.scheme != "file": 80 ↛ 81line 80 didn't jump to line 81, because the condition on line 80 was never true

81 raise ValueError(f"Root location must only be a file URI not {self.root}") 

82 

83 self.root = root.path 

84 if not os.path.isdir(self.root): 

85 if "create" not in self.config or not self.config["create"]: 85 ↛ 86line 85 didn't jump to line 86, because the condition on line 85 was never true

86 raise ValueError(f"No valid root at: {self.root}") 

87 safeMakeDir(self.root) 

88 

89 def _artifact_exists(self, location): 

90 """Check that an artifact exists in this datastore at the specified 

91 location. 

92 

93 Parameters 

94 ---------- 

95 location : `Location` 

96 Expected location of the artifact associated with this datastore. 

97 

98 Returns 

99 ------- 

100 exists : `bool` 

101 True if the location can be found, false otherwise. 

102 """ 

103 return os.path.exists(location.path) 

104 

105 def _delete_artifact(self, location): 

106 """Delete the artifact from the datastore. 

107 

108 Parameters 

109 ---------- 

110 location : `Location` 

111 Location of the artifact associated with this datastore. 

112 """ 

113 os.remove(location.path) 

114 

115 def get(self, ref, parameters=None): 

116 """Load an InMemoryDataset from the store. 

117 

118 Parameters 

119 ---------- 

120 ref : `DatasetRef` 

121 Reference to the required Dataset. 

122 parameters : `dict` 

123 `StorageClass`-specific parameters that specify, for example, 

124 a slice of the dataset to be loaded. 

125 

126 Returns 

127 ------- 

128 inMemoryDataset : `object` 

129 Requested dataset or slice thereof as an InMemoryDataset. 

130 

131 Raises 

132 ------ 

133 FileNotFoundError 

134 Requested dataset can not be retrieved. 

135 TypeError 

136 Return value from formatter has unexpected type. 

137 ValueError 

138 Formatter failed to process the dataset. 
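
        Examples
        --------
        A sketch only; how the `DatasetRef` is obtained and which
        ``parameters`` are accepted depend on the dataset's `StorageClass`::

            exposure = datastore.get(ref)
            cutout = datastore.get(ref, parameters={"bbox": bbox})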

139 """ 

140 getInfo = self._prepare_for_get(ref, parameters) 

141 location = getInfo.location 

142 

143 # Too expensive to recalculate the checksum on fetch 

144 # but we can check size and existence 

145 if not os.path.exists(location.path): 

146 raise FileNotFoundError("Dataset with Id {} does not seem to exist at" 

147 " expected location of {}".format(ref.id, location.path)) 

148 stat = os.stat(location.path) 

149 size = stat.st_size 

150 storedFileInfo = getInfo.info 

151 if size != storedFileInfo.file_size: 151 ↛ 152line 151 didn't jump to line 152, because the condition on line 151 was never true

152 raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not" 

153 " match recorded size of {}".format(location.path, size, 

154 storedFileInfo.file_size)) 

155 

156 formatter = getInfo.formatter 

157 try: 

158 result = formatter.read(component=getInfo.component) 

159 except Exception as e: 

160 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

161 f" ({ref.datasetType.name} from {location.path}): {e}") from e 

162 

163 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

164 isComponent=getInfo.component is not None) 

165 

166 @transactional 

167 def put(self, inMemoryDataset, ref): 

168 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
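
        Examples
        --------
        A sketch only; both names below are illustrative and the stored
        object must be consistent with the `StorageClass` of ``ref``::

            datastore.put(catalog, ref)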

191 """ 

192 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

193 

194 storageDir = os.path.dirname(location.path) 

195 if not os.path.isdir(storageDir): 

196 # Never try to remove this after creating it since there might 

197 # be a butler ingest process running concurrently that will 

198 # already think this directory exists. 

199 safeMakeDir(storageDir) 

200 

201 # Write the file 

202 predictedFullPath = os.path.join(self.root, formatter.predictPath()) 

203 

204 if os.path.exists(predictedFullPath): 

205 raise FileExistsError(f"Cannot write file for ref {ref} as " 

206 f"output file {predictedFullPath} already exists") 

207 

208 def _removeFileExists(path): 

209 """Remove a file and do not complain if it is not there. 

210 

211 This is important since a formatter might fail before the file 

212 is written and we should not confuse people by writing spurious 

213 error messages to the log. 

214 """ 

215 try: 

216 os.remove(path) 

217 except FileNotFoundError: 

218 pass 

219 

220 formatter_exception = None 

221 with self._transaction.undoWith("write", _removeFileExists, predictedFullPath): 

222 try: 

223 path = formatter.write(inMemoryDataset) 

224 log.debug("Wrote file to %s", path) 

225 except Exception as e: 

226 formatter_exception = e 

227 

228 if formatter_exception: 

229 raise formatter_exception 

230 

231 assert predictedFullPath == os.path.join(self.root, path) 

232 

233 info = self._extractIngestInfo(path, ref, formatter=formatter) 

234 self._register_datasets([(ref, info)]) 

235 

236 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> str: 

237 # Docstring inherited from base class 

238 if transfer != "auto": 

239 return transfer 

240 

241 # See if the paths are within the datastore or not 

242 inside = [self._pathInStore(d.path) is not None for d in datasets] 
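        # Files already under the datastore root can be ingested in place
        # (no transfer), files entirely outside the root are linked in, and
        # a mixed set is ambiguous, so in that case an explicit transfer
        # mode must be requested instead of "auto".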

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to the dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to the datastore root. Returns `None` if the file
            is outside the root.
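
        Examples
        --------
        Illustrative values only, for a datastore rooted at ``/repo``::

            self._pathInStore("/repo/a/b.fits")   # -> "a/b.fits"
            self._pathInStore("/other/b.fits")    # -> None
            self._pathInStore("a/b.fits")         # -> "a/b.fits"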

268 """ 

269 if os.path.isabs(path): 

270 absRoot = os.path.abspath(self.root) 

271 if os.path.commonpath([absRoot, path]) != absRoot: 271 ↛ 273line 271 didn't jump to line 273, because the condition on line 271 was never false

272 return None 

273 return os.path.relpath(path, absRoot) 

274 elif path.startswith(os.path.pardir): 

275 return None 

276 return path 

277 

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    f"are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
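
        Examples
        --------
        A sketch with an illustrative file name; any algorithm guaranteed by
        :py:mod:`hashlib` can be requested::

            digest = PosixDatastore.computeChecksum("data/file.fits",
                                                    algorithm="md5")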

370 """ 

371 if algorithm not in hashlib.algorithms_guaranteed: 371 ↛ 372line 371 didn't jump to line 372, because the condition on line 371 was never true

372 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

373 

374 hasher = hashlib.new(algorithm) 

375 

376 with open(filename, "rb") as f: 

377 for chunk in iter(lambda: f.read(block_size), b""): 

378 hasher.update(chunk) 

379 

380 return hasher.hexdigest() 

381 

382 def export(self, refs: Iterable[DatasetRef], *, 

383 directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]: 

384 # Docstring inherited from Datastore.export. 

385 for ref in refs: 

386 location, storedFileInfo = self._get_dataset_location_info(ref) 

387 if location is None: 387 ↛ 388line 387 didn't jump to line 388, because the condition on line 387 was never true

388 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

389 if transfer is None: 389 ↛ 395line 389 didn't jump to line 395, because the condition on line 389 was never false
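                # With no transfer requested the dataset is exported in
                # place: the path is reported relative to the datastore root
                # along with the formatter needed to read it back.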

                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")