# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore", )

import hashlib
import logging
import os
import shutil
from typing import TYPE_CHECKING, Iterable, Optional, Type

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetRef

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, and `None` (no transfer).
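
    Examples
    --------
    A minimal construction sketch; ``"datastore.yaml"``, ``registry``, and
    the root path are hypothetical stand-ins for a real configuration file,
    `Registry`, and repository location::

        datastore = PosixDatastore("datastore.yaml", registry,
                                   butlerRoot="/data/repo")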

    """

    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must be a file URI, not {self.root}")

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            return False
        return os.path.exists(location.path)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
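
        Examples
        --------
        A usage sketch; ``ref`` is assumed to be a `DatasetRef` already
        known to this datastore::

            inMemoryDataset = datastore.get(ref)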

        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for Dataset {ref.id}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
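
        Examples
        --------
        A minimal sketch; ``catalog`` and ``ref`` are hypothetical stand-ins
        for an in-memory dataset and its associated `DatasetRef`::

            datastore.put(catalog, ref)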

        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        info = self._extractIngestInfo(path, ref, formatter=formatter)
        self._register_datasets([(ref, info)])

    def _overrideTransferMode(self, *datasets: FileDataset,
                              transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return the path relative to the datastore root.

        Parameters
        ----------
        path : `str`
            Path to the dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str` or `None`
            Path relative to the datastore root. Returns `None` if the file
            is outside the root.
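
        Examples
        --------
        Behaviour sketch, assuming a datastore rooted at ``/repo``
        (a hypothetical path)::

            datastore._pathInStore("/repo/calib/flat.fits")  # -> "calib/flat.fits"
            datastore._pathInStore("/elsewhere/flat.fits")   # -> None
            datastore._pathInStore("calib/flat.fits")        # -> "calib/flat.fits"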

        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    "are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # _pathInStore returns None for paths outside the datastore;
            # report the original path in the error message in that case.
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
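        # Summary of the flow below: when a transfer mode is given, the file
        # is first relocated (moved, copied, or linked) to the path predicted
        # by the formatter, and only then are its size and optional checksum
        # recorded for the returned StoredFileInfo.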

        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                with self._transaction.undoWith("mkdir", os.rmdir, storageDir):
                    safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    # Try a hard link and if that fails use a symlink
                    try:
                        os.link(fullPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(fullPath, newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

        if not os.path.exists(location.path):
            raise FileNotFoundError(f"No such file: {location.uri}")

        if self._can_remove_dataset_artifact(ref):
            # Only reference to this path so we can remove it
            os.remove(location.path)

        # Remove rows from registries
        self._remove_from_registry(ref)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.
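
        Examples
        --------
        A small sketch; ``"data.fits"`` is a hypothetical file name::

            digest = PosixDatastore.computeChecksum("data.fits", algorithm="md5")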

        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        for ref in refs:
            location, storedFileInfo = self._get_dataset_location_info(ref)
            if location is None:
                raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")