# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""POSIX datastore."""

from __future__ import annotations

__all__ = ("PosixDatastore",)

import hashlib
import logging
import os
import shutil
from typing import TYPE_CHECKING, Iterable, Optional, Type

from .fileLikeDatastore import FileLikeDatastore
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler import ButlerURI, FileDataset, StoredFileInfo, Formatter

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetRef

log = logging.getLogger(__name__)


class PosixDatastore(FileLikeDatastore):
    """Basic POSIX filesystem backed Datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. A string should refer to the name of the config file.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.

    Notes
    -----
    PosixDatastore supports all transfer modes for file-based ingest:
    `"move"`, `"copy"`, `"symlink"`, `"hardlink"`, `"relsymlink"`
    and `None` (no transfer).

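    Examples
    --------
    A minimal construction-and-ingest sketch; the configuration file name
    and the ``registry`` and ``ref`` objects here are placeholders:

    >>> datastore = PosixDatastore("datastore.yaml", registry)  # doctest: +SKIP
    >>> datastore.ingest(FileDataset(path="file.fits", refs=[ref]),
    ...                  transfer="symlink")  # doctest: +SKIP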
    """

    defaultConfigFile = "datastores/posixDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry, butlerRoot)

        # Check that root is a valid URI for this datastore
        root = ButlerURI(self.root)
        if root.scheme and root.scheme != "file":
            raise ValueError(f"Root location must be a file URI, not {self.root}")
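        # The check above accepts plain paths and "file:" URIs, for example
        # "/repo/root" or "file:///repo/root", but rejects "s3://bucket/root".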

        self.root = root.path
        if not os.path.isdir(self.root):
            if "create" not in self.config or not self.config["create"]:
                raise ValueError(f"No valid root at: {self.root}")
            safeMakeDir(self.root)

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            return False
        return os.path.exists(location.path)

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Too expensive to recalculate the checksum on fetch
        # but we can check size and existence
        if not os.path.exists(location.path):
            raise FileNotFoundError("Dataset with Id {} does not seem to exist at"
                                    " expected location of {}".format(ref.id, location.path))
        stat = os.stat(location.path)
        size = stat.st_size
        storedFileInfo = getInfo.info
        if size != storedFileInfo.file_size:
            raise RuntimeError("Integrity failure in Datastore. Size of file {} ({}) does not"
                               " match recorded size of {}".format(location.path, size,
                                                                   storedFileInfo.file_size))

        formatter = getInfo.formatter
        try:
            result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(f"Failure from formatter '{formatter.name()}' for Dataset {ref.id}") from e

        return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams)

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.

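        Examples
        --------
        A put-then-check sketch; ``inMemoryDataset`` and ``ref`` here are
        placeholders for a real object and its `DatasetRef`:

        >>> datastore.put(inMemoryDataset, ref)  # doctest: +SKIP
        >>> datastore.exists(ref)  # doctest: +SKIP
        True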
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        storageDir = os.path.dirname(location.path)
        if not os.path.isdir(storageDir):
            # Never try to remove this after creating it since there might
            # be a butler ingest process running concurrently that will
            # already think this directory exists.
            safeMakeDir(storageDir)

        # Write the file
        predictedFullPath = os.path.join(self.root, formatter.predictPath())

        if os.path.exists(predictedFullPath):
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {predictedFullPath} already exists")

        def _removeFileExists(path):
            """Remove a file and do not complain if it is not there.

            This is important since a formatter might fail before the file
            is written and we should not confuse people by writing spurious
            error messages to the log.
            """
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

        formatter_exception = None
        with self._transaction.undoWith("write", _removeFileExists, predictedFullPath):
            try:
                path = formatter.write(inMemoryDataset)
                log.debug("Wrote file to %s", path)
            except Exception as e:
                formatter_exception = e

        if formatter_exception:
            raise formatter_exception

        assert predictedFullPath == os.path.join(self.root, path)

        info = self._extractIngestInfo(path, ref, formatter=formatter)
        self._register_datasets([(ref, info)])

    def _overrideTransferMode(self, *datasets: FileDataset,
                              transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class
        if transfer != "auto":
            return transfer

        # See if the paths are within the datastore or not
        inside = [self._pathInStore(d.path) is not None for d in datasets]
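        # For example, inside == [True, True] means every path is already
        # under the datastore root, while [False, False] means none are.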

        if all(inside):
            transfer = None
        elif not any(inside):
            transfer = "link"
        else:
            raise ValueError("Some datasets are inside the datastore and some are outside."
                             " Please use an explicit transfer mode and not 'auto'.")

        return transfer

    def _pathInStore(self, path: str) -> Optional[str]:
        """Return path relative to datastore root.

        Parameters
        ----------
        path : `str`
            Path to dataset. Can be an absolute path.

        Returns
        -------
        inStore : `str`
            Path relative to datastore root. Returns `None` if the file is
            outside the root.

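        Examples
        --------
        A sketch assuming a datastore rooted at ``/repo``; ``store`` is a
        hypothetical instance:

        >>> store._pathInStore("/repo/a/b.fits")  # doctest: +SKIP
        'a/b.fits'
        >>> store._pathInStore("/elsewhere/b.fits") is None  # doctest: +SKIP
        True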
        """
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                return None
            return os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            return None
        return path

    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        # Docstring inherited from FileLikeDatastore._standardizeIngestPath.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if not os.path.exists(fullPath):
            raise FileNotFoundError(f"File at '{fullPath}' does not exist; note that paths to ingest "
                                    "are assumed to be relative to self.root unless they are absolute.")
        if transfer is None:
            # Keep the original path for the error message; _pathInStore
            # returns None for paths outside the datastore root.
            pathInStore = self._pathInStore(path)
            if pathInStore is None:
                raise RuntimeError(f"'{path}' is not inside repository root '{self.root}'.")
            path = pathInStore
        return path

    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        # Docstring inherited from FileLikeDatastore._extractIngestInfo.
        fullPath = os.path.normpath(os.path.join(self.root, path))
        if transfer is not None:
            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            newPath = formatter.predictPathFromLocation(location)
            newFullPath = os.path.join(self.root, newPath)
            if os.path.exists(newFullPath):
                raise FileExistsError(f"File '{newFullPath}' already exists.")
            storageDir = os.path.dirname(newFullPath)
            if not os.path.isdir(storageDir):
                # Do not attempt to reverse directory creation
                # because of race conditions with other processes running
                # ingest in parallel.
                safeMakeDir(storageDir)
            if transfer == "move":
                with self._transaction.undoWith("move", shutil.move, newFullPath, fullPath):
                    shutil.move(fullPath, newFullPath)
            elif transfer == "copy":
                with self._transaction.undoWith("copy", os.remove, newFullPath):
                    shutil.copy(fullPath, newFullPath)
            elif transfer == "link":
                with self._transaction.undoWith("link", os.unlink, newFullPath):
                    realPath = os.path.realpath(fullPath)
                    # Try hard link and if that fails use a symlink
                    try:
                        os.link(realPath, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(realPath, newFullPath)
            elif transfer == "hardlink":
                with self._transaction.undoWith("hardlink", os.unlink, newFullPath):
                    os.link(os.path.realpath(fullPath), newFullPath)
            elif transfer == "symlink":
                with self._transaction.undoWith("symlink", os.unlink, newFullPath):
                    # Read through existing symlinks
                    os.symlink(os.path.realpath(fullPath), newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                fullPath = os.path.realpath(fullPath)

                # os.path.relpath needs the containing directory as the
                # start point; passing the full file path would add a
                # spurious extra "../" to the result.
                newFullPathRoot, _ = os.path.split(newFullPath)
                relPath = os.path.relpath(fullPath, newFullPathRoot)
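                # For example, relpath("/data/raw/f.fits", "/repo/sub")
                # yields "../../data/raw/f.fits".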

                with self._transaction.undoWith("relsymlink", os.unlink, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError("Transfer type '{}' not supported.".format(transfer))
            path = newPath
            fullPath = newFullPath
        if self.useChecksum:
            checksum = self.computeChecksum(fullPath)
        else:
            checksum = None
        stat = os.stat(fullPath)
        size = stat.st_size
        return StoredFileInfo(formatter=formatter, path=path, storageClass=ref.datasetType.storageClass,
                              file_size=size, checksum=checksum)

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        .. warning::

            This method does not support transactions; removals are
            immediate, cannot be undone, and are not guaranteed to
            be atomic if deleting either the file or the internal
            database records fails.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        location, _ = self._get_dataset_location_info(ref)
        if location is None:
            raise FileNotFoundError(f"Requested dataset ({ref}) does not exist")

        if not os.path.exists(location.path):
            raise FileNotFoundError(f"No such file: {location.uri}")

        if self._can_remove_dataset_artifact(ref):
            # Only reference to this path so we can remove it
            os.remove(location.path)

        # Remove rows from registries
        self._remove_from_registry(ref)

    @staticmethod
    def computeChecksum(filename, algorithm="blake2b", block_size=8192):
        """Compute the checksum of the supplied file.

        Parameters
        ----------
        filename : `str`
            Name of file to calculate checksum from.
        algorithm : `str`, optional
            Name of algorithm to use. Must be one of the algorithms supported
            by :py:mod:`hashlib`.
        block_size : `int`, optional
            Number of bytes to read from file at one time.

        Returns
        -------
        hexdigest : `str`
            Hex digest of the file.

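        Examples
        --------
        A minimal sketch; the file path and algorithm choice here are
        purely illustrative:

        >>> PosixDatastore.computeChecksum("/tmp/example.fits",
        ...                                algorithm="md5")  # doctest: +SKIP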
        """
        if algorithm not in hashlib.algorithms_guaranteed:
            raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm))

        hasher = hashlib.new(algorithm)

        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(block_size), b""):
                hasher.update(chunk)

        return hasher.hexdigest()

    def export(self, refs: Iterable[DatasetRef], *,
               directory: Optional[str] = None, transfer: Optional[str] = None) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
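        # This method is a generator; callers typically iterate over the
        # yielded FileDataset entries, e.g. ``for fd in datastore.export(refs)``.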

        for ref in refs:
            location, storedFileInfo = self._get_dataset_location_info(ref)
            if location is None:
                raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")
            if transfer is None:
                # TODO: do we also need to return the readStorageClass somehow?
                yield FileDataset(refs=[ref], path=location.pathInStore, formatter=storedFileInfo.formatter)
            else:
                # TODO: add support for other transfer modes. If we support
                # moving, this method should become transactional.
                raise NotImplementedError(f"Transfer mode '{transfer}' not yet supported.")