Coverage for python/lsst/resources/file.py: 95%

196 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-09 11:30 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("FileResourcePath",) 

15 

16import contextlib 

17import copy 

18import logging 

19import os 

20import os.path 

21import posixpath 

22import re 

23import shutil 

24import urllib.parse 

25from collections.abc import Iterator 

26from typing import IO, TYPE_CHECKING 

27 

28from ._resourceHandles._fileResourceHandle import FileResourceHandle 

29from ._resourcePath import ResourcePath 

30from .utils import NoTransaction, ensure_directory_is_writeable, os2posix, posix2os 

31 

32if TYPE_CHECKING: 

33 from .utils import TransactionProtocol 

34 

35 

36log = logging.getLogger(__name__) 

37 

38 

class FileResourcePath(ResourcePath):
    """Path for explicit ``file`` URI scheme."""

    transferModes = ("copy", "link", "symlink", "hardlink", "relsymlink", "auto", "move")
    transferDefault: str = "link"

    # By definition refers to a local file
    isLocal = True

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS.

        Will unquote URI path since a formal URI must include the quoting.
        """
        return urllib.parse.unquote(posix2os(self._uri.path))

    def exists(self) -> bool:
        """Indicate that the file exists."""
        # Uses os.path.exists so if there is a soft link that points
        # to a file that no longer exists this will return False
        return os.path.exists(self.ospath)

    def size(self) -> int:
        """Return the size of the file in bytes.

        Returns
        -------
        sz : `int`
            Size reported by `os.stat`, or 0 if the path is a directory.
        """
        if not os.path.isdir(self.ospath):
            stat = os.stat(self.ospath)
            sz = stat.st_size
        else:
            sz = 0
        return sz

    def remove(self) -> None:
        """Remove the resource."""
        os.remove(self.ospath)

    def _as_local(self) -> tuple[str, bool]:
        """Return the local path of the file.

        This is an internal helper for ``as_local()``.

        Returns
        -------
        path : `str`
            The local path to this file.
        temporary : `bool`
            Always returns the temporary nature of the input file resource.
        """
        return self.ospath, self.isTemporary

    def read(self, size: int = -1) -> bytes:
        """Read the contents of the file as bytes.

        Parameters
        ----------
        size : `int`, optional
            Number of bytes to read. The default of -1 reads the whole
            file.

        Returns
        -------
        data : `bytes`
            The bytes read from the file.
        """
        with open(self.ospath, "rb") as fh:
            return fh.read(size)

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the file.

        Parameters
        ----------
        data : `bytes`
            Bytes to write.
        overwrite : `bool`, optional
            If `True` an existing file is replaced. If `False` the file is
            opened in exclusive-creation mode, so `FileExistsError` is
            raised if it already exists.
        """
        # Do not shadow the ``dir`` builtin.
        parent_dir = os.path.dirname(self.ospath)
        if not os.path.exists(parent_dir):
            _create_directories(parent_dir)
        mode = "wb" if overwrite else "xb"
        with open(self.ospath, mode) as f:
            f.write(data)

    def mkdir(self) -> None:
        """Make the directory associated with this URI.

        An attempt will be made to create the directory even if the URI
        looks like a file.

        Raises
        ------
        NotADirectoryError:
            Raised if a non-directory already exists.
        """
        try:
            _create_directories(self.ospath)
        except FileExistsError:
            raise NotADirectoryError(f"{self.ospath} exists but is not a directory.") from None

    def isdir(self) -> bool:
        """Return whether this URI is a directory.

        Returns
        -------
        isdir : `bool`
            `True` if this URI is a directory or looks like a directory,
            else `False`.
        """
        return self.dirLike or os.path.isdir(self.ospath)

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str,
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a local file.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, move, link, symlink, hardlink, relsymlink, auto.
            ``link`` tries a hard link and falls back to a symlink.
            ``auto`` resolves to ``transferDefault`` unless the local view
            of the source is temporary, in which case ``copy`` is used.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            If a transaction is provided, undo actions will be registered.

        Raises
        ------
        ValueError
            Raised if the transfer mode is not in ``transferModes``.
        FileNotFoundError
            Raised if the local view of the source does not exist.
        RuntimeError
            Raised if a link-based mode is requested for a source that had
            to be downloaded to a temporary local file.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks can take time so only try if the log message
        # will be issued.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # We do not have to special case FileResourcePath here because
        # as_local handles that.
        with src.as_local() as local_uri:
            is_temporary = local_uri.isTemporary
            local_src = local_uri.ospath

            # Short circuit if the URIs are identical immediately.
            if self == local_uri:
                log.debug(
                    "Target and destination URIs are identical: %s, returning immediately."
                    " No further action required.",
                    self,
                )
                return

            # Default transfer mode depends on whether we have a temporary
            # file or not.
            if transfer == "auto":
                transfer = self.transferDefault if not is_temporary else "copy"

            if not os.path.exists(local_src):
                if is_temporary:
                    if src == local_uri:
                        msg = f"Local temporary file {src} has gone missing."
                    else:
                        # This will not happen in normal scenarios.
                        msg = f"Local file {local_uri} downloaded from {src} has gone missing"
                else:
                    msg = f"Source URI {src} does not exist"
                raise FileNotFoundError(msg)

            # Follow soft links
            local_src = os.path.realpath(os.path.normpath(local_src))

            # Creating a symlink to a local copy of a remote resource
            # should never work. Creating a hardlink will work but should
            # not be allowed since it is highly unlikely that this is ever
            # an intended option and depends on the local target being
            # on the same file system as was used for the temporary file
            # download.
            # If a symlink is being requested for a local temporary file
            # that is likely undesirable but should not be refused.
            if is_temporary and src != local_uri and "link" in transfer:
                raise RuntimeError(
                    f"Can not use local file system transfer mode {transfer} for remote resource ({src})"
                )
            elif is_temporary and src == local_uri and "symlink" in transfer:
                log.debug(
                    "Using a symlink for a temporary resource may lead to unexpected downstream failures."
                )

            # For temporary files we can own them if we created it.
            requested_transfer = transfer
            if src != local_uri and is_temporary and transfer == "copy":
                transfer = "move"

            # The output location should not exist unless overwrite=True.
            # Rather than use `exists()`, use os.stat since we might need
            # the full answer later.
            dest_stat: os.stat_result | None
            try:
                # Do not read through links of the file itself.
                dest_stat = os.lstat(self.ospath)
            except FileNotFoundError:
                dest_stat = None

            # It is possible that the source URI and target URI refer
            # to the same file. This can happen for a number of reasons
            # (such as soft links in the path, or they really are the same).
            # In that case log a message and return as if the transfer
            # completed (it technically did). A temporary file download
            # can't be the same so the test can be skipped.
            if dest_stat and not is_temporary:
                # Be consistent and use lstat here (even though realpath
                # has been called). It does not harm.
                local_src_stat = os.lstat(local_src)
                if dest_stat.st_ino == local_src_stat.st_ino and dest_stat.st_dev == local_src_stat.st_dev:
                    log.debug(
                        "Destination URI %s is the same file as source URI %s, returning immediately."
                        " No further action required.",
                        self,
                        local_uri,
                    )
                    return

            if not overwrite and dest_stat:
                raise FileExistsError(
                    f"Destination path '{self}' already exists. Transfer from {src} cannot be completed."
                )

            # Make the path absolute (but don't follow links since that
            # would possibly cause us to end up in the wrong place if the
            # file existed already as a soft link)
            newFullPath = os.path.abspath(self.ospath)
            outputDir = os.path.dirname(newFullPath)
            if not os.path.isdir(outputDir):
                # Must create the directory -- this can not be rolled back
                # since another transfer running concurrently may
                # be relying on this existing.
                _create_directories(outputDir)

            if transaction is None:
                # Use a no-op transaction to reduce code duplication
                transaction = NoTransaction()

            # For links the OS doesn't let us overwrite so if something does
            # exist we have to remove it before we do the actual "transfer"
            # below
            if "link" in transfer and overwrite and dest_stat:
                with contextlib.suppress(Exception):
                    # If this fails we ignore it since it's a problem
                    # that will manifest immediately below with a more relevant
                    # error message
                    self.remove()

            if transfer == "move":
                # If a rename works we try that since that is guaranteed to
                # be atomic. If that fails we copy and rename. We do this
                # in case other processes are trying to move to the same
                # file and we want the "winner" to not be corrupted.
                try:
                    with transaction.undoWith(f"move from {local_src}", os.rename, newFullPath, local_src):
                        os.rename(local_src, newFullPath)
                except OSError:
                    with self.temporary_uri(prefix=self.parent(), suffix=self.getExtension()) as temp_copy:
                        shutil.copy(local_src, temp_copy.ospath)
                        with transaction.undoWith(
                            f"move from {local_src}",
                            shutil.move,
                            newFullPath,
                            local_src,
                            copy_function=shutil.copy,
                        ):
                            os.rename(temp_copy.ospath, newFullPath)
                        os.remove(local_src)
            elif transfer == "copy":
                # We want atomic copies so first copy to a temp location in
                # the same output directory. This at least guarantees that
                # if multiple processes are writing to the same file
                # simultaneously the file we end up with will not be corrupt.
                with self.temporary_uri(prefix=self.parent(), suffix=self.getExtension()) as temp_copy:
                    shutil.copy(local_src, temp_copy.ospath)
                    with transaction.undoWith(f"copy from {local_src}", os.remove, newFullPath):
                        # os.rename works even if the file exists.
                        # It's possible that another process has copied a file
                        # in whilst this one was copying. If overwrite
                        # protection is needed then another stat() call should
                        # happen here.
                        os.rename(temp_copy.ospath, newFullPath)
            elif transfer == "link":
                # Try hard link and if that fails use a symlink
                with transaction.undoWith(f"link to {local_src}", os.remove, newFullPath):
                    try:
                        os.link(local_src, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(local_src, newFullPath)
            elif transfer == "hardlink":
                with transaction.undoWith(f"hardlink to {local_src}", os.remove, newFullPath):
                    os.link(local_src, newFullPath)
            elif transfer == "symlink":
                # Read through existing symlinks
                with transaction.undoWith(f"symlink to {local_src}", os.remove, newFullPath):
                    os.symlink(local_src, newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot = os.path.dirname(newFullPath)
                relPath = os.path.relpath(local_src, newFullPathRoot)
                with transaction.undoWith(f"relsymlink to {local_src}", os.remove, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError(f"Transfer type '{transfer}' not supported.")

        # This was an explicit move requested from a remote resource
        # try to remove that remote resource. We check is_temporary because
        # the local file would have been moved by shutil.move already.
        if requested_transfer == "move" and is_temporary and src != local_uri:
            # Transactions do not work here
            src.remove()

    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not a directory.
        """
        if not self.isdir():
            raise ValueError("Can not walk a non-directory URI")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        # Symbolic links to directories are followed.
        for root, dirs, files in os.walk(self.ospath, followlinks=True):
            # Filter by the regex
            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]
            yield type(self)(root, forceAbsolute=False, forceDirectory=True), dirs, files

    @classmethod
    def _fixupPathUri(
        cls,
        parsed: urllib.parse.ParseResult,
        root: ResourcePath | None = None,
        forceAbsolute: bool = False,
        forceDirectory: bool = False,
    ) -> tuple[urllib.parse.ParseResult, bool]:
        """Fix up relative paths in URI instances.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `ResourcePath`, optional
            Path to use as root when converting relative to absolute.
            If `None`, it will be the current working directory. It is only
            used if a file-scheme is used incorrectly with a relative path.
        forceAbsolute : `bool`, ignored
            Has no effect for this subclass. ``file`` URIs are always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator or
            forceDirectory is True. Otherwise `False`.

        Raises
        ------
        RuntimeError
            Raised if ``root`` has a scheme other than ``file``.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used. This is
        always done regardless of the ``forceAbsolute`` parameter.
        """
        # assume we are not dealing with a directory like URI
        dirLike = False

        # file URI implies POSIX path separators so split as POSIX,
        # then join as os, and convert to abspath. Do not handle
        # home directories since "file" scheme is explicitly documented
        # to not do tilde expansion.
        sep = posixpath.sep

        # For local file system we can explicitly check to see if this
        # really is a directory. The URI might point to a location that
        # does not exists yet but all that matters is if it is a directory
        # then we make sure use that fact. No need to do the check if
        # we are already being told.
        # The URI path is percent-encoded, so decode it before asking the
        # file system; otherwise a directory whose name needs quoting
        # (e.g. one containing a space) is never recognised.
        if not forceDirectory and posixpath.isdir(urllib.parse.unquote(parsed.path)):
            forceDirectory = True

        # For an absolute path all we need to do is check if we need
        # to force the directory separator
        if posixpath.isabs(parsed.path):
            if forceDirectory:
                if not parsed.path.endswith(sep):
                    parsed = parsed._replace(path=parsed.path + sep)
                dirLike = True
            return copy.copy(parsed), dirLike

        # Relative path so must fix it to be compliant with the standard

        # Replacement values for the URI
        replacements = {}

        if root is None:
            root_str = os.path.abspath(os.path.curdir)
        else:
            if root.scheme and root.scheme != "file":
                raise RuntimeError(f"The override root must be a file URI not {root.scheme}")
            root_str = os.path.abspath(root.ospath)

        replacements["path"] = posixpath.normpath(posixpath.join(os2posix(root_str), parsed.path))

        # normpath strips trailing "/" so put it back if necessary
        # Acknowledge that trailing separator exists.
        if forceDirectory or (parsed.path.endswith(sep) and not replacements["path"].endswith(sep)):
            replacements["path"] += sep
            dirLike = True

        # ParseResult is a NamedTuple so _replace is standard API
        parsed = parsed._replace(**replacements)

        if parsed.params or parsed.query:
            log.warning("Additional items unexpectedly encountered in file URI: %s", parsed.geturl())

        return parsed, dirLike

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[IO]:
        """Open the local file and yield the resulting handle.

        Parameters
        ----------
        mode : `str`, optional
            Mode passed through to `FileResourceHandle`.
        encoding : `str` or `None`, optional
            Encoding passed through to `FileResourceHandle`.

        Yields
        ------
        buffer : `IO`
            The handle produced by `FileResourceHandle` for ``ospath``.
        """
        with FileResourceHandle(mode=mode, log=log, filename=self.ospath, encoding=encoding) as buffer:
            yield buffer  # type: ignore

481 

482 

483def _create_directories(name: str | bytes) -> None: 

484 """Create a directory and all of its parent directories that don't yet 

485 exist. 

486 

487 Parameters 

488 ---------- 

489 name : `str` or `bytes` 

490 Path to the directory to be created 

491 

492 Notes 

493 ----- 

494 The code in this function is duplicated from the Python standard library 

495 function os.makedirs with one change: if the user has set a process umask 

496 that prevents us from creating/accessing files in the newly created 

497 directories, the permissions of the directories are altered to allow 

498 owner-write and owner-traverse so that they can be used. 

499 """ 

500 # These are optional parameters in the original function, but they can be 

501 # constant here. 

502 mode = 0o777 

503 exist_ok = True 

504 

505 head, tail = os.path.split(name) 

506 if not tail: 

507 head, tail = os.path.split(head) 

508 if head and tail and not os.path.exists(head): 

509 try: 

510 _create_directories(head) 

511 except FileExistsError: 

512 # Defeats race condition when another thread created the path 

513 pass 

514 cdir: str | bytes = os.curdir 

515 if isinstance(tail, bytes): 515 ↛ 516line 515 didn't jump to line 516, because the condition on line 515 was never true

516 cdir = bytes(os.curdir, "ASCII") 

517 if tail == cdir: # xxx/newdir/. exists if xxx/newdir exists 517 ↛ 518line 517 didn't jump to line 518, because the condition on line 517 was never true

518 return 

519 try: 

520 os.mkdir(name, mode) 

521 # This is the portion that is modified relative to the standard library 

522 # version of the function. 

523 ensure_directory_is_writeable(name) 

524 # end modified portion 

525 except OSError: 

526 # Cannot rely on checking for EEXIST, since the operating system 

527 # could give priority to other errors like EACCES or EROFS 

528 if not exist_ok or not os.path.isdir(name): 

529 raise