Coverage for python/lsst/resources/file.py: 91%

198 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-16 02:50 -0700

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("FileResourcePath",) 

15 

16import contextlib 

17import copy 

18import logging 

19import os 

20import os.path 

21import posixpath 

22import re 

23import shutil 

24import urllib.parse 

25from collections.abc import Iterator 

26from typing import IO, TYPE_CHECKING 

27 

28from ._resourceHandles._fileResourceHandle import FileResourceHandle 

29from ._resourcePath import ResourcePath 

30from .utils import NoTransaction, ensure_directory_is_writeable, os2posix, posix2os 

31 

32if TYPE_CHECKING: 

33 from .utils import TransactionProtocol 

34 

35 

36log = logging.getLogger(__name__) 

37 

38 

class FileResourcePath(ResourcePath):
    """Path for explicit ``file`` URI scheme."""

    # Transfer modes accepted by `transfer_from`.
    transferModes = ("copy", "link", "symlink", "hardlink", "relsymlink", "auto", "move")
    # Mode substituted for "auto" when the source is not a temporary file.
    transferDefault: str = "link"

    # By definition refers to a local file
    isLocal = True

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS.

        Will unquote URI path since a formal URI must include the quoting.
        """
        return urllib.parse.unquote(posix2os(self._uri.path))

    def exists(self) -> bool:
        """Indicate that the file exists."""
        # Uses os.path.exists so if there is a soft link that points
        # to a file that no longer exists this will return False
        return os.path.exists(self.ospath)

    def size(self) -> int:
        """Return the size of the file in bytes.

        Returns
        -------
        sz : `int`
            Size reported by `os.stat`, or 0 if the path is a directory.
        """
        if not os.path.isdir(self.ospath):
            stat = os.stat(self.ospath)
            sz = stat.st_size
        else:
            sz = 0
        return sz

    def remove(self) -> None:
        """Remove the resource."""
        os.remove(self.ospath)

    def _as_local(self) -> tuple[str, bool]:
        """Return the local path of the file.

        This is an internal helper for ``as_local()``.

        Returns
        -------
        path : `str`
            The local path to this file.
        temporary : `bool`
            Always returns the temporary nature of the input file resource.
        """
        return self.ospath, self.isTemporary

    def read(self, size: int = -1) -> bytes:
        """Read bytes from the file.

        Parameters
        ----------
        size : `int`, optional
            Maximum number of bytes to read. The default of -1 reads the
            whole file.

        Returns
        -------
        data : `bytes`
            The bytes read.
        """
        with open(self.ospath, "rb") as fh:
            return fh.read(size)

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied bytes to the file.

        Missing parent directories are created first.

        Parameters
        ----------
        data : `bytes`
            Bytes to write.
        overwrite : `bool`, optional
            If `True` an existing file is replaced. If `False` the file is
            opened in exclusive-creation mode, so `FileExistsError` is
            raised when it already exists.
        """
        parent_dir = os.path.dirname(self.ospath)
        if not os.path.exists(parent_dir):
            _create_directories(parent_dir)
        mode = "wb" if overwrite else "xb"
        with open(self.ospath, mode) as f:
            f.write(data)

    def mkdir(self) -> None:
        """Make the directory associated with this URI.

        An attempt will be made to create the directory even if the URI
        looks like a file.

        Raises
        ------
        NotADirectoryError:
            Raised if a non-directory already exists.
        """
        try:
            _create_directories(self.ospath)
        except FileExistsError:
            raise NotADirectoryError(f"{self.ospath} exists but is not a directory.") from None

    def isdir(self) -> bool:
        """Return whether this URI is a directory.

        Returns
        -------
        isdir : `bool`
            `True` if this URI is a directory or looks like a directory,
            else `False`.
        """
        if self.dirLike is None:
            # Cache state for next time.
            self.dirLike = os.path.isdir(self.ospath)
        return self.dirLike

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str,
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the current resource to a local file.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, link, symlink, hardlink, relsymlink, auto, move.
            ``auto`` resolves to ``transferDefault`` for a non-temporary
            source and to ``copy`` for a temporary one.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            If a transaction is provided, undo actions will be registered.

        Raises
        ------
        ValueError
            Raised if ``transfer`` is not one of ``transferModes``.
        FileNotFoundError
            Raised if the local form of the source does not exist.
        RuntimeError
            Raised if a link-based mode is requested for a source that had
            to be downloaded to a temporary local file.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if the resolved transfer mode has no implementation.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks can take time so only try if the log message
        # will be issued.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # We do not have to special case FileResourcePath here because
        # as_local handles that.
        with src.as_local() as local_uri:
            is_temporary = local_uri.isTemporary
            local_src = local_uri.ospath

            # Short circuit if the URIs are identical immediately.
            if self == local_uri:
                log.debug(
                    "Target and destination URIs are identical: %s, returning immediately."
                    " No further action required.",
                    self,
                )
                return

            # Default transfer mode depends on whether we have a temporary
            # file or not.
            if transfer == "auto":
                transfer = self.transferDefault if not is_temporary else "copy"

            if not os.path.exists(local_src):
                if is_temporary:
                    if src == local_uri:
                        msg = f"Local temporary file {src} has gone missing."
                    else:
                        # This will not happen in normal scenarios.
                        msg = f"Local file {local_uri} downloaded from {src} has gone missing"
                else:
                    msg = f"Source URI {src} does not exist"
                raise FileNotFoundError(msg)

            # Follow soft links
            local_src = os.path.realpath(os.path.normpath(local_src))

            # Creating a symlink to a local copy of a remote resource
            # should never work. Creating a hardlink will work but should
            # not be allowed since it is highly unlikely that this is ever
            # an intended option and depends on the local target being
            # on the same file system as was used for the temporary file
            # download.
            # If a symlink is being requested for a local temporary file
            # that is likely undesirable but should not be refused.
            if is_temporary and src != local_uri and "link" in transfer:
                raise RuntimeError(
                    f"Can not use local file system transfer mode {transfer} for remote resource ({src})"
                )
            elif is_temporary and src == local_uri and "symlink" in transfer:
                log.debug(
                    "Using a symlink for a temporary resource may lead to unexpected downstream failures."
                )

            # For temporary files we can own them if we created it.
            requested_transfer = transfer
            if src != local_uri and is_temporary and transfer == "copy":
                transfer = "move"

            # The output location should not exist unless overwrite=True.
            # Rather than use `exists()`, use os.stat since we might need
            # the full answer later.
            dest_stat: os.stat_result | None
            try:
                # Do not read through links of the file itself.
                dest_stat = os.lstat(self.ospath)
            except FileNotFoundError:
                dest_stat = None

            # It is possible that the source URI and target URI refer
            # to the same file. This can happen for a number of reasons
            # (such as soft links in the path, or they really are the same).
            # In that case log a message and return as if the transfer
            # completed (it technically did). A temporary file download
            # can't be the same so the test can be skipped.
            if dest_stat and not is_temporary:
                # Be consistent and use lstat here (even though realpath
                # has been called). It does not harm.
                local_src_stat = os.lstat(local_src)
                if dest_stat.st_ino == local_src_stat.st_ino and dest_stat.st_dev == local_src_stat.st_dev:
                    log.debug(
                        "Destination URI %s is the same file as source URI %s, returning immediately."
                        " No further action required.",
                        self,
                        local_uri,
                    )
                    return

            if not overwrite and dest_stat:
                raise FileExistsError(
                    f"Destination path '{self}' already exists. Transfer from {src} cannot be completed."
                )

            # Make the path absolute (but don't follow links since that
            # would possibly cause us to end up in the wrong place if the
            # file existed already as a soft link)
            newFullPath = os.path.abspath(self.ospath)
            outputDir = os.path.dirname(newFullPath)
            if not os.path.isdir(outputDir):
                # Must create the directory -- this can not be rolled back
                # since another transfer running concurrently may
                # be relying on this existing.
                _create_directories(outputDir)

            if transaction is None:
                # Use a no-op transaction to reduce code duplication
                transaction = NoTransaction()

            # For links the OS doesn't let us overwrite so if something does
            # exist we have to remove it before we do the actual "transfer"
            # below
            if "link" in transfer and overwrite and dest_stat:
                with contextlib.suppress(Exception):
                    # If this fails we ignore it since it's a problem
                    # that will manifest immediately below with a more relevant
                    # error message
                    self.remove()

            if transfer == "move":
                # If a rename works we try that since that is guaranteed to
                # be atomic. If that fails we copy and rename. We do this
                # in case other processes are trying to move to the same
                # file and we want the "winner" to not be corrupted.
                try:
                    with transaction.undoWith(f"move from {local_src}", os.rename, newFullPath, local_src):
                        os.rename(local_src, newFullPath)
                except OSError:
                    with self.temporary_uri(prefix=self.parent(), suffix=self.getExtension()) as temp_copy:
                        shutil.copy(local_src, temp_copy.ospath)
                        with transaction.undoWith(
                            f"move from {local_src}",
                            shutil.move,
                            newFullPath,
                            local_src,
                            copy_function=shutil.copy,
                        ):
                            os.rename(temp_copy.ospath, newFullPath)
                    os.remove(local_src)
            elif transfer == "copy":
                # We want atomic copies so first copy to a temp location in
                # the same output directory. This at least guarantees that
                # if multiple processes are writing to the same file
                # simultaneously the file we end up with will not be corrupt.
                with self.temporary_uri(prefix=self.parent(), suffix=self.getExtension()) as temp_copy:
                    shutil.copy(local_src, temp_copy.ospath)
                    with transaction.undoWith(f"copy from {local_src}", os.remove, newFullPath):
                        # os.rename works even if the file exists.
                        # It's possible that another process has copied a file
                        # in whilst this one was copying. If overwrite
                        # protection is needed then another stat() call should
                        # happen here.
                        os.rename(temp_copy.ospath, newFullPath)
            elif transfer == "link":
                # Try hard link and if that fails use a symlink
                with transaction.undoWith(f"link to {local_src}", os.remove, newFullPath):
                    try:
                        os.link(local_src, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(local_src, newFullPath)
            elif transfer == "hardlink":
                with transaction.undoWith(f"hardlink to {local_src}", os.remove, newFullPath):
                    os.link(local_src, newFullPath)
            elif transfer == "symlink":
                # Read through existing symlinks
                with transaction.undoWith(f"symlink to {local_src}", os.remove, newFullPath):
                    os.symlink(local_src, newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot = os.path.dirname(newFullPath)
                relPath = os.path.relpath(local_src, newFullPathRoot)
                with transaction.undoWith(f"relsymlink to {local_src}", os.remove, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError(f"Transfer type '{transfer}' not supported.")

        # This was an explicit move requested from a remote resource
        # try to remove that remote resource. We check is_temporary because
        # the local file would have been moved by shutil.move already.
        if requested_transfer == "move" and is_temporary and src != local_uri:
            # Transactions do not work here
            src.remove()

    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not a directory.
        """
        if not self.isdir():
            raise ValueError("Can not walk a non-directory URI")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        for root, dirs, files in os.walk(self.ospath, followlinks=True):
            # Filter by the regex
            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]
            yield type(self)(root, forceAbsolute=False, forceDirectory=True), dirs, files

    @classmethod
    def _fixupPathUri(
        cls,
        parsed: urllib.parse.ParseResult,
        root: ResourcePath | None = None,
        forceAbsolute: bool = False,
        forceDirectory: bool | None = None,
    ) -> tuple[urllib.parse.ParseResult, bool | None]:
        """Fix up relative paths in URI instances.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `ResourcePath`, optional
            Path to use as root when converting relative to absolute.
            If `None`, it will be the current working directory. It is only
            used if a file-scheme is used incorrectly with a relative path.
        forceAbsolute : `bool`, ignored
            Has no effect for this subclass. ``file`` URIs are always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool` or `None`
            `True` if given parsed URI has a trailing separator or
            ``forceDirectory`` is `True`. Otherwise can return the given
            value of ``forceDirectory``.

        Raises
        ------
        ValueError
            Raised if the path ends with a separator but ``forceDirectory``
            is `False`.
        RuntimeError
            Raised if ``root`` has a scheme other than ``file``.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used. This is
        always done regardless of the ``forceAbsolute`` parameter.
        """
        # assume we are not dealing with a directory like URI
        dirLike = forceDirectory

        # file URI implies POSIX path separators so split as POSIX,
        # then join as os, and convert to abspath. Do not handle
        # home directories since "file" scheme is explicitly documented
        # to not do tilde expansion.
        sep = posixpath.sep
        ends_on_sep = parsed.path.endswith(sep)

        # Consistency check.
        if forceDirectory is False and ends_on_sep:
            raise ValueError(
                f"URI {parsed.geturl()} ends with {sep} but "
                "forceDirectory parameter declares it to be a file."
            )

        # For an absolute path all we need to do is check if we need
        # to force the directory separator
        if posixpath.isabs(parsed.path):
            # A trailing separator marks the URI as directory-like even when
            # forceDirectory was not given, matching the documented return
            # value and the relative-path branch below.
            if forceDirectory or ends_on_sep:
                if not ends_on_sep:
                    parsed = parsed._replace(path=parsed.path + sep)
                dirLike = True
            return copy.copy(parsed), dirLike

        # Relative path so must fix it to be compliant with the standard

        # Replacement values for the URI
        replacements = {}

        if root is None:
            root_str = os.path.abspath(os.path.curdir)
        else:
            if root.scheme and root.scheme != "file":
                raise RuntimeError(f"The override root must be a file URI not {root.scheme}")
            root_str = os.path.abspath(root.ospath)

        replacements["path"] = posixpath.normpath(posixpath.join(os2posix(root_str), parsed.path))

        # normpath strips trailing "/" so put it back if necessary
        # Acknowledge that trailing separator exists.
        if forceDirectory or (ends_on_sep and not replacements["path"].endswith(sep)):
            replacements["path"] += sep
            dirLike = True

        # ParseResult is a NamedTuple so _replace is standard API
        parsed = parsed._replace(**replacements)

        if parsed.params or parsed.query:
            log.warning("Additional items unexpectedly encountered in file URI: %s", parsed.geturl())

        return parsed, dirLike

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[IO]:
        """Open the local file through a `FileResourceHandle`.

        Parameters
        ----------
        mode : `str`, optional
            Open mode passed to the handle.
        encoding : `str` or `None`, optional
            Text encoding passed to the handle.

        Yields
        ------
        buffer : `IO`
            The open handle; it is closed when the context exits.
        """
        with FileResourceHandle(mode=mode, log=log, filename=self.ospath, encoding=encoding) as buffer:
            yield buffer  # type: ignore

484 

485 

def _create_directories(name: str | bytes) -> None:
    """Create a directory together with any missing ancestors.

    Parameters
    ----------
    name : `str` or `bytes`
        Path to the directory to be created

    Notes
    -----
    This mirrors the standard library's ``os.makedirs`` called with
    ``mode=0o777`` and ``exist_ok=True``, with one addition: after each
    directory is made, ``ensure_directory_is_writeable`` is called on it so
    that a restrictive process umask does not leave us with directories we
    cannot write to or traverse.
    """
    parent, leaf = os.path.split(name)
    if not leaf:
        # The path ended in a separator; split again to find the real leaf.
        parent, leaf = os.path.split(parent)

    if parent and leaf and not os.path.exists(parent):
        # Another thread or process may create the parent first; that is fine.
        with contextlib.suppress(FileExistsError):
            _create_directories(parent)
        here = bytes(os.curdir, "ASCII") if isinstance(leaf, bytes) else os.curdir
        if leaf == here:
            # xxx/newdir/. exists if xxx/newdir exists
            return

    try:
        os.mkdir(name, 0o777)
        # This is the portion that is modified relative to the standard
        # library version of the function.
        ensure_directory_is_writeable(name)
    except OSError:
        # Cannot rely on checking for EEXIST, since the operating system
        # could give priority to other errors like EACCES or EROFS.
        # An existing directory is acceptable; anything else is re-raised.
        if not os.path.isdir(name):
            raise