Coverage for python/lsst/resources/file.py: 95%

196 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-30 11:34 +0000

1# This file is part of lsst-resources. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# Use of this source code is governed by a 3-clause BSD-style 

10# license that can be found in the LICENSE file. 

11 

12from __future__ import annotations 

13 

14__all__ = ("FileResourcePath",) 

15 

16import contextlib 

17import copy 

18import logging 

19import os 

20import os.path 

21import posixpath 

22import re 

23import shutil 

24import urllib.parse 

25from collections.abc import Iterator 

26from typing import IO, TYPE_CHECKING 

27 

28from ._resourceHandles._fileResourceHandle import FileResourceHandle 

29from ._resourcePath import ResourcePath 

30from .utils import NoTransaction, ensure_directory_is_writeable, os2posix, posix2os 

31 

32if TYPE_CHECKING: 

33 from .utils import TransactionProtocol 

34 

35 

36log = logging.getLogger(__name__) 

37 

38 

class FileResourcePath(ResourcePath):
    """Path for explicit ``file`` URI scheme."""

    transferModes = ("copy", "link", "symlink", "hardlink", "relsymlink", "auto", "move")
    transferDefault: str = "link"

    # By definition refers to a local file
    isLocal = True

    @property
    def ospath(self) -> str:
        """Path component of the URI localized to current OS.

        Will unquote URI path since a formal URI must include the quoting.
        """
        return urllib.parse.unquote(posix2os(self._uri.path))

    def exists(self) -> bool:
        """Indicate that the file exists.

        Returns
        -------
        exists : `bool`
            `True` if the path exists on the local file system.
        """
        # Uses os.path.exists so if there is a soft link that points
        # to a file that no longer exists this will return False
        return os.path.exists(self.ospath)

    def size(self) -> int:
        """Return the size of the file in bytes.

        Returns
        -------
        sz : `int`
            Size reported by `os.stat`. Directories always report 0.
        """
        if os.path.isdir(self.ospath):
            return 0
        return os.stat(self.ospath).st_size

    def remove(self) -> None:
        """Remove the resource."""
        os.remove(self.ospath)

    def _as_local(self) -> tuple[str, bool]:
        """Return the local path of the file.

        This is an internal helper for ``as_local()``.

        Returns
        -------
        path : `str`
            The local path to this file.
        temporary : `bool`
            Always returns the temporary nature of the input file resource.
        """
        return self.ospath, self.isTemporary

    def read(self, size: int = -1) -> bytes:
        """Read bytes from the start of the file.

        Parameters
        ----------
        size : `int`, optional
            Maximum number of bytes to read. The default of -1 reads the
            entire content of the file.

        Returns
        -------
        data : `bytes`
            The bytes read from the file.
        """
        with open(self.ospath, "rb") as fh:
            return fh.read(size)

    def write(self, data: bytes, overwrite: bool = True) -> None:
        """Write the supplied data to the file.

        Parameters
        ----------
        data : `bytes`
            The bytes to write.
        overwrite : `bool`, optional
            If `True` (default) any existing file is replaced. If `False`
            the file is opened for exclusive creation.

        Raises
        ------
        FileExistsError
            Raised if the file already exists and ``overwrite`` is `False`.
        """
        parent_dir = os.path.dirname(self.ospath)
        if not os.path.exists(parent_dir):
            _create_directories(parent_dir)
        # "xb" is exclusive creation and fails if the file is present.
        mode = "wb" if overwrite else "xb"
        with open(self.ospath, mode) as f:
            f.write(data)

    def mkdir(self) -> None:
        """Make the directory associated with this URI.

        An attempt will be made to create the directory even if the URI
        looks like a file.

        Raises
        ------
        NotADirectoryError
            Raised if a non-directory already exists.
        """
        try:
            _create_directories(self.ospath)
        except FileExistsError:
            raise NotADirectoryError(f"{self.ospath} exists but is not a directory.") from None

    def isdir(self) -> bool:
        """Return whether this URI is a directory.

        Returns
        -------
        isdir : `bool`
            `True` if this URI is a directory or looks like a directory,
            else `False`.
        """
        return self.dirLike or os.path.isdir(self.ospath)

    def transfer_from(
        self,
        src: ResourcePath,
        transfer: str,
        overwrite: bool = False,
        transaction: TransactionProtocol | None = None,
    ) -> None:
        """Transfer the resource at ``src`` to this local file URI.

        Parameters
        ----------
        src : `ResourcePath`
            Source URI.
        transfer : `str`
            Mode to use for transferring the resource. Supports the following
            options: copy, link, symlink, hardlink, relsymlink, auto, move.
            ``auto`` resolves to ``transferDefault`` unless the local view
            of the source is temporary, in which case ``copy`` is used.
        overwrite : `bool`, optional
            Allow an existing file to be overwritten. Defaults to `False`.
        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
            If a transaction is provided, undo actions will be registered.

        Raises
        ------
        ValueError
            Raised if ``transfer`` is not one of ``transferModes``.
        FileNotFoundError
            Raised if the local view of the source does not exist.
        RuntimeError
            Raised if a link-based mode is requested for a remote resource.
        FileExistsError
            Raised if the destination exists and ``overwrite`` is `False`.
        NotImplementedError
            Raised if the resolved transfer mode has no implementation.

        Notes
        -----
        If the source and destination are the same URI, or refer to the
        same file on disk, the method returns without doing anything.
        """
        # Fail early to prevent delays if remote resources are requested
        if transfer not in self.transferModes:
            raise ValueError(f"Transfer mode '{transfer}' not supported by URI scheme {self.scheme}")

        # Existence checks can take time so only try if the log message
        # will be issued.
        if log.isEnabledFor(logging.DEBUG):
            log.debug(
                "Transferring %s [exists: %s] -> %s [exists: %s] (transfer=%s)",
                src,
                src.exists(),
                self,
                self.exists(),
                transfer,
            )

        # We do not have to special case FileResourcePath here because
        # as_local handles that.
        with src.as_local() as local_uri:
            is_temporary = local_uri.isTemporary
            local_src = local_uri.ospath

            # Short circuit if the URIs are identical immediately.
            if self == local_uri:
                log.debug(
                    "Target and destination URIs are identical: %s, returning immediately."
                    " No further action required.",
                    self,
                )
                return

            # Default transfer mode depends on whether we have a temporary
            # file or not.
            if transfer == "auto":
                transfer = self.transferDefault if not is_temporary else "copy"

            if not os.path.exists(local_src):
                if is_temporary:
                    if src == local_uri:
                        msg = f"Local temporary file {src} has gone missing."
                    else:
                        # This will not happen in normal scenarios.
                        msg = f"Local file {local_uri} downloaded from {src} has gone missing"
                else:
                    msg = f"Source URI {src} does not exist"
                raise FileNotFoundError(msg)

            # Follow soft links
            local_src = os.path.realpath(os.path.normpath(local_src))

            # Creating a symlink to a local copy of a remote resource
            # should never work. Creating a hardlink will work but should
            # not be allowed since it is highly unlikely that this is ever
            # an intended option and depends on the local target being
            # on the same file system as was used for the temporary file
            # download.
            # If a symlink is being requested for a local temporary file
            # that is likely undesirable but should not be refused.
            if is_temporary and src != local_uri and "link" in transfer:
                raise RuntimeError(
                    f"Can not use local file system transfer mode {transfer} for remote resource ({src})"
                )
            elif is_temporary and src == local_uri and "symlink" in transfer:
                log.debug(
                    "Using a symlink for a temporary resource may lead to unexpected downstream failures."
                )

            # For temporary files we can own them if we created it.
            # Remember what the caller asked for so that an explicit
            # "move" of a remote resource can be completed at the end.
            requested_transfer = transfer
            if src != local_uri and is_temporary and transfer == "copy":
                transfer = "move"

            # The output location should not exist unless overwrite=True.
            # Rather than use `exists()`, use os.stat since we might need
            # the full answer later.
            dest_stat: os.stat_result | None
            try:
                # Do not read through links of the file itself.
                dest_stat = os.lstat(self.ospath)
            except FileNotFoundError:
                dest_stat = None

            # It is possible that the source URI and target URI refer
            # to the same file. This can happen for a number of reasons
            # (such as soft links in the path, or they really are the same).
            # In that case log a message and return as if the transfer
            # completed (it technically did). A temporary file download
            # can't be the same so the test can be skipped.
            if dest_stat and not is_temporary:
                # Be consistent and use lstat here (even though realpath
                # has been called). It does not harm.
                local_src_stat = os.lstat(local_src)
                if dest_stat.st_ino == local_src_stat.st_ino and dest_stat.st_dev == local_src_stat.st_dev:
                    log.debug(
                        "Destination URI %s is the same file as source URI %s, returning immediately."
                        " No further action required.",
                        self,
                        local_uri,
                    )
                    return

            if not overwrite and dest_stat:
                raise FileExistsError(
                    f"Destination path '{self}' already exists. Transfer from {src} cannot be completed."
                )

            # Make the path absolute (but don't follow links since that
            # would possibly cause us to end up in the wrong place if the
            # file existed already as a soft link)
            newFullPath = os.path.abspath(self.ospath)
            outputDir = os.path.dirname(newFullPath)
            if not os.path.isdir(outputDir):
                # Must create the directory -- this can not be rolled back
                # since another transfer running concurrently may
                # be relying on this existing.
                _create_directories(outputDir)

            if transaction is None:
                # Use a no-op transaction to reduce code duplication
                transaction = NoTransaction()

            # For links the OS doesn't let us overwrite so if something does
            # exist we have to remove it before we do the actual "transfer"
            # below
            if "link" in transfer and overwrite and dest_stat:
                with contextlib.suppress(Exception):
                    # If this fails we ignore it since it's a problem
                    # that will manifest immediately below with a more relevant
                    # error message
                    self.remove()

            if transfer == "move":
                # If a rename works we try that since that is guaranteed to
                # be atomic. If that fails we copy and rename. We do this
                # in case other processes are trying to move to the same
                # file and we want the "winner" to not be corrupted.
                try:
                    with transaction.undoWith(f"move from {local_src}", os.rename, newFullPath, local_src):
                        os.rename(local_src, newFullPath)
                except OSError:
                    with self.temporary_uri(prefix=self.parent(), suffix=self.getExtension()) as temp_copy:
                        shutil.copy(local_src, temp_copy.ospath)
                        with transaction.undoWith(
                            f"move from {local_src}",
                            shutil.move,
                            newFullPath,
                            local_src,
                            copy_function=shutil.copy,
                        ):
                            os.rename(temp_copy.ospath, newFullPath)
                    os.remove(local_src)
            elif transfer == "copy":
                # We want atomic copies so first copy to a temp location in
                # the same output directory. This at least guarantees that
                # if multiple processes are writing to the same file
                # simultaneously the file we end up with will not be corrupt.
                with self.temporary_uri(prefix=self.parent(), suffix=self.getExtension()) as temp_copy:
                    shutil.copy(local_src, temp_copy.ospath)
                    with transaction.undoWith(f"copy from {local_src}", os.remove, newFullPath):
                        # os.rename works even if the file exists.
                        # It's possible that another process has copied a file
                        # in whilst this one was copying. If overwrite
                        # protection is needed then another stat() call should
                        # happen here.
                        os.rename(temp_copy.ospath, newFullPath)
            elif transfer == "link":
                # Try hard link and if that fails use a symlink
                with transaction.undoWith(f"link to {local_src}", os.remove, newFullPath):
                    try:
                        os.link(local_src, newFullPath)
                    except OSError:
                        # Read through existing symlinks
                        os.symlink(local_src, newFullPath)
            elif transfer == "hardlink":
                with transaction.undoWith(f"hardlink to {local_src}", os.remove, newFullPath):
                    os.link(local_src, newFullPath)
            elif transfer == "symlink":
                # Read through existing symlinks
                with transaction.undoWith(f"symlink to {local_src}", os.remove, newFullPath):
                    os.symlink(local_src, newFullPath)
            elif transfer == "relsymlink":
                # This is a standard symlink but using a relative path
                # Need the directory name to give to relative root
                # A full file path confuses it into an extra ../
                newFullPathRoot = os.path.dirname(newFullPath)
                relPath = os.path.relpath(local_src, newFullPathRoot)
                with transaction.undoWith(f"relsymlink to {local_src}", os.remove, newFullPath):
                    os.symlink(relPath, newFullPath)
            else:
                raise NotImplementedError(f"Transfer type '{transfer}' not supported.")

        # This was an explicit move requested from a remote resource
        # try to remove that remote resource. We check is_temporary because
        # the local file would have been moved by shutil.move already.
        if requested_transfer == "move" and is_temporary and src != local_uri:
            # Transactions do not work here
            src.remove()

    def walk(
        self, file_filter: str | re.Pattern | None = None
    ) -> Iterator[list | tuple[ResourcePath, list[str], list[str]]]:
        """Walk the directory tree returning matching files and directories.

        Parameters
        ----------
        file_filter : `str` or `re.Pattern`, optional
            Regex to filter out files from the list before it is returned.
            Only file names for which a search with this regex succeeds
            are kept.

        Yields
        ------
        dirpath : `ResourcePath`
            Current directory being examined.
        dirnames : `list` of `str`
            Names of subdirectories within dirpath.
        filenames : `list` of `str`
            Names of all the files within dirpath.

        Raises
        ------
        ValueError
            Raised if this URI is not a directory.
        """
        if not self.isdir():
            raise ValueError("Can not walk a non-directory URI")

        if isinstance(file_filter, str):
            file_filter = re.compile(file_filter)

        for root, dirs, files in os.walk(self.ospath, followlinks=True):
            # Filter by the regex
            if file_filter is not None:
                files = [f for f in files if file_filter.search(f)]
            yield type(self)(root, forceAbsolute=False, forceDirectory=True), dirs, files

    @classmethod
    def _fixupPathUri(
        cls,
        parsed: urllib.parse.ParseResult,
        root: ResourcePath | None = None,
        forceAbsolute: bool = False,
        forceDirectory: bool = False,
    ) -> tuple[urllib.parse.ParseResult, bool]:
        """Fix up relative paths in URI instances.

        Parameters
        ----------
        parsed : `~urllib.parse.ParseResult`
            The result from parsing a URI using `urllib.parse`.
        root : `ResourcePath`, optional
            Path to use as root when converting relative to absolute.
            If `None`, it will be the current working directory. It is only
            used if a file-scheme is used incorrectly with a relative path.
        forceAbsolute : `bool`, ignored
            Has no effect for this subclass. ``file`` URIs are always
            absolute.
        forceDirectory : `bool`, optional
            If `True` forces the URI to end with a separator, otherwise given
            URI is interpreted as is.

        Returns
        -------
        modified : `~urllib.parse.ParseResult`
            Update result if a URI is being handled.
        dirLike : `bool`
            `True` if given parsed URI has a trailing separator or
            forceDirectory is True. Otherwise `False`.

        Raises
        ------
        RuntimeError
            Raised if ``root`` has a scheme other than ``file``.

        Notes
        -----
        Relative paths are explicitly not supported by RFC8089 but `urllib`
        does accept URIs of the form ``file:relative/path.ext``. They need
        to be turned into absolute paths before they can be used. This is
        always done regardless of the ``forceAbsolute`` parameter.
        """
        # assume we are not dealing with a directory like URI
        dirLike = False

        # file URI implies POSIX path separators so split as POSIX,
        # then join as os, and convert to abspath. Do not handle
        # home directories since "file" scheme is explicitly documented
        # to not do tilde expansion.
        sep = posixpath.sep

        # For local file system we can explicitly check to see if this
        # really is a directory. The URI might point to a location that
        # does not exists yet but all that matters is if it is a directory
        # then we make sure use that fact. No need to do the check if
        # we are already being told.
        # NOTE(review): ``parsed.path`` is checked as-is here, whereas
        # ``ospath`` unquotes the URI path before touching the file system.
        # Confirm that percent-encoded directory names behave as intended.
        if not forceDirectory and posixpath.isdir(parsed.path):
            forceDirectory = True

        # For an absolute path all we need to do is check if we need
        # to force the directory separator
        if posixpath.isabs(parsed.path):
            if forceDirectory:
                if not parsed.path.endswith(sep):
                    parsed = parsed._replace(path=parsed.path + sep)
                dirLike = True
            return copy.copy(parsed), dirLike

        # Relative path so must fix it to be compliant with the standard

        # Replacement values for the URI
        replacements = {}

        if root is None:
            root_str = os.path.abspath(os.path.curdir)
        else:
            if root.scheme and root.scheme != "file":
                raise RuntimeError(f"The override root must be a file URI not {root.scheme}")
            root_str = os.path.abspath(root.ospath)

        replacements["path"] = posixpath.normpath(posixpath.join(os2posix(root_str), parsed.path))

        # normpath strips trailing "/" so put it back if necessary
        # Acknowledge that trailing separator exists.
        if forceDirectory or (parsed.path.endswith(sep) and not replacements["path"].endswith(sep)):
            replacements["path"] += sep
            dirLike = True

        # ParseResult is a NamedTuple so _replace is standard API
        parsed = parsed._replace(**replacements)

        if parsed.params or parsed.query:
            log.warning("Additional items unexpectedly encountered in file URI: %s", parsed.geturl())

        return parsed, dirLike

    @contextlib.contextmanager
    def _openImpl(
        self,
        mode: str = "r",
        *,
        encoding: str | None = None,
    ) -> Iterator[IO]:
        """Open the local file and yield a handle to it.

        Parameters
        ----------
        mode : `str`, optional
            Mode passed through to the file handle. Defaults to ``"r"``.
        encoding : `str`, optional
            Encoding passed through to the file handle.

        Yields
        ------
        buffer : `IO`
            The open `FileResourceHandle`, closed when the context exits.
        """
        with FileResourceHandle(mode=mode, log=log, filename=self.ospath, encoding=encoding) as buffer:
            yield buffer  # type: ignore

483 

484 

def _create_directories(name: str | bytes) -> None:
    """Create a directory, together with any missing parent directories.

    Parameters
    ----------
    name : `str` or `bytes`
        Path to the directory to be created.

    Notes
    -----
    This is a copy of the standard library function `os.makedirs`, always
    using mode ``0o777`` and always tolerating a directory that already
    exists. The one change is that each newly created directory is passed to
    `ensure_directory_is_writeable`, so that a restrictive process umask
    does not leave us with directories we can not create or access files
    in.
    """
    parent, leaf = os.path.split(name)
    if not leaf:
        # A trailing separator gives an empty final component; split again.
        parent, leaf = os.path.split(parent)

    if parent and leaf and not os.path.exists(parent):
        # Defeats race condition when another thread created the path
        with contextlib.suppress(FileExistsError):
            _create_directories(parent)
        current_dir = bytes(os.curdir, "ASCII") if isinstance(leaf, bytes) else os.curdir
        if leaf == current_dir:
            # xxx/newdir/. exists if xxx/newdir exists
            return

    try:
        os.mkdir(name, 0o777)
        # This is the portion that is modified relative to the standard library
        # version of the function.
        ensure_directory_is_writeable(name)
        # end modified portion
    except OSError:
        # Cannot rely on checking for EEXIST, since the operating system
        # could give priority to other errors like EACCES or EROFS
        if not os.path.isdir(name):
            raise