Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 23%

395 statements  

coverage.py v7.2.7, created at 2023-06-15 09:13 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Cache management for a datastore.""" 

25 

26__all__ = ( 

27 "AbstractDatastoreCacheManager", 

28 "DatastoreDisabledCacheManager", 

29 "DatastoreCacheManager", 

30 "DatastoreCacheManagerConfig", 

31) 

32 

33import atexit 

34import contextlib 

35import datetime 

36import itertools 

37import logging 

38import os 

39import shutil 

40import tempfile 

41import uuid 

42from abc import ABC, abstractmethod 

43from collections import defaultdict 

44from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView 

45from random import Random 

46from typing import TYPE_CHECKING 

47 

48from lsst.resources import ResourcePath 

49from pydantic import BaseModel, PrivateAttr 

50 

51from .config import ConfigSubset 

52from .configSupport import processLookupConfigs 

53from .datasets import DatasetId, DatasetRef 

54 

55if TYPE_CHECKING: 

56 from .configSupport import LookupKey 

57 from .datasets import DatasetType 

58 from .dimensions import DimensionUniverse 

59 from .storageClass import StorageClass 

60 

61log = logging.getLogger(__name__) 

62 

63 

64def remove_cache_directory(directory: str) -> None: 

65 """Remove the specified directory and all its contents.""" 

66 log.debug("Removing temporary cache directory %s", directory) 

67 shutil.rmtree(directory, ignore_errors=True) 

68 

69 

70def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath: 

71 """Construct the full path to use for this dataset in the cache. 

72 

73 Parameters 

74 ---------- 

75 ref : `DatasetRef` 

76 The dataset to look up in or write to the cache. 

77 extension : `str` 

78 File extension to use for this file. Should include the 

79 leading "``.``". 

80 

81 Returns 

82 ------- 

83 uri : `lsst.resources.ResourcePath` 

84 URI to use for this dataset in the cache. 

85 """ 

86 # Dataset type component is needed in the name if composite 

87 # disassembly is happening since the ID is shared for all components. 

88 component = ref.datasetType.component() 

89 component = f"_{component}" if component else "" 

90 return root.join(f"{ref.id}{component}{extension}") 

91 

92 

93def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]: 

94 """For a given cache name, return its component parts. 

95 

96 Changes to ``_construct_cache_path()`` should be reflected here. 

97 

98 Parameters 

99 ---------- 

100 cached_location : `str` 

101 The name of the file within the cache. 

102 

103 Returns 

104 ------- 

105 id : `uuid.UUID` 

106 The dataset ID. 

107 component : `str` or `None` 

108 The name of the component, if present. 

109 extension : `str` or `None` 

110 The file extension, if present. 

111 """ 

112 # Assume first dot is the extension and so allow .fits.gz 

113 root_ext = cached_location.split(".", maxsplit=1) 

114 root = root_ext.pop(0) 

115 ext = "." + root_ext.pop(0) if root_ext else None 

116 

117 parts = root.split("_") 

118 id_ = uuid.UUID(parts.pop(0)) 

119 component = parts.pop(0) if parts else None 

120 return id_, component, ext 
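# The helper below is an illustrative sketch, not part of the original module:
# it shows how the naming scheme above round-trips. The UUID and the "wcs"
# component are invented values used purely for the example.
def _example_parse_cache_name_round_trip() -> None:
    """Show that a constructed cache name parses back into its parts."""
    name = "2d8b0a8e-0000-4000-8000-000000000000_wcs.fits.gz"
    id_, component, ext = _parse_cache_name(name)
    assert str(id_) == "2d8b0a8e-0000-4000-8000-000000000000"
    assert component == "wcs"
    assert ext == ".fits.gz"
    # A name with no underscore and no dot parses to (UUID(...), None, None).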

121 

122 

123class CacheEntry(BaseModel): 

124 """Represent an entry in the cache.""" 

125 

126 name: str 

127 """Name of the file.""" 

128 

129 size: int 

130 """Size of the file in bytes.""" 

131 

132 ctime: datetime.datetime 

133 """Creation time of the file.""" 

134 

135 ref: DatasetId 

136 """ID of this dataset.""" 

137 

138 component: str | None 

139 """Component for this disassembled composite (optional).""" 

140 

141 @classmethod 

142 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry: 

143 """Construct an object from a file name. 

144 

145 Parameters 

146 ---------- 

147 file : `lsst.resources.ResourcePath` 

148 Path to the file. 

149 root : `lsst.resources.ResourcePath` 

150 Cache root directory. 

151 """ 

152 file_in_cache = file.relative_to(root) 

153 if file_in_cache is None: 

154 raise ValueError(f"Supplied file {file} is not inside root {root}") 

155 id_, component, _ = _parse_cache_name(file_in_cache) 

156 

157 stat = os.stat(file.ospath) 

158 return cls( 

159 name=file_in_cache, 

160 size=stat.st_size, 

161 ref=id_, 

162 component=component, 

163 ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime), 

164 ) 

165 

166 

167class _MarkerEntry(CacheEntry): 

168 pass 

169 

170 

171class CacheRegistry(BaseModel): 

172 """Collection of cache entries.""" 

173 

174 _size: int = PrivateAttr(0) 

175 """Size of the cache.""" 

176 

177 _entries: dict[str, CacheEntry] = PrivateAttr({}) 

178 """Internal collection of cache entries.""" 

179 

180 _ref_map: dict[DatasetId, list[str]] = PrivateAttr({}) 

181 """Mapping of DatasetID to corresponding keys in cache registry.""" 

182 

183 @property 

184 def cache_size(self) -> int: 

185 return self._size 

186 

187 def __getitem__(self, key: str) -> CacheEntry: 

188 return self._entries[key] 

189 

190 def __setitem__(self, key: str, entry: CacheEntry) -> None: 

191 self._size += entry.size 

192 self._entries[key] = entry 

193 

194 # Update the mapping from ref to path. 

195 if entry.ref not in self._ref_map: 

196 self._ref_map[entry.ref] = [] 

197 self._ref_map[entry.ref].append(key) 

198 

199 def __delitem__(self, key: str) -> None: 

200 entry = self._entries.pop(key) 

201 self._decrement(entry) 

202 self._ref_map[entry.ref].remove(key) 

203 

204 def _decrement(self, entry: CacheEntry | None) -> None: 

205 if entry: 

206 self._size -= entry.size 

207 if self._size < 0: 

208 log.warning("Cache size has gone negative. Inconsistent cache records...") 

209 self._size = 0 

210 

211 def __contains__(self, key: str) -> bool: 

212 return key in self._entries 

213 

214 def __len__(self) -> int: 

215 return len(self._entries) 

216 

217 def __iter__(self) -> Iterator[str]: # type: ignore 

218 return iter(self._entries) 

219 

220 def keys(self) -> KeysView[str]: 

221 return self._entries.keys() 

222 

223 def values(self) -> ValuesView[CacheEntry]: 

224 return self._entries.values() 

225 

226 def items(self) -> ItemsView[str, CacheEntry]: 

227 return self._entries.items() 

228 

229 # A private marker to indicate that pop() should raise if no default 

230 # is given. 

231 __marker = _MarkerEntry( 

232 name="marker", 

233 size=0, 

234 ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), 

235 ctime=datetime.datetime.utcfromtimestamp(0), 

236 ) 

237 

238 def pop(self, key: str, default: CacheEntry | None = __marker) -> CacheEntry | None: 

239 # The marker for dict.pop is not the same as our marker. 

240 if default is self.__marker: 

241 entry = self._entries.pop(key) 

242 else: 

243 entry = self._entries.pop(key, self.__marker) 

244 # Should not attempt to correct for this entry being removed 

245 # if we got the default value. 

246 if entry is self.__marker: 

247 return default 

248 

249 self._decrement(entry) 

250 # The default entry given to this method may not even be in the cache. 

251 if entry and entry.ref in self._ref_map: 

252 keys = self._ref_map[entry.ref] 

253 if key in keys: 

254 keys.remove(key) 

255 return entry 

256 

257 def get_dataset_keys(self, dataset_id: DatasetId | None) -> list[str] | None: 

258 """Retrieve all keys associated with the given dataset ID. 

259 

260 Parameters 

261 ---------- 

262 dataset_id : `DatasetId` or `None` 

263 The dataset ID to look up. Returns `None` if the ID is `None`. 

264 

265 Returns 

266 ------- 

267 keys : `list` [`str`] 

268 Keys associated with this dataset. These keys can be used to lookup 

269 the cache entry information in the `CacheRegistry`. Returns 

270 `None` if the dataset is not known to the cache. 

271 """ 

272 if dataset_id not in self._ref_map: 

273 return None 

274 keys = self._ref_map[dataset_id] 

275 if not keys: 

276 return None 

277 return keys 
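# Illustrative usage sketch for CacheRegistry (not part of the original
# module); the entry contents are invented values.
def _example_cache_registry_usage() -> None:
    """Demonstrate the mapping interface and the size bookkeeping."""
    entry = CacheEntry(
        name="2d8b0a8e-0000-4000-8000-000000000000_wcs.fits",
        size=1024,
        ref=uuid.UUID("2d8b0a8e-0000-4000-8000-000000000000"),
        component="wcs",
        ctime=datetime.datetime.utcnow(),
    )
    registry = CacheRegistry()
    registry[entry.name] = entry  # Adds entry.size to cache_size.
    assert registry.cache_size == 1024
    assert registry.get_dataset_keys(entry.ref) == [entry.name]
    registry.pop(entry.name, None)  # Removes the entry and decrements the size.
    assert registry.cache_size == 0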

278 

279 

280class DatastoreCacheManagerConfig(ConfigSubset): 

281 """Configuration information for `DatastoreCacheManager`.""" 

282 

283 component = "cached" 

284 requiredKeys = ("cacheable",) 
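# The keys read later in this module ("root", "default", "cacheable", and the
# "expiry" mode/threshold pair) suggest a configuration shaped roughly like
# the sketch below. This is an assumption for illustration only; the
# authoritative schema lives in the daf_butler configuration files.
#
#   cached:
#     root: null
#     default: false
#     cacheable:
#       SomeDatasetTypeOrStorageClass: true
#     expiry:
#       mode: datasets
#       threshold: 5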

285 

286 

287class AbstractDatastoreCacheManager(ABC): 

288 """An abstract base class for managing caching in a Datastore. 

289 

290 Parameters 

291 ---------- 

292 config : `str` or `DatastoreCacheManagerConfig` 

293 Configuration to control caching. 

294 universe : `DimensionUniverse` 

295 Set of all known dimensions, used to expand and validate any used 

296 in lookup keys. 

297 """ 

298 

299 @property 

300 def cache_size(self) -> int: 

301 """Size of the cache in bytes.""" 

302 return 0 

303 

304 @property 

305 def file_count(self) -> int: 

306 """Return number of cached files tracked by registry.""" 

307 return 0 

308 

309 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

310 if not isinstance(config, DatastoreCacheManagerConfig): 

311 config = DatastoreCacheManagerConfig(config) 

312 assert isinstance(config, DatastoreCacheManagerConfig) 

313 self.config = config 

314 

315 @abstractmethod 

316 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

317 """Indicate whether the entity should be added to the cache. 

318 

319 This is relevant when reading or writing. 

320 

321 Parameters 

322 ---------- 

323 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

324 Thing to test against the configuration. The ``name`` property 

325 is used to determine a match. A `DatasetType` will first check 

326 its name, before checking its `StorageClass`. If there are no 

327 matches the default will be returned. 

328 

329 Returns 

330 ------- 

331 should_cache : `bool` 

332 Returns `True` if the dataset should be cached; `False` otherwise. 

333 """ 

334 raise NotImplementedError() 

335 

336 @abstractmethod 

337 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

338 """Report if the dataset is known to the cache. 

339 

340 Parameters 

341 ---------- 

342 ref : `DatasetRef` 

343 Dataset to check for in the cache. 

344 extension : `str`, optional 

345 File extension expected. Should include the leading "``.``". 

346 If `None` the extension is ignored and the dataset ID alone is 

347 used to check in the cache. The extension must be defined if 

348 a specific component is being checked. 

349 

350 Returns 

351 ------- 

352 known : `bool` 

353 Returns `True` if the dataset is currently known to the cache 

354 and `False` otherwise. 

355 

356 Notes 

357 ----- 

358 This method can only report if the dataset is known to the cache 

359 in this specific instant and does not indicate whether the file 

360 can be read from the cache later. `find_in_cache()` should be called 

361 if the cached file is to be used. 

362 """ 

363 raise NotImplementedError() 

364 

365 @abstractmethod 

366 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

367 """Move a file to the cache. 

368 

369 Move the given file into the cache, using the supplied DatasetRef 

370 for naming. A call is made to `should_be_cached()` and if the 

371 DatasetRef should not be accepted, `None` will be returned. 

372 

373 Cache expiry can occur as part of this call. 

374 

375 Parameters 

376 ---------- 

377 uri : `lsst.resources.ResourcePath` 

378 Location of the file to be relocated to the cache. Will be moved. 

379 ref : `DatasetRef` 

380 Ref associated with this file. Will be used to determine the name 

381 of the file within the cache. 

382 

383 Returns 

384 ------- 

385 new : `lsst.resources.ResourcePath` or `None` 

386 URI to the file within the cache, or `None` if the dataset 

387 was not accepted by the cache. 

388 """ 

389 raise NotImplementedError() 

390 

391 @abstractmethod 

392 @contextlib.contextmanager 

393 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

394 """Look for a dataset in the cache and return its location. 

395 

396 Parameters 

397 ---------- 

398 ref : `DatasetRef` 

399 Dataset to locate in the cache. 

400 extension : `str` 

401 File extension expected. Should include the leading "``.``". 

402 

403 Yields 

404 ------ 

405 uri : `lsst.resources.ResourcePath` or `None` 

406 The URI to the cached file, or `None` if the file has not been 

407 cached. 

408 

409 Notes 

410 ----- 

411 Should be used as a context manager in order to prevent this 

412 file from being removed from the cache for that context. 

413 """ 

414 raise NotImplementedError() 
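# Sketch of the context-manager usage described above; ``cache_manager``,
# ``ref``, and the reader functions are placeholders for illustration.
#
#   with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
#       if cached_uri is not None:
#           data = read_local_file(cached_uri)
#       else:
#           data = read_from_origin(ref)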

415 

416 @abstractmethod 

417 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None: 

418 """Remove the specified datasets from the cache. 

419 

420 It is not an error for these datasets to be missing from the cache. 

421 

422 Parameters 

423 ---------- 

424 ref : `DatasetRef` or iterable of `DatasetRef` 

425 The datasets to remove from the cache. 

426 """ 

427 raise NotImplementedError() 

428 

429 @abstractmethod 

430 def __str__(self) -> str: 

431 raise NotImplementedError() 

432 

433 

434class DatastoreCacheManager(AbstractDatastoreCacheManager): 

435 """A class for managing caching in a Datastore using local files. 

436 

437 Parameters 

438 ---------- 

439 config : `str` or `DatastoreCacheManagerConfig` 

440 Configuration to control caching. 

441 universe : `DimensionUniverse` 

442 Set of all known dimensions, used to expand and validate any used 

443 in lookup keys. 

444 

445 Notes 

446 ----- 

447 Two environment variables can be used to override the cache directory 

448 and expiration configuration: 

449 

450 * ``$DAF_BUTLER_CACHE_DIRECTORY`` 

451 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` 

452 

453 The expiration mode should take the form ``mode=threshold`` so for 

454 example to configure expiration to limit the cache directory to 5 datasets 

455 the value would be ``datasets=5``. 

456 

457 Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment 

458 variable can be used to indicate that this directory should be used 

459 if no explicit directory has been specified from configuration or from 

460 the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable. 

461 """ 

462 

463 _temp_exemption_prefix = "exempt/" 

464 _tmpdir_prefix = "butler-cache-dir-" 

465 

466 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

467 super().__init__(config, universe) 

468 

469 # Set the cache directory now if one is configured, else defer 

470 # creation until requested. Allow external override from environment. 

471 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root") 

472 

473 # Allow the execution environment to supply a fallback directory 

474 # so long as no value has been set by the line above. 

475 if root is None: 

476 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET") 

477 

478 self._cache_directory = ( 

479 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None 

480 ) 

481 

482 if self._cache_directory: 

483 if not self._cache_directory.isLocal: 

484 raise ValueError( 

485 f"Cache directory must be on a local file system. Got: {self._cache_directory}" 

486 ) 

487 # Ensure that the cache directory is created. We assume that 

488 # someone specifying a permanent cache directory will be expecting 

489 # it to always be there. This will also trigger an error 

490 # early rather than waiting until the cache is needed. 

491 self._cache_directory.mkdir() 

492 

493 # Calculate the caching lookup table. 

494 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe) 

495 

496 # Default decision for whether a dataset should be cached. 

497 self._caching_default = self.config.get("default", False) 

498 

499 # Expiration mode. Read from config but allow override from 

500 # the environment. 

501 expiration_mode = self.config.get(("expiry", "mode")) 

502 threshold = self.config.get(("expiry", "threshold")) 

503 

504 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE") 

505 if external_mode and "=" in external_mode: 

506 expiration_mode, expiration_threshold = external_mode.split("=", 1) 

507 threshold = int(expiration_threshold) 

508 if expiration_mode is None: 

509 # Force to None to avoid confusion. 

510 threshold = None 

511 

512 self._expiration_mode: str | None = expiration_mode 

513 self._expiration_threshold: int | None = threshold 

514 if self._expiration_threshold is None and self._expiration_mode is not None: 

515 raise ValueError( 

516 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}" 

517 ) 

518 

519 log.debug( 

520 "Cache configuration:\n- root: %s\n- expiration mode: %s", 

521 self._cache_directory if self._cache_directory else "tmpdir", 

522 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled", 

523 ) 

524 

525 # Files in cache, indexed by path within the cache directory. 

526 self._cache_entries = CacheRegistry() 

527 

528 @property 

529 def cache_directory(self) -> ResourcePath: 

530 if self._cache_directory is None: 

531 # Create on demand. Allow the override environment variable 

532 # to be used in case it got set after this object was created 

533 # but before a cache was used. 

534 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

535 # Someone else will clean this up. 

536 isTemporary = False 

537 msg = "deferred fallback" 

538 else: 

539 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix) 

540 isTemporary = True 

541 msg = "temporary" 

542 

543 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary) 

544 log.debug("Using %s cache directory at %s", msg, self._cache_directory) 

545 

546 # Remove when we no longer need it. 

547 if isTemporary: 

548 atexit.register(remove_cache_directory, self._cache_directory.ospath) 

549 return self._cache_directory 

550 

551 @property 

552 def _temp_exempt_directory(self) -> ResourcePath: 

553 """Return the directory in which to store temporary cache files that 

554 should not be expired. 

555 """ 

556 return self.cache_directory.join(self._temp_exemption_prefix) 

557 

558 @property 

559 def cache_size(self) -> int: 

560 return self._cache_entries.cache_size 

561 

562 @property 

563 def file_count(self) -> int: 

564 return len(self._cache_entries) 

565 

566 @classmethod 

567 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]: 

568 """Define a fallback cache directory if a fallback not set already. 

569 

570 Returns 

571 ------- 

572 defined : `bool` 

573 `True` if the fallback directory was newly-defined in this method. 

574 `False` if it had already been set. 

575 cache_dir : `str` 

576 Returns the path to the cache directory that will be used if it's 

577 needed. This can allow the caller to run a directory cleanup 

578 when it's no longer needed (something that the cache manager 

579 can not do because forks should not clean up directories defined 

580 by the parent process). 

581 

582 Notes 

583 ----- 

584 The fallback directory will not be defined if one has already been 

585 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` 

586 environment variable only if a value has not previously been stored 

587 in that environment variable. Setting the environment variable allows 

588 this value to survive into spawned subprocesses. Calling this method 

589 will lead to all subsequently created cache managers sharing the same 

590 cache. 

591 """ 

592 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

593 # A value has already been set. 

594 return (False, cache_dir) 

595 

596 # As a class method, we do not know at this point whether a cache 

597 # directory will be needed so it would be impolite to create a 

598 # directory that will never be used. 

599 

600 # Construct our own temp name -- 16 characters should have a fairly 

601 # low chance of clashing when combined with the process ID. 

602 characters = "abcdefghijklmnopqrstuvwxyz0123456789_" 

603 rng = Random() 

604 tempchars = "".join(rng.choice(characters) for _ in range(16)) 

605 

606 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}" 

607 

608 cache_dir = os.path.join(tempfile.gettempdir(), tempname) 

609 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir 

610 return (True, cache_dir) 
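# Intended calling pattern for the fallback helper above (a sketch; cleanup
# of the directory is the caller's responsibility, as noted in the
# docstring):
#
#   defined, fallback = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
#   ...  # spawn subprocesses that construct their own cache managers
#   if defined and os.path.exists(fallback):
#       shutil.rmtree(fallback, ignore_errors=True)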

611 

612 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

613 # Docstring inherited 

614 matchName: LookupKey | str = f"{entity} (via default)" 

615 should_cache = self._caching_default 

616 

617 for key in entity._lookupNames(): 

618 if key in self._lut: 

619 should_cache = bool(self._lut[key]) 

620 matchName = key 

621 break 

622 

623 if not isinstance(should_cache, bool): 

624 raise TypeError( 

625 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool." 

626 ) 

627 

628 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not") 

629 return should_cache 

630 

631 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath: 

632 """Construct the name to use for this dataset in the cache. 

633 

634 Parameters 

635 ---------- 

636 ref : `DatasetRef` 

637 The dataset to look up in or write to the cache. 

638 extension : `str` 

639 File extension to use for this file. Should include the 

640 leading "``.``". 

641 

642 Returns 

643 ------- 

644 uri : `lsst.resources.ResourcePath` 

645 URI to use for this dataset in the cache. 

646 """ 

647 return _construct_cache_path(self.cache_directory, ref, extension) 

648 

649 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

650 # Docstring inherited 

651 if not self.should_be_cached(ref): 

652 return None 

653 

654 # Write the file using the id of the dataset ref and the file 

655 # extension. 

656 cached_location = self._construct_cache_name(ref, uri.getExtension()) 

657 

658 # Run cache expiry to ensure that we have room for this 

659 # item. 

660 self._expire_cache() 

661 

662 # The above reset the in-memory cache status. It's entirely possible 

663 # that another process has just cached this file (if multiple 

664 # processes are caching on read), so check our in-memory cache 

665 # before attempting to cache the dataset. 

666 path_in_cache = cached_location.relative_to(self.cache_directory) 

667 if path_in_cache and path_in_cache in self._cache_entries: 

668 return cached_location 

669 

670 # Move into the cache. Given that multiple processes might be 

671 # sharing a single cache directory, and the file we need might have 

672 # been copied in whilst we were checking, allow overwrite without 

673 # complaint. Even for a private cache directory it is possible that 

674 # a second butler in a subprocess could be writing to it. 

675 cached_location.transfer_from(uri, transfer="move", overwrite=True) 

676 log.debug("Cached dataset %s to %s", ref, cached_location) 

677 

678 self._register_cache_entry(cached_location) 

679 

680 return cached_location 

681 

682 @contextlib.contextmanager 

683 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

684 # Docstring inherited 

685 # Short circuit this if the cache directory has not been created yet. 

686 if self._cache_directory is None: 

687 yield None 

688 return 

689 

690 cached_location = self._construct_cache_name(ref, extension) 

691 if cached_location.exists(): 

692 log.debug("Found cached file %s for dataset %s.", cached_location, ref) 

693 

694 # The cached file could be removed by another process doing 

695 # cache expiration so we need to protect against that by making 

696 # a copy in a different tree. Use hardlinks to ensure that 

697 # we either have the cached file or we don't. This is robust 

698 # against race conditions that can be caused by using soft links 

699 # and the other end of the link being deleted just after it 

700 # is created. 

701 path_in_cache = cached_location.relative_to(self.cache_directory) 

702 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory" 

703 

704 # Need to use a unique file name for the temporary location to 

705 # ensure that two different processes can read the file 

706 # simultaneously without one of them deleting it when it's in 

707 # use elsewhere. Retain the original filename for easier debugging. 

708 random = str(uuid.uuid4())[:8] 

709 basename = cached_location.basename() 

710 filename = f"{random}-{basename}" 

711 

712 temp_location: ResourcePath | None = self._temp_exempt_directory.join(filename) 

713 try: 

714 if temp_location is not None: 

715 temp_location.transfer_from(cached_location, transfer="hardlink") 

716 except Exception as e: 

717 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e) 

718 # Any failure will be treated as if the file was not 

719 # in the cache. Yielding the original cache location 

720 # is too dangerous. 

721 temp_location = None 

722 

723 try: 

724 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref) 

725 yield temp_location 

726 finally: 

727 try: 

728 if temp_location: 

729 temp_location.remove() 

730 except FileNotFoundError: 

731 pass 

732 return 

733 

734 log.debug("Dataset %s not found in cache.", ref) 

735 yield None 

736 return 

737 

738 def remove_from_cache(self, refs: DatasetRef | Iterable[DatasetRef]) -> None: 

739 # Docstring inherited. 

740 

741 # Stop early if there are no cache entries anyhow. 

742 if len(self._cache_entries) == 0: 

743 return 

744 

745 if isinstance(refs, DatasetRef): 

746 refs = [refs] 

747 

748 # Create a set of all the IDs 

749 all_ids = {ref.id for ref in refs} 

750 

751 keys_to_remove = [] 

752 for key, entry in self._cache_entries.items(): 

753 if entry.ref in all_ids: 

754 keys_to_remove.append(key) 

755 self._remove_from_cache(keys_to_remove) 

756 

757 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str | None: 

758 """Record the file in the cache registry. 

759 

760 Parameters 

761 ---------- 

762 cached_location : `lsst.resources.ResourcePath` 

763 Location of the file to be registered. 

764 can_exist : `bool`, optional 

765 If `True` the item being registered can already be listed. 

766 This can allow a cache refresh to run without checking the 

767 file again. If `False` it is an error for the registry to 

768 already know about this file. 

769 

770 Returns 

771 ------- 

772 cache_key : `str` or `None` 

773 The key used in the registry for this file. `None` if the file 

774 no longer exists (it could have been expired by another process). 

775 """ 

776 path_in_cache = cached_location.relative_to(self.cache_directory) 

777 if path_in_cache is None: 

778 raise ValueError( 

779 f"Can not register cached file {cached_location} that is not within" 

780 f" the cache directory at {self.cache_directory}." 

781 ) 

782 if path_in_cache in self._cache_entries: 

783 if can_exist: 

784 return path_in_cache 

785 else: 

786 raise ValueError( 

787 f"Cached file {cached_location} is already known to the registry" 

788 " but this was expected to be a new file." 

789 ) 

790 try: 

791 details = CacheEntry.from_file(cached_location, root=self.cache_directory) 

792 except FileNotFoundError: 

793 return None 

794 self._cache_entries[path_in_cache] = details 

795 return path_in_cache 

796 

797 def scan_cache(self) -> None: 

798 """Scan the cache directory and record information about files.""" 

799 found = set() 

800 for file in ResourcePath.findFileResources([self.cache_directory]): 

801 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator" 

802 

803 # Skip any that are found in an exempt part of the hierarchy 

804 # since they should not be part of the registry. 

805 if file.relative_to(self._temp_exempt_directory) is not None: 

806 continue 

807 

808 path_in_cache = self._register_cache_entry(file, can_exist=True) 

809 if path_in_cache: 

810 found.add(path_in_cache) 

811 

812 # Find any files that were recorded in the cache but are no longer 

813 # on disk. (something else cleared them out?) 

814 known_to_cache = set(self._cache_entries) 

815 missing = known_to_cache - found 

816 

817 if missing: 

818 log.debug( 

819 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing) 

820 ) 

821 for path_in_cache in missing: 

822 self._cache_entries.pop(path_in_cache, None) 

823 

824 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

825 """Report if the dataset is known to the cache. 

826 

827 Parameters 

828 ---------- 

829 ref : `DatasetRef` 

830 Dataset to check for in the cache. 

831 extension : `str`, optional 

832 File extension expected. Should include the leading "``.``". 

833 If `None` the extension is ignored and the dataset ID alone is 

834 used to check in the cache. The extension must be defined if 

835 a specific component is being checked. 

836 

837 Returns 

838 ------- 

839 known : `bool` 

840 Returns `True` if the dataset is currently known to the cache 

841 and `False` otherwise. If the dataset refers to a component and 

842 an extension is given then only that component is checked. 

843 

844 Notes 

845 ----- 

846 This method can only report if the dataset is known to the cache 

847 in this specific instant and does not indicate whether the file 

848 can be read from the cache later. `find_in_cache()` should be called 

849 if the cached file is to be used. 

850 

851 This method does not force the cache to be re-scanned and so can miss 

852 cached datasets that have recently been written by other processes. 

853 """ 

854 if self._cache_directory is None: 

855 return False 

856 if self.file_count == 0: 

857 return False 

858 

859 if extension is None: 

860 # Look solely for matching dataset ref ID and not specific 

861 # components. 

862 cached_paths = self._cache_entries.get_dataset_keys(ref.id) 

863 return bool(cached_paths) 

864 

865 else: 

866 # Extension is known so we can do an explicit look up for the 

867 # cache entry. 

868 cached_location = self._construct_cache_name(ref, extension) 

869 path_in_cache = cached_location.relative_to(self.cache_directory) 

870 assert path_in_cache is not None # For mypy 

871 return path_in_cache in self._cache_entries 

872 

873 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None: 

874 """Remove the specified cache entries from cache. 

875 

876 Parameters 

877 ---------- 

878 cache_entries : iterable of `str` 

879 The entries to remove from the cache. The values are the path 

880 within the cache. 

881 """ 

882 for entry in cache_entries: 

883 path = self.cache_directory.join(entry) 

884 

885 self._cache_entries.pop(entry, None) 

886 log.debug("Removing file from cache: %s", path) 

887 try: 

888 path.remove() 

889 except FileNotFoundError: 

890 pass 

891 

892 def _expire_cache(self) -> None: 

893 """Expire the files in the cache. 

894 

895 Notes 

896 ----- 

897 The expiration modes are defined by the config or can be overridden. 

898 Available options: 

899 

900 * ``files``: Number of files. 

901 * ``datasets``: Number of datasets. 

902 * ``size``: Total size of files. 

903 * ``age``: Age of files. 

904 

905 The first three modes remove the oldest entries first. 

906 Number of files is complicated by the possibility of disassembled 

907 composites where 10 small files can be created for each dataset. 

908 

909 Additionally, there is a use case for an external user to explicitly 

910 state the dataset refs that should be cached and then when to 

911 remove them, overriding any global configuration. 

912 """ 

913 if self._expiration_mode is None: 

914 # Expiration has been disabled. 

915 return 

916 

917 # mypy can't be sure we have set a threshold properly 

918 if self._expiration_threshold is None: 

919 log.warning( 

920 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode 

921 ) 

922 return 

923 

924 # Sync up cache. There is no file locking involved so for a shared 

925 # cache multiple processes may be racing to delete files. Deleting 

926 # a file that no longer exists is not an error. 

927 self.scan_cache() 

928 

929 if self._expiration_mode == "files": 

930 n_files = len(self._cache_entries) 

931 n_over = n_files - self._expiration_threshold 

932 if n_over > 0: 

933 sorted_keys = self._sort_cache() 

934 keys_to_remove = sorted_keys[:n_over] 

935 self._remove_from_cache(keys_to_remove) 

936 return 

937 

938 if self._expiration_mode == "datasets": 

939 # Count the datasets, in ascending timestamp order, 

940 # so that the oldest turn up first. 

941 datasets = defaultdict(list) 

942 for key in self._sort_cache(): 

943 entry = self._cache_entries[key] 

944 datasets[entry.ref].append(key) 

945 

946 n_datasets = len(datasets) 

947 n_over = n_datasets - self._expiration_threshold 

948 if n_over > 0: 

949 # Keys will be read out in insertion order, which 

950 # is date order, so the oldest ones are removed. 

951 ref_ids = list(datasets.keys())[:n_over] 

952 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids)) 

953 self._remove_from_cache(keys_to_remove) 

954 return 

955 

956 if self._expiration_mode == "size": 

957 if self.cache_size > self._expiration_threshold: 

958 for key in self._sort_cache(): 

959 self._remove_from_cache([key]) 

960 if self.cache_size <= self._expiration_threshold: 

961 break 

962 return 

963 

964 if self._expiration_mode == "age": 

965 now = datetime.datetime.utcnow() 

966 for key in self._sort_cache(): 

967 delta = now - self._cache_entries[key].ctime 

968 if delta.total_seconds() > self._expiration_threshold: 

969 self._remove_from_cache([key]) 

970 else: 

971 # We're already in date order. 

972 break 

973 return 

974 

975 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}") 

976 

977 def _sort_cache(self) -> list[str]: 

978 """Sort the cache entries by time and return the sorted keys. 

979 

980 Returns 

981 ------- 

982 sorted : `list` of `str` 

983 Keys into the cache, sorted by time with oldest first. 

984 """ 

985 

986 def sort_by_time(key: str) -> datetime.datetime: 

987 """Sorter key function using cache entry details.""" 

988 return self._cache_entries[key].ctime 

989 

990 return sorted(self._cache_entries, key=sort_by_time) 

991 

992 def __str__(self) -> str: 

993 cachedir = self._cache_directory if self._cache_directory else "<tempdir>" 

994 return ( 

995 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," 

996 f"default={self._caching_default}) " 

997 f"n_files={self.file_count}, n_bytes={self.cache_size}" 

998 ) 

999 

1000 

1001class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): 

1002 """A variant of the datastore cache where no cache is enabled. 

1003 

1004 Parameters 

1005 ---------- 

1006 config : `str` or `DatastoreCacheManagerConfig` 

1007 Configuration to control caching. 

1008 universe : `DimensionUniverse` 

1009 Set of all known dimensions, used to expand and validate any used 

1010 in lookup keys. 

1011 """ 

1012 

1013 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

1014 return 

1015 

1016 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

1017 """Indicate whether the entity should be added to the cache. 

1018 

1019 Always returns `False`. 

1020 """ 

1021 return False 

1022 

1023 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

1024 """Move dataset to cache but always refuse and returns `None`.""" 

1025 return None 

1026 

1027 @contextlib.contextmanager 

1028 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

1029 """Look for a dataset in the cache and return its location. 

1030 

1031 Never finds a file. 

1032 """ 

1033 yield None 

1034 

1035 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None: 

1036 """Remove datasets from cache. 

1037 

1038 Always does nothing. 

1039 """ 

1040 return 

1041 

1042 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

1043 """Report if a dataset is known to the cache. 

1044 

1045 Always returns `False`. 

1046 """ 

1047 return False 

1048 

1049 def __str__(self) -> str: 

1050 return f"{type(self).__name__}()"
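# End-to-end sketch of the two variants (illustrative only; ``cache_config``,
# ``universe``, ``ref``, and ``local_uri`` are placeholders):
#
#   manager = DatastoreCacheManager(cache_config, universe)
#   cached = manager.move_to_cache(local_uri, ref)  # May return None.
#   with manager.find_in_cache(ref, ".fits") as uri:
#       if uri is not None:
#           ...  # Read from the hard-linked temporary copy.
#
#   disabled = DatastoreDisabledCacheManager(cache_config, universe)
#   assert disabled.move_to_cache(local_uri, ref) is None  # Always refuses.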