Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 26%


325 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


from __future__ import annotations

"""Cache management for a datastore."""

__all__ = (
    "AbstractDatastoreCacheManager",
    "DatastoreDisabledCacheManager",
    "DatastoreCacheManager",
    "DatastoreCacheManagerConfig",
)

import atexit
import contextlib
import datetime
import itertools
import logging
import os
import shutil
import tempfile
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import (
    TYPE_CHECKING,
    Dict,
    ItemsView,
    Iterable,
    Iterator,
    KeysView,
    List,
    Optional,
    Union,
    ValuesView,
)

from lsst.resources import ResourcePath
from pydantic import BaseModel, PrivateAttr

from .config import ConfigSubset
from .configSupport import processLookupConfigs
from .datasets import DatasetId, DatasetRef


if TYPE_CHECKING:
    from .configSupport import LookupKey
    from .datasets import DatasetType
    from .dimensions import DimensionUniverse
    from .storageClass import StorageClass

log = logging.getLogger(__name__)


def remove_cache_directory(directory: str) -> None:
    """Remove the specified directory and all its contents."""
    log.debug("Removing temporary cache directory %s", directory)
    shutil.rmtree(directory, ignore_errors=True)



def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath:
    """Construct the full path to use for this dataset in the cache.

    Parameters
    ----------
    root : `lsst.resources.ResourcePath`
        Cache root directory.
    ref : `DatasetRef`
        The dataset to look up in or write to the cache.
    extension : `str`
        File extension to use for this file. Should include the
        leading "``.``".

    Returns
    -------
    uri : `lsst.resources.ResourcePath`
        URI to use for this dataset in the cache.
    """
    # Dataset type component is needed in the name if composite
    # disassembly is happening since the ID is shared for all components.
    component = ref.datasetType.component()
    component = f"_{component}" if component else ""
    return root.join(f"{ref.id}{component}{extension}")



def _parse_cache_name(cached_location: str) -> Dict[str, Optional[str]]:
    """For a given cache name, return its component parts.

    Changes to ``_construct_cache_path()`` should be reflected here.

    Parameters
    ----------
    cached_location : `str`
        The name of the file within the cache.

    Returns
    -------
    parsed : `dict` of `str`, `str`
        Parsed components of the file. These include:
        - "id": The dataset ID,
        - "component": The name of the component (can be `None`),
        - "extension": File extension (can be `None`).
    """
    # Assume first dot is the extension and so allow .fits.gz
    root_ext = cached_location.split(".", maxsplit=1)
    root = root_ext.pop(0)
    ext = "." + root_ext.pop(0) if root_ext else None

    parts = root.split("_")
    id_ = parts.pop(0)
    component = parts.pop(0) if parts else None
    return {"id": id_, "component": component, "extension": ext}

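# Illustrative sketch (hypothetical values, not part of the measured source):
# a round trip through the cache-name helpers above. A disassembled composite
# stores one file per component, so the component name is folded into the file
# name after the dataset ID.
def _example_cache_name_round_trip() -> None:
    parsed = _parse_cache_name("1234_wcs.fits.gz")
    assert parsed == {"id": "1234", "component": "wcs", "extension": ".fits.gz"}

    # A dataset without a component omits the underscore-separated part.
    parsed = _parse_cache_name("5678.json")
    assert parsed == {"id": "5678", "component": None, "extension": ".json"}
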

class CacheEntry(BaseModel):
    """Represent an entry in the cache."""

    name: str
    """Name of the file."""

    size: int
    """Size of the file in bytes."""

    ctime: datetime.datetime
    """Creation time of the file."""

    ref: DatasetId
    """ID of this dataset."""

    component: Optional[str]
    """Component for this disassembled composite (optional)."""

    @classmethod
    def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry:
        """Construct an object from a file name.

        Parameters
        ----------
        file : `lsst.resources.ResourcePath`
            Path to the file.
        root : `lsst.resources.ResourcePath`
            Cache root directory.
        """
        file_in_cache = file.relative_to(root)
        if file_in_cache is None:
            raise ValueError(f"Supplied file {file} is not inside root {root}")
        parts = _parse_cache_name(file_in_cache)

        stat = os.stat(file.ospath)
        return cls(
            name=file_in_cache,
            size=stat.st_size,
            ref=parts["id"],
            component=parts["component"],
            ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime),
        )



class CacheRegistry(BaseModel):
    """Collection of cache entries."""

    _size: int = PrivateAttr(0)
    """Size of the cache."""

    _entries: Dict[str, CacheEntry] = PrivateAttr({})
    """Internal collection of cache entries."""

    @property
    def cache_size(self) -> int:
        return self._size

    def __getitem__(self, key: str) -> CacheEntry:
        return self._entries[key]

    def __setitem__(self, key: str, entry: CacheEntry) -> None:
        self._size += entry.size
        self._entries[key] = entry

    def __delitem__(self, key: str) -> None:
        entry = self._entries.pop(key)
        self._decrement(entry)

    def _decrement(self, entry: Optional[CacheEntry]) -> None:
        if entry:
            self._size -= entry.size
            if self._size < 0:
                log.warning("Cache size has gone negative. Inconsistent cache records...")
                self._size = 0

    def __contains__(self, key: str) -> bool:
        return key in self._entries

    def __len__(self) -> int:
        return len(self._entries)

    def __iter__(self) -> Iterator[str]:  # type: ignore
        return iter(self._entries)

    def keys(self) -> KeysView[str]:
        return self._entries.keys()

    def values(self) -> ValuesView[CacheEntry]:
        return self._entries.values()

    def items(self) -> ItemsView[str, CacheEntry]:
        return self._entries.items()

    def pop(self, key: str, default: Optional[CacheEntry] = None) -> Optional[CacheEntry]:
        entry = self._entries.pop(key, default)
        self._decrement(entry)
        return entry


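# Illustrative sketch (hypothetical values): `CacheRegistry` behaves like a
# mapping of path-in-cache to `CacheEntry` while keeping a running total of
# the bytes it tracks. The entry is built by hand here, assuming that
# `DatasetId` will accept a plain integer ID.
def _example_cache_registry() -> None:
    registry = CacheRegistry()
    entry = CacheEntry(
        name="1234_wcs.fits",
        size=1024,
        ctime=datetime.datetime.utcnow(),
        ref=1234,
        component="wcs",
    )
    registry["1234_wcs.fits"] = entry
    assert registry.cache_size == 1024
    assert "1234_wcs.fits" in registry

    # Popping an entry decrements the running size again.
    registry.pop("1234_wcs.fits")
    assert registry.cache_size == 0

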

class DatastoreCacheManagerConfig(ConfigSubset):
    """Configuration information for `DatastoreCacheManager`."""

    component = "cached"
    requiredKeys = ("cacheable",)


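# Illustrative sketch: the shape of a ``cached`` configuration section,
# inferred from the keys read by `DatastoreCacheManager.__init__` below. The
# dataset type and storage class names used as lookup keys are hypothetical.
_EXAMPLE_CACHE_CONFIG = {
    "root": None,  # `None` defers to a temporary directory created on demand.
    "default": False,  # Fallback decision when no lookup key matches.
    "cacheable": {
        # Lookup keys can name dataset types or storage classes; the values
        # say whether matching datasets should be cached.
        "ExposureF": True,
        "raw": False,
    },
    "expiry": {
        "mode": "datasets",  # One of files, datasets, size, age; `None` disables expiry.
        "threshold": 5,
    },
}

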

class AbstractDatastoreCacheManager(ABC):
    """An abstract base class for managing caching in a Datastore.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    @property
    def cache_size(self) -> int:
        """Size of the cache in bytes."""
        return 0

    @property
    def file_count(self) -> int:
        """Return number of cached files tracked by registry."""
        return 0

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        if not isinstance(config, DatastoreCacheManagerConfig):
            config = DatastoreCacheManagerConfig(config)
        assert isinstance(config, DatastoreCacheManagerConfig)
        self.config = config

    @abstractmethod
    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        This is relevant when reading or writing.

        Parameters
        ----------
        entity : `StorageClass` or `DatasetType` or `DatasetRef`
            Thing to test against the configuration. The ``name`` property
            is used to determine a match. A `DatasetType` will first check
            its name, before checking its `StorageClass`. If there are no
            matches the default will be returned.

        Returns
        -------
        should_cache : `bool`
            Returns `True` if the dataset should be cached; `False` otherwise.
        """
        raise NotImplementedError()


    @abstractmethod
    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move a file to the cache.

        Move the given file into the cache, using the supplied DatasetRef
        for naming. A call is made to `should_be_cached()` and if the
        DatasetRef should not be accepted `None` will be returned.

        Cache expiry can occur during this.

        Parameters
        ----------
        uri : `lsst.resources.ResourcePath`
            Location of the file to be relocated to the cache. Will be moved.
        ref : `DatasetRef`
            Ref associated with this file. Will be used to determine the name
            of the file within the cache.

        Returns
        -------
        new : `lsst.resources.ResourcePath` or `None`
            URI to the file within the cache, or `None` if the dataset
            was not accepted by the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to locate in the cache.
        extension : `str`
            File extension expected. Should include the leading "``.``".

        Yields
        ------
        uri : `lsst.resources.ResourcePath` or `None`
            The URI to the cached file, or `None` if the file has not been
            cached.

        Notes
        -----
        Should be used as a context manager in order to prevent this
        file from being removed from the cache for that context.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove the specified datasets from the cache.

        It is not an error for these datasets to be missing from the cache.

        Parameters
        ----------
        ref : `DatasetRef` or iterable of `DatasetRef`
            The datasets to remove from the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    def __str__(self) -> str:
        raise NotImplementedError()


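# Illustrative sketch (hypothetical helper): how calling code might use the
# abstract interface above for a cache-aside read. It assumes
# `lsst.resources.ResourcePath.read` for fetching bytes; the other calls are
# the methods defined in this module.
def _example_read_via_cache(
    cache_manager: AbstractDatastoreCacheManager, local_copy: ResourcePath, ref: DatasetRef
) -> bytes:
    # Prefer an existing cached copy. All use of the yielded path must happen
    # inside the context manager, which protects the file from concurrent
    # cache expiration and cleans up afterwards.
    with cache_manager.find_in_cache(ref, local_copy.getExtension()) as cached:
        if cached is not None:
            return cached.read()

    # Not cached yet: read the local copy first, then offer it to the cache.
    # `move_to_cache` consults `should_be_cached` itself; if the dataset is
    # accepted the local file is moved into the cache, otherwise it is left
    # alone and `None` is returned.
    data = local_copy.read()
    cache_manager.move_to_cache(local_copy, ref)
    return data

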

class DatastoreCacheManager(AbstractDatastoreCacheManager):
    """A class for managing caching in a Datastore using local files.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.

    Notes
    -----
    Two environment variables can be used to override the cache directory
    and expiration configuration:

    * ``$DAF_BUTLER_CACHE_DIRECTORY``
    * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``

    The expiration mode should take the form ``mode=threshold``, so, for
    example, to limit the cache directory to 5 datasets the value would be
    ``datasets=5``.
    """


    _temp_exemption_prefix = "exempt/"

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        super().__init__(config, universe)

        # Set cache directory if it pre-exists, else defer creation until
        # requested. Allow external override from environment.
        root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")
        self._cache_directory = (
            ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None
        )

        if self._cache_directory:
            if not self._cache_directory.isLocal:
                raise ValueError(
                    f"Cache directory must be on a local file system. Got: {self._cache_directory}"
                )
            # Ensure that the cache directory is created. We assume that
            # someone specifying a permanent cache directory will be expecting
            # it to always be there. This will also trigger an error
            # early rather than waiting until the cache is needed.
            self._cache_directory.mkdir()

        # Calculate the caching lookup table.
        self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)

        # Default decision for whether a dataset should be cached.
        self._caching_default = self.config.get("default", False)

        # Expiration mode. Read from config but allow override from
        # the environment.
        expiration_mode = self.config.get(("expiry", "mode"))
        threshold = self.config.get(("expiry", "threshold"))

        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
        if external_mode and "=" in external_mode:
            expiration_mode, expiration_threshold = external_mode.split("=", 1)
            threshold = int(expiration_threshold)
        if expiration_mode is None:
            # Force to None to avoid confusion.
            threshold = None

        self._expiration_mode: Optional[str] = expiration_mode
        self._expiration_threshold: Optional[int] = threshold
        if self._expiration_threshold is None and self._expiration_mode is not None:
            raise ValueError(
                f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
            )

        log.debug(
            "Cache configuration:\n- root: %s\n- expiration mode: %s",
            self._cache_directory if self._cache_directory else "tmpdir",
            f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
        )


        # Files in cache, indexed by path within the cache directory.
        self._cache_entries = CacheRegistry()

    @property
    def cache_directory(self) -> ResourcePath:
        if self._cache_directory is None:
            # Create on demand.
            self._cache_directory = ResourcePath(
                tempfile.mkdtemp(prefix="butler-"), forceDirectory=True, isTemporary=True
            )
            log.debug("Creating temporary cache directory at %s", self._cache_directory)
            # Remove when we no longer need it.
            atexit.register(remove_cache_directory, self._cache_directory.ospath)
        return self._cache_directory

    @property
    def _temp_exempt_directory(self) -> ResourcePath:
        """Return the directory in which to store temporary cache files that
        should not be expired.
        """
        return self.cache_directory.join(self._temp_exemption_prefix)

    @property
    def cache_size(self) -> int:
        return self._cache_entries.cache_size

    @property
    def file_count(self) -> int:
        return len(self._cache_entries)

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        # Docstring inherited
        matchName: Union[LookupKey, str] = "{} (via default)".format(entity)
        should_cache = self._caching_default

        for key in entity._lookupNames():
            if key in self._lut:
                should_cache = bool(self._lut[key])
                matchName = key
                break

        if not isinstance(should_cache, bool):
            raise TypeError(
                f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
            )

        log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
        return should_cache


    def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath:
        """Construct the name to use for this dataset in the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to look up in or write to the cache.
        extension : `str`
            File extension to use for this file. Should include the
            leading "``.``".

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI to use for this dataset in the cache.
        """
        return _construct_cache_path(self.cache_directory, ref, extension)

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        # Docstring inherited
        if ref.id is None:
            raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})")

        if not self.should_be_cached(ref):
            return None

        # Write the file using the id of the dataset ref and the file
        # extension.
        cached_location = self._construct_cache_name(ref, uri.getExtension())

        # Run cache expiry to ensure that we have room for this
        # item.
        self._expire_cache()

        # Move into the cache. Given that multiple processes might be
        # sharing a single cache directory, and the file we need might have
        # been copied in whilst we were checking, allow overwrite without
        # complaint. Even for a private cache directory it is possible that
        # a second butler in a subprocess could be writing to it.
        cached_location.transfer_from(uri, transfer="move", overwrite=True)
        log.debug("Cached dataset %s to %s", ref, cached_location)

        self._register_cache_entry(cached_location)

        return cached_location


    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        # Docstring inherited
        # Short circuit this if the cache directory has not been created yet.
        if self._cache_directory is None:
            yield None
            return

        cached_location = self._construct_cache_name(ref, extension)
        if cached_location.exists():
            log.debug("Found cached file %s for dataset %s.", cached_location, ref)

            # The cached file could be removed by another process doing
            # cache expiration so we need to protect against that by making
            # a copy in a different tree. Use hardlinks to ensure that
            # we either have the cached file or we don't. This is robust
            # against race conditions that can be caused by using soft links
            # and the other end of the link being deleted just after it
            # is created.
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"
            temp_location: Optional[ResourcePath] = self._temp_exempt_directory.join(path_in_cache)
            try:
                if temp_location is not None:
                    temp_location.transfer_from(cached_location, transfer="hardlink")
            except Exception as e:
                log.debug("Detected error creating hardlink for dataset %s: %s", ref, e)
                # Any failure will be treated as if the file was not
                # in the cache. Yielding the original cache location
                # is too dangerous.
                temp_location = None

            try:
                log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
                yield temp_location
            finally:
                try:
                    if temp_location:
                        temp_location.remove()
                except FileNotFoundError:
                    pass
            return

        log.debug("Dataset %s not found in cache.", ref)
        yield None
        return


    def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        # Docstring inherited.

        # Stop early if there are no cache entries anyhow.
        if len(self._cache_entries) == 0:
            return

        if isinstance(refs, DatasetRef):
            refs = [refs]

        # Create a set of all the IDs
        all_ids = {ref.getCheckedId() for ref in refs}

        keys_to_remove = []
        for key, entry in self._cache_entries.items():
            if entry.ref in all_ids:
                keys_to_remove.append(key)
        self._remove_from_cache(keys_to_remove)


    def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str:
        """Record the file in the cache registry.

        Parameters
        ----------
        cached_location : `lsst.resources.ResourcePath`
            Location of the file to be registered.
        can_exist : `bool`, optional
            If `True` the item being registered can already be listed.
            This can allow a cache refresh to run without checking the
            file again. If `False` it is an error for the registry to
            already know about this file.

        Returns
        -------
        cache_key : `str`
            The key used in the registry for this file.
        """
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache is None:
            raise ValueError(
                f"Can not register cached file {cached_location} that is not within"
                f" the cache directory at {self.cache_directory}."
            )
        if path_in_cache in self._cache_entries:
            if can_exist:
                return path_in_cache
            else:
                raise ValueError(
                    f"Cached file {cached_location} is already known to the registry"
                    " but this was expected to be a new file."
                )
        details = CacheEntry.from_file(cached_location, root=self.cache_directory)
        self._cache_entries[path_in_cache] = details
        return path_in_cache

    def scan_cache(self) -> None:
        """Scan the cache directory and record information about files."""
        found = set()
        for file in ResourcePath.findFileResources([self.cache_directory]):
            assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"

            # Skip any that are found in an exempt part of the hierarchy
            # since they should not be part of the registry.
            if file.relative_to(self._temp_exempt_directory) is not None:
                continue

            path_in_cache = self._register_cache_entry(file, can_exist=True)
            found.add(path_in_cache)

        # Find any files that were recorded in the cache but are no longer
        # on disk. (something else cleared them out?)
        known_to_cache = set(self._cache_entries)
        missing = known_to_cache - found

        if missing:
            log.debug(
                "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing)
            )
            for path_in_cache in missing:
                self._cache_entries.pop(path_in_cache)


    def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
        """Remove the specified cache entries from cache.

        Parameters
        ----------
        cache_entries : iterable of `str`
            The entries to remove from the cache. The values are the path
            within the cache.
        """
        for entry in cache_entries:
            path = self.cache_directory.join(entry)

            self._cache_entries.pop(entry)
            log.debug("Removing file from cache: %s", path)
            try:
                path.remove()
            except FileNotFoundError:
                pass


    def _expire_cache(self) -> None:
        """Expire the files in the cache.

        Notes
        -----
        The expiration modes are defined by the config or can be overridden.
        Available options:

        * ``files``: Number of files.
        * ``datasets``: Number of datasets.
        * ``size``: Total size of files.
        * ``age``: Age of files.

        The first three remove entries in oldest-first order.
        Number of files is complicated by the possibility of disassembled
        composites where 10 small files can be created for each dataset.

        Additionally there is a use case for an external user to explicitly
        state the dataset refs that should be cached and then when to
        remove them, overriding any global configuration.
        """
        if self._expiration_mode is None:
            # Expiration has been disabled.
            return

        # mypy can't be sure we have set a threshold properly
        if self._expiration_threshold is None:
            log.warning(
                "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode
            )
            return

        # Sync up cache. There is no file locking involved so for a shared
        # cache multiple processes may be racing to delete files. Deleting
        # a file that no longer exists is not an error.
        self.scan_cache()

        if self._expiration_mode == "files":
            n_files = len(self._cache_entries)
            n_over = n_files - self._expiration_threshold
            if n_over > 0:
                sorted_keys = self._sort_cache()
                keys_to_remove = sorted_keys[:n_over]
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "datasets":
            # Count the datasets, in ascending timestamp order,
            # so that oldest turn up first.
            datasets = defaultdict(list)
            for key in self._sort_cache():
                entry = self._cache_entries[key]
                datasets[entry.ref].append(key)

            n_datasets = len(datasets)
            n_over = n_datasets - self._expiration_threshold
            if n_over > 0:
                # Keys will be read out in insert order which
                # will be date order so oldest ones are removed.
                ref_ids = list(datasets.keys())[:n_over]
                keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "size":
            if self.cache_size > self._expiration_threshold:
                for key in self._sort_cache():
                    self._remove_from_cache([key])
                    if self.cache_size <= self._expiration_threshold:
                        break
            return

        if self._expiration_mode == "age":
            now = datetime.datetime.utcnow()
            for key in self._sort_cache():
                delta = now - self._cache_entries[key].ctime
                if delta.seconds > self._expiration_threshold:
                    self._remove_from_cache([key])
                else:
                    # We're already in date order.
                    break
            return

        raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")


    def _sort_cache(self) -> List[str]:
        """Sort the cache entries by time and return the sorted keys.

        Returns
        -------
        sorted : `list` of `str`
            Keys into the cache, sorted by time with oldest first.
        """

        def sort_by_time(key: str) -> datetime.datetime:
            """Sorter key function using cache entry details."""
            return self._cache_entries[key].ctime

        return sorted(self._cache_entries, key=sort_by_time)

    def __str__(self) -> str:
        cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
        return (
            f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
            f"default={self._caching_default}) "
            f"n_files={self.file_count}, n_bytes={self.cache_size}"
        )


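# Illustrative sketch (hypothetical paths): constructing a cache manager from
# the example configuration above and exercising the environment overrides
# described in the class Notes. `DimensionUniverse` is only imported for type
# checking at the top of the module, so it is imported for real here.
def _example_cache_manager_setup() -> None:
    from .dimensions import DimensionUniverse

    # Environment overrides beat the configuration: use a fixed directory and
    # keep at most five datasets in the cache.
    os.environ["DAF_BUTLER_CACHE_DIRECTORY"] = "/tmp/butler-cache"
    os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=5"

    manager = DatastoreCacheManager(
        DatastoreCacheManagerConfig(_EXAMPLE_CACHE_CONFIG), universe=DimensionUniverse()
    )
    log.info("Configured cache manager: %s", manager)

    # The registry starts empty; files appear via `move_to_cache` or are
    # rediscovered from disk by `scan_cache`.
    assert manager.file_count == 0

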

class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
    """A variant of the datastore cache where no cache is enabled.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        return

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        Always returns `False`.
        """
        return False

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move dataset to cache but always refuse and return `None`."""
        return None

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Never finds a file.
        """
        yield None

    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove datasets from cache.

        Always does nothing.
        """
        return

    def __str__(self) -> str:
        return f"{type(self).__name__}()"
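

# Illustrative sketch: the disabled manager is a drop-in replacement that
# satisfies the same interface, so datastore code written against
# `AbstractDatastoreCacheManager` never needs to check whether caching is
# enabled. Its constructor ignores both arguments.
def _example_disabled_manager() -> None:
    from .dimensions import DimensionUniverse

    disabled = DatastoreDisabledCacheManager(
        DatastoreCacheManagerConfig(_EXAMPLE_CACHE_CONFIG), universe=DimensionUniverse()
    )
    # The base-class properties report an empty cache; nothing is ever
    # accepted or found.
    assert disabled.file_count == 0
    assert disabled.cache_size == 0
    with disabled.find_in_cache(ref=None, extension=".fits") as cached:  # type: ignore[arg-type]
        assert cached is None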