Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 26%

324 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Cache management for a datastore.""" 

25 

26__all__ = ( 

27 "AbstractDatastoreCacheManager", 

28 "DatastoreDisabledCacheManager", 

29 "DatastoreCacheManager", 

30 "DatastoreCacheManagerConfig", 

31) 

32 

33import atexit 

34import contextlib 

35import datetime 

36import itertools 

37import logging 

38import os 

39import shutil 

40import tempfile 

41from abc import ABC, abstractmethod 

42from collections import defaultdict 

43from typing import ( 

44 TYPE_CHECKING, 

45 Dict, 

46 ItemsView, 

47 Iterable, 

48 Iterator, 

49 KeysView, 

50 List, 

51 Optional, 

52 Union, 

53 ValuesView, 

54) 

55 

56from pydantic import BaseModel, PrivateAttr 

57 

58from ._butlerUri import ButlerURI 

59from .config import ConfigSubset 

60from .configSupport import processLookupConfigs 

61from .datasets import DatasetId, DatasetRef 

62 

63if TYPE_CHECKING: 

64 from .configSupport import LookupKey 

65 from .datasets import DatasetType 

66 from .dimensions import DimensionUniverse 

67 from .storageClass import StorageClass 

68 

69log = logging.getLogger(__name__) 

70 

71 

72def remove_cache_directory(directory: str) -> None: 

73 """Remove the specified directory and all its contents.""" 

74 log.debug("Removing temporary cache directory %s", directory) 

75 shutil.rmtree(directory, ignore_errors=True) 

76 

77 

78def _construct_cache_path(root: ButlerURI, ref: DatasetRef, extension: str) -> ButlerURI: 

79 """Construct the full path to use for this dataset in the cache. 

80 

81 Parameters 

82 ---------- 

83 root : `ButlerURI` 
 Root directory of the cache. 
 ref : `DatasetRef` 

84 The dataset to look up in or write to the cache. 

85 extension : `str` 

86 File extension to use for this file. Should include the 

87 leading "``.``". 

88 

89 Returns 

90 ------- 

91 uri : `ButlerURI` 

92 URI to use for this dataset in the cache. 

93 """ 

94 # Dataset type component is needed in the name if composite 

95 # disassembly is happening since the ID is shared for all components. 

96 component = ref.datasetType.component() 

97 component = f"_{component}" if component else "" 

98 return root.join(f"{ref.id}{component}{extension}") 
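
# A minimal sketch of the naming convention implemented above (the ID,
# component and extension shown are hypothetical):
#
#     _construct_cache_path(root, ref, ".fits")
#     # -> <root>/1234_wcs.fits  if ref.id == 1234 and the dataset type has
#     #                          component "wcs"
#     # -> <root>/1234.fits      if there is no component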

99 

100 

101def _parse_cache_name(cached_location: str) -> Dict[str, Optional[str]]: 

102 """For a given cache name, return its component parts. 

103 

104 Changes to ``_construct_cache_path()`` should be reflected here. 

105 

106 Parameters 

107 ---------- 

108 cached_location : `str` 

109 The name of the file within the cache. 

110 

111 Returns 

112 ------- 

113 parsed : `dict` of `str`, `str` 

114 Parsed components of the file. These include: 

115 - "id": The dataset ID, 

116 - "component": The name of the component (can be `None`), 

117 - "extension": File extension (can be `None`). 

118 """ 

119 # Assume first dot is the extension and so allow .fits.gz 

120 root_ext = cached_location.split(".", maxsplit=1) 

121 root = root_ext.pop(0) 

122 ext = "." + root_ext.pop(0) if root_ext else None 

123 

124 parts = root.split("_") 

125 id_ = parts.pop(0) 

126 component = parts.pop(0) if parts else None 

127 return {"id": id_, "component": component, "extension": ext} 

128 

129 

130class CacheEntry(BaseModel): 

131 """Represent an entry in the cache.""" 

132 

133 name: str 

134 """Name of the file.""" 

135 

136 size: int 

137 """Size of the file in bytes.""" 

138 

139 ctime: datetime.datetime 

140 """Creation time of the file.""" 

141 

142 ref: DatasetId 

143 """ID of this dataset.""" 

144 

145 component: Optional[str] 

146 """Component for this disassembled composite (optional).""" 

147 

148 @classmethod 

149 def from_file(cls, file: ButlerURI, root: ButlerURI) -> CacheEntry: 

150 """Construct an object from a file name. 

151 

152 Parameters 

153 ---------- 

154 file : `ButlerURI` 

155 Path to the file. 

156 root : `ButlerURI` 

157 Cache root directory. 

158 """ 

159 file_in_cache = file.relative_to(root) 

160 if file_in_cache is None: 

161 raise ValueError(f"Supplied file {file} is not inside root {root}") 

162 parts = _parse_cache_name(file_in_cache) 

163 

164 stat = os.stat(file.ospath) 

165 return cls( 

166 name=file_in_cache, 

167 size=stat.st_size, 

168 ref=parts["id"], 

169 component=parts["component"], 

170 ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime), 

171 ) 

172 

173 

174class CacheRegistry(BaseModel): 

175 """Collection of cache entries.""" 

176 

177 _size: int = PrivateAttr(0) 

178 """Size of the cache.""" 

179 

180 _entries: Dict[str, CacheEntry] = PrivateAttr({}) 

181 """Internal collection of cache entries.""" 

182 

183 @property 

184 def cache_size(self) -> int: 

185 return self._size 

186 

187 def __getitem__(self, key: str) -> CacheEntry: 

188 return self._entries[key] 

189 

190 def __setitem__(self, key: str, entry: CacheEntry) -> None: 

191 self._size += entry.size 

192 self._entries[key] = entry 

193 

194 def __delitem__(self, key: str) -> None: 

195 entry = self._entries.pop(key) 

196 self._decrement(entry) 

197 

198 def _decrement(self, entry: Optional[CacheEntry]) -> None: 

199 if entry: 

200 self._size -= entry.size 

201 if self._size < 0: 

202 log.warning("Cache size has gone negative. Inconsistent cache records...") 

203 self._size = 0 

204 

205 def __contains__(self, key: str) -> bool: 

206 return key in self._entries 

207 

208 def __len__(self) -> int: 

209 return len(self._entries) 

210 

211 def __iter__(self) -> Iterator[str]: # type: ignore 

212 return iter(self._entries) 

213 

214 def keys(self) -> KeysView[str]: 

215 return self._entries.keys() 

216 

217 def values(self) -> ValuesView[CacheEntry]: 

218 return self._entries.values() 

219 

220 def items(self) -> ItemsView[str, CacheEntry]: 

221 return self._entries.items() 

222 

223 def pop(self, key: str, default: Optional[CacheEntry] = None) -> Optional[CacheEntry]: 

224 entry = self._entries.pop(key, default) 

225 self._decrement(entry) 

226 return entry 
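
# A minimal usage sketch of the registry bookkeeping (hypothetical values; a
# real entry would normally be built with CacheEntry.from_file):
#
#     registry = CacheRegistry()
#     registry["1234.fits"] = CacheEntry(
#         name="1234.fits", size=10, ctime=datetime.datetime.utcnow(), ref=1234
#     )
#     registry.cache_size   # -> 10 (setting an entry adds its size)
#     del registry["1234.fits"]
#     registry.cache_size   # -> 0 (deleting an entry subtracts its size)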

227 

228 

229class DatastoreCacheManagerConfig(ConfigSubset): 

230 """Configuration information for `DatastoreCacheManager`.""" 

231 

232 component = "cached" 

233 requiredKeys = ("cacheable",) 

234 

235 

236class AbstractDatastoreCacheManager(ABC): 

237 """An abstract base class for managing caching in a Datastore. 

238 

239 Parameters 

240 ---------- 

241 config : `str` or `DatastoreCacheManagerConfig` 

242 Configuration to control caching. 

243 universe : `DimensionUniverse` 

244 Set of all known dimensions, used to expand and validate any used 

245 in lookup keys. 

246 """ 

247 

248 @property 

249 def cache_size(self) -> int: 

250 """Size of the cache in bytes.""" 

251 return 0 

252 

253 @property 

254 def file_count(self) -> int: 

255 """Return number of cached files tracked by registry.""" 

256 return 0 

257 

258 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

259 if not isinstance(config, DatastoreCacheManagerConfig): 

260 config = DatastoreCacheManagerConfig(config) 

261 assert isinstance(config, DatastoreCacheManagerConfig) 

262 self.config = config 

263 

264 @abstractmethod 

265 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

266 """Indicate whether the entity should be added to the cache. 

267 

268 This is relevant when reading or writing. 

269 

270 Parameters 

271 ---------- 

272 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

273 Thing to test against the configuration. The ``name`` property 

274 is used to determine a match. A `DatasetType` will first check 

275 its name, before checking its `StorageClass`. If there are no 

276 matches the default will be returned. 

277 

278 Returns 

279 ------- 

280 should_cache : `bool` 

281 Returns `True` if the dataset should be cached; `False` otherwise. 
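
 Notes
 -----
 For example (an illustrative case; the names are hypothetical): if
 ``entity`` is a `DatasetType` named ``"calexp"`` with storage class
 ``"ExposureF"``, a ``cacheable`` entry keyed by ``calexp`` takes
 precedence over one keyed by ``ExposureF``; if neither is present the
 configured default is returned.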

282 """ 

283 raise NotImplementedError() 

284 

285 @abstractmethod 

286 def move_to_cache(self, uri: ButlerURI, ref: DatasetRef) -> Optional[ButlerURI]: 

287 """Move a file to the cache. 

288 

289 Move the given file into the cache, using the supplied DatasetRef 

290 for naming. A call is made to `should_be_cached()` and if the 

291 DatasetRef should not be accepted `None` will be returned. 

292 

293 Cache expiry can occur as part of this call. 

294 

295 Parameters 

296 ---------- 

297 uri : `ButlerURI` 

298 Location of the file to be relocated to the cache. Will be moved. 

299 ref : `DatasetRef` 

300 Ref associated with this file. Will be used to determine the name 

301 of the file within the cache. 

302 

303 Returns 

304 ------- 

305 new : `ButlerURI` or `None` 

306 URI to the file within the cache, or `None` if the dataset 

307 was not accepted by the cache. 
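
 Examples
 --------
 A minimal usage sketch (``cache_manager``, ``local_uri`` and ``ref`` are
 assumed to exist):

 cached = cache_manager.move_to_cache(local_uri, ref)
 if cached is None:
     ...  # the dataset was not configured as cacheable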

308 """ 

309 raise NotImplementedError() 

310 

311 @abstractmethod 

312 @contextlib.contextmanager 

313 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ButlerURI]]: 

314 """Look for a dataset in the cache and return its location. 

315 

316 Parameters 

317 ---------- 

318 ref : `DatasetRef` 

319 Dataset to locate in the cache. 

320 extension : `str` 

321 File extension expected. Should include the leading "``.``". 

322 

323 Yields 

324 ------ 

325 uri : `ButlerURI` or `None` 

326 The URI to the cached file, or `None` if the file has not been 

327 cached. 

328 

329 Notes 

330 ----- 

331 Should be used as a context manager in order to prevent this 

332 file from being removed from the cache for that context. 
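
 For example (an illustrative sketch; ``cache_manager`` and ``ref`` are
 assumed to exist):

 with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
     if cached_uri is not None:
         ...  # read from cached_uri while it is protected from expiry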

333 """ 

334 raise NotImplementedError() 

335 

336 @abstractmethod 

337 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

338 """Remove the specified datasets from the cache. 

339 

340 It is not an error for these datasets to be missing from the cache. 

341 

342 Parameters 

343 ---------- 

344 ref : `DatasetRef` or iterable of `DatasetRef` 

345 The datasets to remove from the cache. 

346 """ 

347 raise NotImplementedError() 

348 

349 @abstractmethod 

350 def __str__(self) -> str: 

351 raise NotImplementedError() 

352 

353 

354class DatastoreCacheManager(AbstractDatastoreCacheManager): 

355 """A class for managing caching in a Datastore using local files. 

356 

357 Parameters 

358 ---------- 

359 config : `str` or `DatastoreCacheManagerConfig` 

360 Configuration to control caching. 

361 universe : `DimensionUniverse` 

362 Set of all known dimensions, used to expand and validate any used 

363 in lookup keys. 

364 

365 Notes 

366 ----- 

367 Two environment variables can be used to override the cache directory 

368 and expiration configuration: 

369 

370 * ``$DAF_BUTLER_CACHE_DIRECTORY`` 

371 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` 

372 

373 The expiration mode should take the form ``mode=threshold`` so for 

374 example to configure expiration to limit the cache directory to 5 datasets 

375 the value would be ``datasets=5``. 
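
 Examples
 --------
 An illustrative way to set both overrides from Python before the manager
 is constructed (the values shown are hypothetical):

 >>> import os
 >>> os.environ["DAF_BUTLER_CACHE_DIRECTORY"] = "/tmp/butler-cache"
 >>> os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=5"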

376 """ 

377 

378 _temp_exemption_prefix = "exempt/" 

379 

380 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

381 super().__init__(config, universe) 

382 

383 # Set cache directory if it pre-exists, else defer creation until 

384 # requested. Allow external override from environment. 

385 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root") 

386 self._cache_directory = ( 

387 ButlerURI(root, forceAbsolute=True, forceDirectory=True) if root is not None else None 

388 ) 

389 

390 if self._cache_directory: 

391 if not self._cache_directory.isLocal: 

392 raise ValueError( 

393 f"Cache directory must be on a local file system. Got: {self._cache_directory}" 

394 ) 

395 # Ensure that the cache directory is created. We assume that 

396 # someone specifying a permanent cache directory will be expecting 

397 # it to always be there. This will also trigger an error 

398 # early rather than waiting until the cache is needed. 

399 self._cache_directory.mkdir() 

400 

401 # Calculate the caching lookup table. 

402 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe) 

403 

404 # Default decision for whether a dataset should be cached. 

405 self._caching_default = self.config.get("default", False) 

406 

407 # Expiration mode. Read from config but allow override from 

408 # the environment. 

409 expiration_mode = self.config.get(("expiry", "mode")) 

410 threshold = self.config.get(("expiry", "threshold")) 

411 

412 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE") 

413 if external_mode and "=" in external_mode: 

414 expiration_mode, expiration_threshold = external_mode.split("=", 1) 

415 threshold = int(expiration_threshold) 

416 if expiration_mode is None: 

417 # Force to None to avoid confusion. 

418 threshold = None 

419 

420 self._expiration_mode: Optional[str] = expiration_mode 

421 self._expiration_threshold: Optional[int] = threshold 

422 if self._expiration_threshold is None and self._expiration_mode is not None: 

423 raise ValueError( 

424 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}" 

425 ) 

426 

427 log.debug( 

428 "Cache configuration:\n- root: %s\n- expiration mode: %s", 

429 self._cache_directory if self._cache_directory else "tmpdir", 

430 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled", 

431 ) 

432 

433 # Files in cache, indexed by path within the cache directory. 

434 self._cache_entries = CacheRegistry() 

435 

436 @property 

437 def cache_directory(self) -> ButlerURI: 

438 if self._cache_directory is None: 

439 # Create on demand. 

440 self._cache_directory = ButlerURI( 

441 tempfile.mkdtemp(prefix="butler-"), forceDirectory=True, isTemporary=True 

442 ) 

443 log.debug("Creating temporary cache directory at %s", self._cache_directory) 

444 # Remove when we no longer need it. 

445 atexit.register(remove_cache_directory, self._cache_directory.ospath) 

446 return self._cache_directory 

447 

448 @property 

449 def _temp_exempt_directory(self) -> ButlerURI: 

450 """Return the directory in which to store temporary cache files that 

451 should not be expired. 

452 """ 

453 return self.cache_directory.join(self._temp_exemption_prefix) 

454 

455 @property 

456 def cache_size(self) -> int: 

457 return self._cache_entries.cache_size 

458 

459 @property 

460 def file_count(self) -> int: 

461 return len(self._cache_entries) 

462 

463 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

464 # Docstring inherited 

465 matchName: Union[LookupKey, str] = "{} (via default)".format(entity) 

466 should_cache = self._caching_default 

467 

468 for key in entity._lookupNames(): 

469 if key in self._lut: 

470 should_cache = bool(self._lut[key]) 

471 matchName = key 

472 break 

473 

474 if not isinstance(should_cache, bool): 

475 raise TypeError( 

476 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool." 

477 ) 

478 

479 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not") 

480 return should_cache 

481 

482 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ButlerURI: 

483 """Construct the name to use for this dataset in the cache. 

484 

485 Parameters 

486 ---------- 

487 ref : `DatasetRef` 

488 The dataset to look up in or write to the cache. 

489 extension : `str` 

490 File extension to use for this file. Should include the 

491 leading "``.``". 

492 

493 Returns 

494 ------- 

495 uri : `ButlerURI` 

496 URI to use for this dataset in the cache. 

497 """ 

498 return _construct_cache_path(self.cache_directory, ref, extension) 

499 

500 def move_to_cache(self, uri: ButlerURI, ref: DatasetRef) -> Optional[ButlerURI]: 

501 # Docstring inherited 

502 if ref.id is None: 

503 raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})") 

504 

505 if not self.should_be_cached(ref): 

506 return None 

507 

508 # Write the file using the id of the dataset ref and the file 

509 # extension. 

510 cached_location = self._construct_cache_name(ref, uri.getExtension()) 

511 

512 # Run cache expiry to ensure that we have room for this 

513 # item. 

514 self._expire_cache() 

515 

516 # Move into the cache. Given that multiple processes might be 

517 # sharing a single cache directory, and the file we need might have 

518 # been copied in whilst we were checking, allow overwrite without 

519 # complaint. Even for a private cache directory it is possible that 

520 # a second butler in a subprocess could be writing to it. 

521 cached_location.transfer_from(uri, transfer="move", overwrite=True) 

522 log.debug("Cached dataset %s to %s", ref, cached_location) 

523 

524 self._register_cache_entry(cached_location) 

525 

526 return cached_location 

527 

528 @contextlib.contextmanager 

529 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ButlerURI]]: 

530 # Docstring inherited 

531 # Short circuit this if the cache directory has not been created yet. 

532 if self._cache_directory is None: 

533 yield None 

534 return 

535 

536 cached_location = self._construct_cache_name(ref, extension) 

537 if cached_location.exists(): 

538 log.debug("Found cached file %s for dataset %s.", cached_location, ref) 

539 

540 # The cached file could be removed by another process doing 

541 # cache expiration so we need to protect against that by making 

542 # a copy in a different tree. Use hardlinks to ensure that 

543 # we either have the cached file or we don't. This is robust 

544 # against race conditions that can be caused by using soft links 

545 # and the other end of the link being deleted just after it 

546 # is created. 

547 path_in_cache = cached_location.relative_to(self.cache_directory) 

548 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory" 

549 temp_location: Optional[ButlerURI] = self._temp_exempt_directory.join(path_in_cache) 

550 try: 

551 if temp_location is not None: 

552 temp_location.transfer_from(cached_location, transfer="hardlink") 

553 except Exception: 

554 # Any failure will be treated as if the file was not 

555 # in the cache. Yielding the original cache location 

556 # is too dangerous. 

557 temp_location = None 

558 

559 try: 

560 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref) 

561 yield temp_location 

562 finally: 

563 try: 

564 if temp_location: 

565 temp_location.remove() 

566 except FileNotFoundError: 

567 pass 

568 return 

569 

570 log.debug("Dataset %s not found in cache.", ref) 

571 yield None 

572 return 

573 

574 def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

575 # Docstring inherited. 

576 

577 # Stop early if there are no cache entries anyhow. 

578 if len(self._cache_entries) == 0: 

579 return 

580 

581 if isinstance(refs, DatasetRef): 

582 refs = [refs] 

583 

584 # Create a set of all the IDs 

585 all_ids = {ref.getCheckedId() for ref in refs} 

586 

587 keys_to_remove = [] 

588 for key, entry in self._cache_entries.items(): 

589 if entry.ref in all_ids: 

590 keys_to_remove.append(key) 

591 self._remove_from_cache(keys_to_remove) 

592 

593 def _register_cache_entry(self, cached_location: ButlerURI, can_exist: bool = False) -> str: 

594 """Record the file in the cache registry. 

595 

596 Parameters 

597 ---------- 

598 cached_location : `ButlerURI` 

599 Location of the file to be registered. 

600 can_exist : `bool`, optional 

601 If `True` the item being registered may already be present in the registry. 

602 This can allow a cache refresh to run without checking the 

603 file again. If `False` it is an error for the registry to 

604 already know about this file. 

605 

606 Returns 

607 ------- 

608 cache_key : `str` 

609 The key used in the registry for this file. 

610 """ 

611 path_in_cache = cached_location.relative_to(self.cache_directory) 

612 if path_in_cache is None: 

613 raise ValueError( 

614 f"Can not register cached file {cached_location} that is not within" 

615 f" the cache directory at {self.cache_directory}." 

616 ) 

617 if path_in_cache in self._cache_entries: 

618 if can_exist: 

619 return path_in_cache 

620 else: 

621 raise ValueError( 

622 f"Cached file {cached_location} is already known to the registry" 

623 " but this was expected to be a new file." 

624 ) 

625 details = CacheEntry.from_file(cached_location, root=self.cache_directory) 

626 self._cache_entries[path_in_cache] = details 

627 return path_in_cache 

628 

629 def scan_cache(self) -> None: 

630 """Scan the cache directory and record information about files.""" 

631 found = set() 

632 for file in ButlerURI.findFileResources([self.cache_directory]): 

633 assert isinstance(file, ButlerURI), "Unexpectedly did not get ButlerURI from iterator" 

634 

635 # Skip any that are found in an exempt part of the hierarchy 

636 # since they should not be part of the registry. 

637 if file.relative_to(self._temp_exempt_directory) is not None: 

638 continue 

639 

640 path_in_cache = self._register_cache_entry(file, can_exist=True) 

641 found.add(path_in_cache) 

642 

643 # Find any files that were recorded in the cache but are no longer 

644 # on disk. (something else cleared them out?) 

645 known_to_cache = set(self._cache_entries) 

646 missing = known_to_cache - found 

647 

648 if missing: 

649 log.debug( 

650 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing) 

651 ) 

652 for path_in_cache in missing: 

653 self._cache_entries.pop(path_in_cache) 

654 

655 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None: 

656 """Remove the specified cache entries from cache. 

657 

658 Parameters 

659 ---------- 

660 cache_entries : iterable of `str` 

661 The entries to remove from the cache. The values are the path 

662 within the cache. 

663 """ 

664 for entry in cache_entries: 

665 path = self.cache_directory.join(entry) 

666 

667 self._cache_entries.pop(entry) 

668 log.debug("Removing file from cache: %s", path) 

669 try: 

670 path.remove() 

671 except FileNotFoundError: 

672 pass 

673 

674 def _expire_cache(self) -> None: 

675 """Expire the files in the cache. 

676 

677 Notes 

678 ----- 

679 The expiration modes are defined by the config or can be overridden. 

680 Available options: 

681 

682 * ``files``: Number of files. 

683 * ``datasets``: Number of datasets. 

684 * ``size``: Total size of files. 

685 * ``age``: Age of files. 

686 

687 The first three remove the oldest cache entries first. 

688 Number of files is complicated by the possibility of disassembled 

689 composites where 10 small files can be created for each dataset. 

690 

691 Additionally there is a use case for an external user to explicitly 

692 state the dataset refs that should be cached and then when to 

693 remove them, overriding any global configuration. 
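
 For example (illustrative thresholds; units follow the implementation
 below): ``files=50`` keeps at most 50 cached files, ``datasets=20`` keeps
 at most 20 dataset IDs, ``size=1_000_000`` limits the total size to about
 1 MB (bytes), and ``age=3600`` removes files older than an hour (seconds).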

694 """ 

695 if self._expiration_mode is None: 

696 # Expiration has been disabled. 

697 return 

698 

699 # mypy can't be sure we have set a threshold properly 

700 if self._expiration_threshold is None: 

701 log.warning( 

702 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode 

703 ) 

704 return 

705 

706 # Sync up cache. There is no file locking involved so for a shared 

707 # cache multiple processes may be racing to delete files. Deleting 

708 # a file that no longer exists is not an error. 

709 self.scan_cache() 

710 

711 if self._expiration_mode == "files": 

712 n_files = len(self._cache_entries) 

713 n_over = n_files - self._expiration_threshold 

714 if n_over > 0: 

715 sorted_keys = self._sort_cache() 

716 keys_to_remove = sorted_keys[:n_over] 

717 self._remove_from_cache(keys_to_remove) 

718 return 

719 

720 if self._expiration_mode == "datasets": 

721 # Count the datasets, in ascending timestamp order, 

722 # so that oldest turn up first. 

723 datasets = defaultdict(list) 

724 for key in self._sort_cache(): 

725 entry = self._cache_entries[key] 

726 datasets[entry.ref].append(key) 

727 

728 n_datasets = len(datasets) 

729 n_over = n_datasets - self._expiration_threshold 

730 if n_over > 0: 

731 # Keys will be read out in insert order which 

732 # will be date order so oldest ones are removed. 

733 ref_ids = list(datasets.keys())[:n_over] 

734 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids)) 

735 self._remove_from_cache(keys_to_remove) 

736 return 

737 

738 if self._expiration_mode == "size": 

739 if self.cache_size > self._expiration_threshold: 

740 for key in self._sort_cache(): 

741 self._remove_from_cache([key]) 

742 if self.cache_size <= self._expiration_threshold: 

743 break 

744 return 

745 

746 if self._expiration_mode == "age": 

747 now = datetime.datetime.utcnow() 

748 for key in self._sort_cache(): 

749 delta = now - self._cache_entries[key].ctime 

750 if delta.total_seconds() > self._expiration_threshold:  # compare the full age in seconds, not just the seconds component 

751 self._remove_from_cache([key]) 

752 else: 

753 # We're already in date order. 

754 break 

755 return 

756 

757 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}") 

758 

759 def _sort_cache(self) -> List[str]: 

760 """Sort the cache entries by time and return the sorted keys. 

761 

762 Returns 

763 ------- 

764 sorted : `list` of `str` 

765 Keys into the cache, sorted by time with oldest first. 

766 """ 

767 

768 def sort_by_time(key: str) -> datetime.datetime: 

769 """Sorter key function using cache entry details.""" 

770 return self._cache_entries[key].ctime 

771 

772 return sorted(self._cache_entries, key=sort_by_time) 

773 

774 def __str__(self) -> str: 

775 cachedir = self._cache_directory if self._cache_directory else "<tempdir>" 

776 return ( 

777 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," 

778 f"default={self._caching_default}) " 

779 f"n_files={self.file_count}, n_bytes={self.cache_size}" 

780 ) 
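
# An illustrative end-to-end sketch (names other than the class above are
# hypothetical; a real configuration, DimensionUniverse, DatasetRef and local
# ButlerURI are required):
#
#     manager = DatastoreCacheManager(cache_config_path, universe)
#     cached = manager.move_to_cache(local_uri, ref)       # may return None
#     with manager.find_in_cache(ref, ".fits") as uri:
#         if uri is not None:
#             ...  # read while the file is protected from cache expiry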

781 

782 

783class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): 

784 """A variant of the datastore cache where no cache is enabled. 

785 

786 Parameters 

787 ---------- 

788 config : `str` or `DatastoreCacheManagerConfig` 

789 Configuration to control caching. 

790 universe : `DimensionUniverse` 

791 Set of all known dimensions, used to expand and validate any used 

792 in lookup keys. 

793 """ 

794 

795 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

796 return 

797 

798 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

799 """Indicate whether the entity should be added to the cache. 

800 

801 Always returns `False`. 

802 """ 

803 return False 

804 

805 def move_to_cache(self, uri: ButlerURI, ref: DatasetRef) -> Optional[ButlerURI]: 

806 """Move dataset to cache but always refuse and returns `None`.""" 

807 return None 

808 

809 @contextlib.contextmanager 

810 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ButlerURI]]: 

811 """Look for a dataset in the cache and return its location. 

812 

813 Never finds a file. 

814 """ 

815 yield None 

816 

817 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

818 """Remove datasets from cache. 

819 

820 Always does nothing. 

821 """ 

822 return 

823 

824 def __str__(self) -> str: 

825 return f"{type(self).__name__}()"