# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Cache management for a datastore."""

__all__ = ("AbstractDatastoreCacheManager",
           "DatastoreDisabledCacheManager",
           "DatastoreCacheManager",
           "DatastoreCacheManagerConfig",
           )

from typing import (
    TYPE_CHECKING,
    Dict,
    Iterable,
    Iterator,
    ItemsView,
    KeysView,
    List,
    Optional,
    Union,
    ValuesView,
)

from abc import ABC, abstractmethod
from collections import defaultdict
import atexit
import contextlib
import datetime
import itertools
import logging
import os
import shutil
import tempfile

from pydantic import BaseModel, PrivateAttr

from .configSupport import processLookupConfigs
from .config import ConfigSubset
from ._butlerUri import ButlerURI
from .datasets import DatasetId, DatasetRef

if TYPE_CHECKING:
    from .dimensions import DimensionUniverse
    from .datasets import DatasetType
    from .storageClass import StorageClass
    from .configSupport import LookupKey

log = logging.getLogger(__name__)


def remove_cache_directory(directory: str) -> None:
    """Remove the specified directory and all its contents."""
    log.debug("Removing temporary cache directory %s", directory)
    shutil.rmtree(directory, ignore_errors=True)


def _construct_cache_path(root: ButlerURI, ref: DatasetRef, extension: str) -> ButlerURI:
    """Construct the full path to use for this dataset in the cache.

    Parameters
    ----------
    root : `ButlerURI`
        Root directory of the cache.
    ref : `DatasetRef`
        The dataset to look up in or write to the cache.
    extension : `str`
        File extension to use for this file. Should include the
        leading "``.``".

    Returns
    -------
    uri : `ButlerURI`
        URI to use for this dataset in the cache.
    """
    # Dataset type component is needed in the name if composite
    # disassembly is happening since the ID is shared for all components.
    component = ref.datasetType.component()
    component = f"_{component}" if component else ""
    return root.join(f"{ref.id}{component}{extension}")


def _parse_cache_name(cached_location: str) -> Dict[str, Optional[str]]:
    """For a given cache name, return its component parts.

    Changes to ``_construct_cache_path()`` should be reflected here.

    Parameters
    ----------
    cached_location : `str`
        The name of the file within the cache.

    Returns
    -------
    parsed : `dict` of `str`, `str`
        Parsed components of the file. These include:

        - "id": The dataset ID,
        - "component": The name of the component (can be `None`),
        - "extension": File extension (can be `None`).

    """
    # Assume first dot is the extension and so allow .fits.gz
    root_ext = cached_location.split(".", maxsplit=1)
    root = root_ext.pop(0)
    ext = "." + root_ext.pop(0) if root_ext else None

    parts = root.split("_")
    id_ = parts.pop(0)
    component = parts.pop(0) if parts else None
    return {"id": id_, "component": component, "extension": ext}


class CacheEntry(BaseModel):
    """Represent an entry in the cache."""

    name: str
    """Name of the file."""

    size: int
    """Size of the file in bytes."""

    ctime: datetime.datetime
    """Creation time of the file."""

    ref: DatasetId
    """ID of this dataset."""

    component: Optional[str]
    """Component for this disassembled composite (optional)."""

    @classmethod
    def from_file(cls, file: ButlerURI, root: ButlerURI) -> CacheEntry:
        """Construct an object from a file name.

        Parameters
        ----------
        file : `ButlerURI`
            Path to the file.
        root : `ButlerURI`
            Cache root directory.
        """
        file_in_cache = file.relative_to(root)
        if file_in_cache is None:
            raise ValueError(f"Supplied file {file} is not inside root {root}")
        parts = _parse_cache_name(file_in_cache)

        stat = os.stat(file.ospath)
        return cls(name=file_in_cache, size=stat.st_size, ref=parts["id"], component=parts["component"],
                   ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime))


class CacheRegistry(BaseModel):
    """Collection of cache entries."""

    _size: int = PrivateAttr(0)
    """Size of the cache."""

    _entries: Dict[str, CacheEntry] = PrivateAttr({})
    """Internal collection of cache entries."""

    @property
    def cache_size(self) -> int:
        return self._size

    def __getitem__(self, key: str) -> CacheEntry:
        return self._entries[key]

    def __setitem__(self, key: str, entry: CacheEntry) -> None:
        self._size += entry.size
        self._entries[key] = entry

    def __delitem__(self, key: str) -> None:
        entry = self._entries.pop(key)
        self._decrement(entry)

    def _decrement(self, entry: Optional[CacheEntry]) -> None:
        if entry:
            self._size -= entry.size
            if self._size < 0:
                log.warning("Cache size has gone negative. Inconsistent cache records...")
                self._size = 0

    def __contains__(self, key: str) -> bool:
        return key in self._entries

    def __len__(self) -> int:
        return len(self._entries)

    def __iter__(self) -> Iterator[str]:  # type: ignore
        return iter(self._entries)

    def keys(self) -> KeysView[str]:
        return self._entries.keys()

    def values(self) -> ValuesView[CacheEntry]:
        return self._entries.values()

    def items(self) -> ItemsView[str, CacheEntry]:
        return self._entries.items()

    def pop(self, key: str, default: Optional[CacheEntry] = None) -> Optional[CacheEntry]:
        entry = self._entries.pop(key, default)
        self._decrement(entry)
        return entry


class DatastoreCacheManagerConfig(ConfigSubset):
    """Configuration information for `DatastoreCacheManager`."""

    component = "cached"
    requiredKeys = ("cacheable",)


class AbstractDatastoreCacheManager(ABC):
    """An abstract base class for managing caching in a Datastore.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    @property
    def cache_size(self) -> int:
        """Size of the cache in bytes."""
        return 0

    @property
    def file_count(self) -> int:
        """Return number of cached files tracked by registry."""
        return 0

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig],
                 universe: DimensionUniverse):
        if not isinstance(config, DatastoreCacheManagerConfig):
            config = DatastoreCacheManagerConfig(config)
        assert isinstance(config, DatastoreCacheManagerConfig)
        self.config = config

    @abstractmethod
    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        This is relevant when reading or writing.

        Parameters
        ----------
        entity : `StorageClass` or `DatasetType` or `DatasetRef`
            Thing to test against the configuration. The ``name`` property
            is used to determine a match. A `DatasetType` will first check
            its name, before checking its `StorageClass`. If there are no
            matches the default will be returned.

        Returns
        -------
        should_cache : `bool`
            Returns `True` if the dataset should be cached; `False` otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def move_to_cache(self, uri: ButlerURI, ref: DatasetRef) -> Optional[ButlerURI]:
        """Move a file to the cache.

        Move the given file into the cache, using the supplied DatasetRef
        for naming. A call is made to `should_be_cached()` and if the
        DatasetRef should not be accepted `None` will be returned.

        Cache expiry can occur during this.

        Parameters
        ----------
        uri : `ButlerURI`
            Location of the file to be relocated to the cache. Will be moved.
        ref : `DatasetRef`
            Ref associated with this file. Will be used to determine the name
            of the file within the cache.

        Returns
        -------
        new : `ButlerURI` or `None`
            URI to the file within the cache, or `None` if the dataset
            was not accepted by the cache.
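
        Examples
        --------
        A sketch of the intended caller pattern; ``cache_manager`` and
        ``local_uri`` are illustrative names assumed to exist in the caller::

            cached_uri = cache_manager.move_to_cache(local_uri, ref)
            if cached_uri is None:
                # The dataset was not accepted by the cache.
                ...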

        """
        raise NotImplementedError()

    @abstractmethod
    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ButlerURI]]:
        """Look for a dataset in the cache and return its location.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to locate in the cache.
        extension : `str`
            File extension expected. Should include the leading "``.``".

        Yields
        ------
        uri : `ButlerURI` or `None`
            The URI to the cached file, or `None` if the file has not been
            cached.

        Notes
        -----
        Should be used as a context manager in order to prevent this
        file from being removed from the cache for that context.
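
        A sketch of the intended usage, with ``cache_manager`` standing in
        for a concrete manager instance and ``read_dataset`` a hypothetical
        reader::

            with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
                if cached_uri is not None:
                    # The file is protected from expiry within this block.
                    data = read_dataset(cached_uri)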

        """
        raise NotImplementedError()

    @abstractmethod
    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove the specified datasets from the cache.

        It is not an error for these datasets to be missing from the cache.

        Parameters
        ----------
        ref : `DatasetRef` or iterable of `DatasetRef`
            The datasets to remove from the cache.
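
        Examples
        --------
        A sketch of the intended call, with ``cache_manager`` standing in for
        a concrete manager instance; a single ref or any iterable of refs is
        accepted::

            cache_manager.remove_from_cache(ref)
            cache_manager.remove_from_cache([ref_a, ref_b])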

        """
        raise NotImplementedError()

    @abstractmethod
    def __str__(self) -> str:
        raise NotImplementedError()


class DatastoreCacheManager(AbstractDatastoreCacheManager):
    """A class for managing caching in a Datastore using local files.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.

    Notes
    -----
    Two environment variables can be used to override the cache directory
    and expiration configuration:

    * ``$DAF_BUTLER_CACHE_DIRECTORY``
    * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``

    The expiration mode should take the form ``mode=threshold``; for example,
    to limit the cache directory to 5 datasets the value would be
    ``datasets=5``.
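
    As a sketch, a hypothetical override limiting the cache to 5 datasets
    before a manager is constructed (``config`` and ``universe`` are assumed
    to exist in the caller)::

        import os

        os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=5"
        cache_manager = DatastoreCacheManager(config, universe=universe)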

    """

    _temp_exemption_prefix = "exempt/"

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig],
                 universe: DimensionUniverse):
        super().__init__(config, universe)

        # Set cache directory if it pre-exists, else defer creation until
        # requested. Allow external override from environment.
        root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")
        self._cache_directory = ButlerURI(root, forceAbsolute=True,
                                          forceDirectory=True) if root is not None else None

        if self._cache_directory:
            if not self._cache_directory.isLocal:
                raise ValueError("Cache directory must be on a local file system. "
                                 f"Got: {self._cache_directory}")
            # Ensure that the cache directory is created. We assume that
            # someone specifying a permanent cache directory will be expecting
            # it to always be there. This will also trigger an error
            # early rather than waiting until the cache is needed.
            self._cache_directory.mkdir()

        # Calculate the caching lookup table.
        self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)

        # Default decision for whether a dataset should be cached.
        self._caching_default = self.config.get("default", False)

        # Expiration mode. Read from config but allow override from
        # the environment.
        expiration_mode = self.config.get(("expiry", "mode"))
        threshold = self.config.get(("expiry", "threshold"))

        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
        if external_mode and "=" in external_mode:
            expiration_mode, expiration_threshold = external_mode.split("=", 1)
            threshold = int(expiration_threshold)
        if expiration_mode is None:
            # Force to None to avoid confusion.
            threshold = None

        self._expiration_mode: Optional[str] = expiration_mode
        self._expiration_threshold: Optional[int] = threshold
        if self._expiration_threshold is None and self._expiration_mode is not None:
            raise ValueError("Cache expiration threshold must be set for expiration mode "
                             f"{self._expiration_mode}")

        log.debug("Cache configuration:\n- root: %s\n- expiration mode: %s",
                  self._cache_directory if self._cache_directory else "tmpdir",
                  f"{self._expiration_mode}={self._expiration_threshold}"
                  if self._expiration_mode else "disabled")

        # Files in cache, indexed by path within the cache directory.
        self._cache_entries = CacheRegistry()

    @property
    def cache_directory(self) -> ButlerURI:
        if self._cache_directory is None:
            # Create on demand.
            self._cache_directory = ButlerURI(tempfile.mkdtemp(prefix="butler-"), forceDirectory=True,
                                              isTemporary=True)
            log.debug("Creating temporary cache directory at %s", self._cache_directory)
            # Remove when we no longer need it.
            atexit.register(remove_cache_directory, self._cache_directory.ospath)
        return self._cache_directory

    @property
    def _temp_exempt_directory(self) -> ButlerURI:
        """Return the directory in which to store temporary cache files that
        should not be expired.
        """
        return self.cache_directory.join(self._temp_exemption_prefix)

    @property
    def cache_size(self) -> int:
        return self._cache_entries.cache_size

    @property
    def file_count(self) -> int:
        return len(self._cache_entries)

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        # Docstring inherited
        matchName: Union[LookupKey, str] = "{} (via default)".format(entity)
        should_cache = self._caching_default

        for key in entity._lookupNames():
            if key in self._lut:
                should_cache = bool(self._lut[key])
                matchName = key
                break

        if not isinstance(should_cache, bool):
            raise TypeError(
                f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
            )

        log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
        return should_cache

    def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ButlerURI:
        """Construct the name to use for this dataset in the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to look up in or write to the cache.
        extension : `str`
            File extension to use for this file. Should include the
            leading "``.``".

        Returns
        -------
        uri : `ButlerURI`
            URI to use for this dataset in the cache.
        """
        return _construct_cache_path(self.cache_directory, ref, extension)

    def move_to_cache(self, uri: ButlerURI, ref: DatasetRef) -> Optional[ButlerURI]:
        # Docstring inherited
        if ref.id is None:
            raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})")

        if not self.should_be_cached(ref):
            return None

        # Write the file using the id of the dataset ref and the file
        # extension.
        cached_location = self._construct_cache_name(ref, uri.getExtension())

        # Run cache expiry to ensure that we have room for this
        # item.
        self._expire_cache()

        # Move into the cache. Given that multiple processes might be
        # sharing a single cache directory, and the file we need might have
        # been copied in whilst we were checking, allow overwrite without
        # complaint. Even for a private cache directory it is possible that
        # a second butler in a subprocess could be writing to it.
        cached_location.transfer_from(uri, transfer="move", overwrite=True)
        log.debug("Cached dataset %s to %s", ref, cached_location)

        self._register_cache_entry(cached_location)

        return cached_location

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ButlerURI]]:
        # Docstring inherited
        # Short circuit this if the cache directory has not been created yet.
        if self._cache_directory is None:
            yield None
            return

        cached_location = self._construct_cache_name(ref, extension)
        if cached_location.exists():
            log.debug("Found cached file %s for dataset %s.", cached_location, ref)

            # The cached file could be removed by another process doing
            # cache expiration so we need to protect against that by making
            # a copy in a different tree. Use hardlinks to ensure that
            # we either have the cached file or we don't. This is robust
            # against race conditions that can be caused by using soft links
            # and the other end of the link being deleted just after it
            # is created.
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"
            temp_location: Optional[ButlerURI] = self._temp_exempt_directory.join(path_in_cache)
            try:
                if temp_location is not None:
                    temp_location.transfer_from(cached_location, transfer="hardlink")
            except Exception:
                # Any failure will be treated as if the file was not
                # in the cache. Yielding the original cache location
                # is too dangerous.
                temp_location = None

            try:
                log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
                yield temp_location
            finally:
                try:
                    if temp_location:
                        temp_location.remove()
                except FileNotFoundError:
                    pass
            return

        log.debug("Dataset %s not found in cache.", ref)
        yield None
        return

    def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        # Docstring inherited.

        # Stop early if there are no cache entries anyhow.
        if len(self._cache_entries) == 0:
            return

        if isinstance(refs, DatasetRef):
            refs = [refs]

        # Create a set of all the IDs.
        all_ids = {ref.getCheckedId() for ref in refs}

        keys_to_remove = []
        for key, entry in self._cache_entries.items():
            if entry.ref in all_ids:
                keys_to_remove.append(key)
        self._remove_from_cache(keys_to_remove)

    def _register_cache_entry(self, cached_location: ButlerURI, can_exist: bool = False) -> str:
        """Record the file in the cache registry.

        Parameters
        ----------
        cached_location : `ButlerURI`
            Location of the file to be registered.
        can_exist : `bool`, optional
            If `True` the item being registered can already be listed.
            This can allow a cache refresh to run without checking the
            file again. If `False` it is an error for the registry to
            already know about this file.

        Returns
        -------
        cache_key : `str`
            The key used in the registry for this file.
        """
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache is None:
            raise ValueError(f"Can not register cached file {cached_location} that is not within"
                             f" the cache directory at {self.cache_directory}.")
        if path_in_cache in self._cache_entries:
            if can_exist:
                return path_in_cache
            else:
                raise ValueError(f"Cached file {cached_location} is already known to the registry"
                                 " but this was expected to be a new file.")
        details = CacheEntry.from_file(cached_location, root=self.cache_directory)
        self._cache_entries[path_in_cache] = details
        return path_in_cache

    def scan_cache(self) -> None:
        """Scan the cache directory and record information about files."""
        found = set()
        for file in ButlerURI.findFileResources([self.cache_directory]):
            assert isinstance(file, ButlerURI), "Unexpectedly did not get ButlerURI from iterator"

            # Skip any that are found in an exempt part of the hierarchy
            # since they should not be part of the registry.
            if file.relative_to(self._temp_exempt_directory) is not None:
                continue

            path_in_cache = self._register_cache_entry(file, can_exist=True)
            found.add(path_in_cache)

        # Find any files that were recorded in the cache but are no longer
        # on disk. (Something else cleared them out?)
        known_to_cache = set(self._cache_entries)
        missing = known_to_cache - found

        if missing:
            log.debug("Entries no longer on disk but thought to be in cache and so removed: %s",
                      ",".join(missing))
            for path_in_cache in missing:
                self._cache_entries.pop(path_in_cache)

    def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
        """Remove the specified cache entries from cache.

        Parameters
        ----------
        cache_entries : iterable of `str`
            The entries to remove from the cache. The values are the path
            within the cache.
        """
        for entry in cache_entries:
            path = self.cache_directory.join(entry)

            self._cache_entries.pop(entry)
            log.debug("Removing file from cache: %s", path)
            try:
                path.remove()
            except FileNotFoundError:
                pass

    def _expire_cache(self) -> None:
        """Expire the files in the cache.

        Notes
        -----
        The expiration modes are defined by the config or can be overridden.
        Available options:

        * ``files``: Number of files.
        * ``datasets``: Number of datasets.
        * ``size``: Total size of files.
        * ``age``: Age of files.

        The first three remove the oldest entries first.
        Number of files is complicated by the possibility of disassembled
        composites where 10 small files can be created for each dataset.

        Additionally, there is a use case for an external user to explicitly
        state the dataset refs that should be cached and then when to
        remove them, overriding any global configuration.
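
        For illustration, an expiry configuration equivalent to the following
        mapping (normally supplied through the datastore cache configuration)
        would keep at most 20 datasets::

            {"expiry": {"mode": "datasets", "threshold": 20}}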

        """
        if self._expiration_mode is None:
            # Expiration has been disabled.
            return

        # mypy can't be sure we have set a threshold properly
        if self._expiration_threshold is None:
            log.warning("Requesting cache expiry of mode %s but no threshold set in config.",
                        self._expiration_mode)
            return

        # Sync up cache. There is no file locking involved so for a shared
        # cache multiple processes may be racing to delete files. Deleting
        # a file that no longer exists is not an error.
        self.scan_cache()

        if self._expiration_mode == "files":
            n_files = len(self._cache_entries)
            n_over = n_files - self._expiration_threshold
            if n_over > 0:
                sorted_keys = self._sort_cache()
                keys_to_remove = sorted_keys[:n_over]
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "datasets":
            # Count the datasets, in ascending timestamp order,
            # so that oldest turn up first.
            datasets = defaultdict(list)
            for key in self._sort_cache():
                entry = self._cache_entries[key]
                datasets[entry.ref].append(key)

            n_datasets = len(datasets)
            n_over = n_datasets - self._expiration_threshold
            if n_over > 0:
                # Keys will be read out in insert order which
                # will be date order so oldest ones are removed.
                ref_ids = list(datasets.keys())[:n_over]
                keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "size":
            if self.cache_size > self._expiration_threshold:
                for key in self._sort_cache():
                    self._remove_from_cache([key])
                    if self.cache_size <= self._expiration_threshold:
                        break
            return

        if self._expiration_mode == "age":
            now = datetime.datetime.utcnow()
            for key in self._sort_cache():
                delta = now - self._cache_entries[key].ctime
                if delta.seconds > self._expiration_threshold:
                    self._remove_from_cache([key])
                else:
                    # We're already in date order.
                    break
            return

        raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")

    def _sort_cache(self) -> List[str]:
        """Sort the cache entries by time and return the sorted keys.

        Returns
        -------
        sorted : `list` of `str`
            Keys into the cache, sorted by time with oldest first.
        """

        def sort_by_time(key: str) -> datetime.datetime:
            """Sorter key function using cache entry details."""
            return self._cache_entries[key].ctime

        return sorted(self._cache_entries, key=sort_by_time)

    def __str__(self) -> str:
        cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
        return f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," \
               f"default={self._caching_default}) " \
               f"n_files={self.file_count}, n_bytes={self.cache_size}"


class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
    """A variant of the datastore cache where no cache is enabled.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig],
                 universe: DimensionUniverse):
        return

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        Always returns `False`.
        """
        return False

    def move_to_cache(self, uri: ButlerURI, ref: DatasetRef) -> Optional[ButlerURI]:
        """Move dataset to cache but always refuses and returns `None`."""
        return None

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ButlerURI]]:
        """Look for a dataset in the cache and return its location.

        Never finds a file.
        """
        yield None

    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove datasets from cache.

        Always does nothing.
        """
        return

    def __str__(self) -> str:
        return f"{type(self).__name__}()"