Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 25%

365 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Cache management for a datastore."""

__all__ = (
    "AbstractDatastoreCacheManager",
    "DatastoreDisabledCacheManager",
    "DatastoreCacheManager",
    "DatastoreCacheManagerConfig",
)

import atexit
import contextlib
import datetime
import itertools
import logging
import os
import shutil
import tempfile
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import (
    TYPE_CHECKING,
    Dict,
    ItemsView,
    Iterable,
    Iterator,
    KeysView,
    List,
    Optional,
    Union,
    ValuesView,
)

from lsst.resources import ResourcePath
from pydantic import BaseModel, PrivateAttr

from .config import ConfigSubset
from .configSupport import processLookupConfigs
from .datasets import DatasetId, DatasetRef

if TYPE_CHECKING:
    from .configSupport import LookupKey
    from .datasets import DatasetType
    from .dimensions import DimensionUniverse
    from .storageClass import StorageClass

log = logging.getLogger(__name__)


def remove_cache_directory(directory: str) -> None:
    """Remove the specified directory and all its contents."""
    log.debug("Removing temporary cache directory %s", directory)
    shutil.rmtree(directory, ignore_errors=True)


def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath:
    """Construct the full path to use for this dataset in the cache.

    Parameters
    ----------
    root : `lsst.resources.ResourcePath`
        Root of the cache directory.
    ref : `DatasetRef`
        The dataset to look up in or write to the cache.
    extension : `str`
        File extension to use for this file. Should include the
        leading "``.``".

    Returns
    -------
    uri : `lsst.resources.ResourcePath`
        URI to use for this dataset in the cache.
    """
    # Dataset type component is needed in the name if composite
    # disassembly is happening since the ID is shared for all components.
    component = ref.datasetType.component()
    component = f"_{component}" if component else ""
    return root.join(f"{ref.id}{component}{extension}")


def _parse_cache_name(cached_location: str) -> Dict[str, Optional[str]]:
    """For a given cache name, return its component parts.

    Changes to ``_construct_cache_path()`` should be reflected here.

    Parameters
    ----------
    cached_location : `str`
        The name of the file within the cache.

    Returns
    -------
    parsed : `dict` of `str`, `str`
        Parsed components of the file. These include:
        - "id": The dataset ID,
        - "component": The name of the component (can be `None`),
        - "extension": File extension (can be `None`).
    """
    # Assume the first dot starts the extension, so multi-part
    # extensions such as .fits.gz are preserved.
    root_ext = cached_location.split(".", maxsplit=1)
    root = root_ext.pop(0)
    ext = "." + root_ext.pop(0) if root_ext else None

    parts = root.split("_")
    id_ = parts.pop(0)
    component = parts.pop(0) if parts else None
    return {"id": id_, "component": component, "extension": ext}


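# Example of the naming convention implemented by the two helpers above
# (an illustrative sketch; the dataset ID and component values are invented):
#
#     >>> _parse_cache_name("1042_wcs.fits.gz")
#     {'id': '1042', 'component': 'wcs', 'extension': '.fits.gz'}
#     >>> _parse_cache_name("1042.json")
#     {'id': '1042', 'component': None, 'extension': '.json'}
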

class CacheEntry(BaseModel):
    """Represent an entry in the cache."""

    name: str
    """Name of the file."""

    size: int
    """Size of the file in bytes."""

    ctime: datetime.datetime
    """Creation time of the file."""

    ref: DatasetId
    """ID of this dataset."""

    component: Optional[str]
    """Component for this disassembled composite (optional)."""

    @classmethod
    def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry:
        """Construct an object from a file name.

        Parameters
        ----------
        file : `lsst.resources.ResourcePath`
            Path to the file.
        root : `lsst.resources.ResourcePath`
            Cache root directory.
        """
        file_in_cache = file.relative_to(root)
        if file_in_cache is None:
            raise ValueError(f"Supplied file {file} is not inside root {root}")
        parts = _parse_cache_name(file_in_cache)

        stat = os.stat(file.ospath)
        return cls(
            name=file_in_cache,
            size=stat.st_size,
            ref=parts["id"],
            component=parts["component"],
            ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime),
        )


class _MarkerEntry(CacheEntry):
    pass


class CacheRegistry(BaseModel):
    """Collection of cache entries."""

    _size: int = PrivateAttr(0)
    """Size of the cache."""

    _entries: Dict[str, CacheEntry] = PrivateAttr({})
    """Internal collection of cache entries."""

    _ref_map: Dict[DatasetId, List[str]] = PrivateAttr({})
    """Mapping of DatasetID to corresponding keys in cache registry."""

    @property
    def cache_size(self) -> int:
        return self._size

    def __getitem__(self, key: str) -> CacheEntry:
        return self._entries[key]

    def __setitem__(self, key: str, entry: CacheEntry) -> None:
        self._size += entry.size
        self._entries[key] = entry

        # Update the mapping from ref to path.
        if entry.ref not in self._ref_map:
            self._ref_map[entry.ref] = []
        self._ref_map[entry.ref].append(key)

    def __delitem__(self, key: str) -> None:
        entry = self._entries.pop(key)
        self._decrement(entry)
        self._ref_map[entry.ref].remove(key)

    def _decrement(self, entry: Optional[CacheEntry]) -> None:
        if entry:
            self._size -= entry.size
            if self._size < 0:
                log.warning("Cache size has gone negative. Inconsistent cache records...")
                self._size = 0

    def __contains__(self, key: str) -> bool:
        return key in self._entries

    def __len__(self) -> int:
        return len(self._entries)

    def __iter__(self) -> Iterator[str]:  # type: ignore
        return iter(self._entries)

    def keys(self) -> KeysView[str]:
        return self._entries.keys()

    def values(self) -> ValuesView[CacheEntry]:
        return self._entries.values()

    def items(self) -> ItemsView[str, CacheEntry]:
        return self._entries.items()

    # A private marker to indicate that pop() should raise if no default
    # is given.
    __marker = _MarkerEntry(name="marker", size=0, ref=0, ctime=datetime.datetime.utcfromtimestamp(0))

    def pop(self, key: str, default: Optional[CacheEntry] = __marker) -> Optional[CacheEntry]:
        # The marker for dict.pop is not the same as our marker.
        if default is self.__marker:
            entry = self._entries.pop(key)
        else:
            entry = self._entries.pop(key, self.__marker)
            # Should not attempt to correct for this entry being removed
            # if we got the default value.
            if entry is self.__marker:
                return default

        self._decrement(entry)
        # The default entry given to this method may not even be in the cache.
        if entry and entry.ref in self._ref_map:
            keys = self._ref_map[entry.ref]
            if key in keys:
                keys.remove(key)
        return entry

    def get_dataset_keys(self, dataset_id: Optional[DatasetId]) -> Optional[List[str]]:
        """Retrieve all keys associated with the given dataset ID.

        Parameters
        ----------
        dataset_id : `DatasetId` or `None`
            The dataset ID to look up. Returns `None` if the ID is `None`.

        Returns
        -------
        keys : `list` [`str`]
            Keys associated with this dataset. These keys can be used to
            look up the cache entry information in the `CacheRegistry`.
            Returns `None` if the dataset is not known to the cache.
        """
        if dataset_id not in self._ref_map:
            return None
        keys = self._ref_map[dataset_id]
        if not keys:
            return None
        return keys


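# A minimal sketch of the bookkeeping CacheRegistry performs (illustrative
# only; the entry values below are invented):
#
#     registry = CacheRegistry()
#     registry["1042_wcs.fits"] = CacheEntry(
#         name="1042_wcs.fits", size=1024, ref=1042, component="wcs",
#         ctime=datetime.datetime.utcnow(),
#     )
#     registry.cache_size                   # -> 1024, updated by __setitem__
#     registry.get_dataset_keys(1042)       # -> ["1042_wcs.fits"]
#     registry.pop("1042_wcs.fits", None)   # removes the entry, decrements size
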

class DatastoreCacheManagerConfig(ConfigSubset):
    """Configuration information for `DatastoreCacheManager`."""

    component = "cached"
    requiredKeys = ("cacheable",)


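# An illustrative sketch (YAML-like, shown here as a comment) of the keys
# consumed by DatastoreCacheManager.__init__ below. The layout mirrors the
# keys it reads ("root", "default", "expiry.mode", "expiry.threshold",
# "cacheable"); the values and the storage class name are invented and are
# not defaults of this package:
#
#     cached:
#       root: null            # optional persistent cache directory
#       default: false        # fallback answer for should_be_cached()
#       expiry:
#         mode: datasets      # one of: files, datasets, size, age
#         threshold: 5
#       cacheable:            # required key; per-entity lookup table
#         ExampleStorageClass: true
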

class AbstractDatastoreCacheManager(ABC):
    """An abstract base class for managing caching in a Datastore.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    @property
    def cache_size(self) -> int:
        """Size of the cache in bytes."""
        return 0

    @property
    def file_count(self) -> int:
        """Return number of cached files tracked by registry."""
        return 0

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        if not isinstance(config, DatastoreCacheManagerConfig):
            config = DatastoreCacheManagerConfig(config)
        assert isinstance(config, DatastoreCacheManagerConfig)
        self.config = config

    @abstractmethod
    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        This is relevant when reading or writing.

        Parameters
        ----------
        entity : `StorageClass` or `DatasetType` or `DatasetRef`
            Thing to test against the configuration. The ``name`` property
            is used to determine a match. A `DatasetType` will first check
            its name, before checking its `StorageClass`. If there are no
            matches the default will be returned.

        Returns
        -------
        should_cache : `bool`
            Returns `True` if the dataset should be cached; `False` otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.
        """
        raise NotImplementedError()

    @abstractmethod
    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move a file to the cache.

        Move the given file into the cache, using the supplied DatasetRef
        for naming. A call is made to `should_be_cached()`; if the
        DatasetRef is not accepted, `None` will be returned.

        Cache expiry can occur during this operation.

        Parameters
        ----------
        uri : `lsst.resources.ResourcePath`
            Location of the file to be relocated to the cache. Will be moved.
        ref : `DatasetRef`
            Ref associated with this file. Will be used to determine the name
            of the file within the cache.

        Returns
        -------
        new : `lsst.resources.ResourcePath` or `None`
            URI to the file within the cache, or `None` if the dataset
            was not accepted by the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to locate in the cache.
        extension : `str`
            File extension expected. Should include the leading "``.``".

        Yields
        ------
        uri : `lsst.resources.ResourcePath` or `None`
            The URI to the cached file, or `None` if the file has not been
            cached.

        Notes
        -----
        Should be used as a context manager in order to prevent this
        file from being removed from the cache for that context.
        """
        raise NotImplementedError()

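    # Example usage of find_in_cache() as a context manager (an illustrative
    # sketch; ``cache_manager``, ``ref`` and ``read_local_file`` are assumed
    # names, not part of this module):
    #
    #     with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
    #         if cached_uri is not None:
    #             # The cached copy is protected from expiration while
    #             # inside this block.
    #             data = read_local_file(cached_uri.ospath)
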

    @abstractmethod
    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove the specified datasets from the cache.

        It is not an error for these datasets to be missing from the cache.

        Parameters
        ----------
        ref : `DatasetRef` or iterable of `DatasetRef`
            The datasets to remove from the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    def __str__(self) -> str:
        raise NotImplementedError()


class DatastoreCacheManager(AbstractDatastoreCacheManager):
    """A class for managing caching in a Datastore using local files.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.

    Notes
    -----
    Two environment variables can be used to override the cache directory
    and expiration configuration:

    * ``$DAF_BUTLER_CACHE_DIRECTORY``
    * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``

    The expiration mode should take the form ``mode=threshold``; for
    example, to configure expiration to limit the cache directory to
    5 datasets the value would be ``datasets=5``.
    """

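    # Example overrides via the environment (an illustrative sketch; the
    # directory path and threshold values are invented):
    #
    #     export DAF_BUTLER_CACHE_DIRECTORY=/scratch/butler-cache
    #     export DAF_BUTLER_CACHE_EXPIRATION_MODE=datasets=5
    #
    # or, from Python before the manager is constructed:
    #
    #     os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "size=1000000000"
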

    _temp_exemption_prefix = "exempt/"

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        super().__init__(config, universe)

        # Set cache directory if it pre-exists, else defer creation until
        # requested. Allow external override from environment.
        root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")
        self._cache_directory = (
            ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None
        )

        if self._cache_directory:
            if not self._cache_directory.isLocal:
                raise ValueError(
                    f"Cache directory must be on a local file system. Got: {self._cache_directory}"
                )
            # Ensure that the cache directory is created. We assume that
            # someone specifying a permanent cache directory will be expecting
            # it to always be there. This will also trigger an error
            # early rather than waiting until the cache is needed.
            self._cache_directory.mkdir()

        # Calculate the caching lookup table.
        self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)

        # Default decision for whether a dataset should be cached.
        self._caching_default = self.config.get("default", False)

        # Expiration mode. Read from config but allow override from
        # the environment.
        expiration_mode = self.config.get(("expiry", "mode"))
        threshold = self.config.get(("expiry", "threshold"))

        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
        if external_mode and "=" in external_mode:
            expiration_mode, expiration_threshold = external_mode.split("=", 1)
            threshold = int(expiration_threshold)
        if expiration_mode is None:
            # Force to None to avoid confusion.
            threshold = None

        self._expiration_mode: Optional[str] = expiration_mode
        self._expiration_threshold: Optional[int] = threshold
        if self._expiration_threshold is None and self._expiration_mode is not None:
            raise ValueError(
                f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
            )

        log.debug(
            "Cache configuration:\n- root: %s\n- expiration mode: %s",
            self._cache_directory if self._cache_directory else "tmpdir",
            f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
        )

        # Files in cache, indexed by path within the cache directory.
        self._cache_entries = CacheRegistry()


    @property
    def cache_directory(self) -> ResourcePath:
        if self._cache_directory is None:
            # Create on demand.
            self._cache_directory = ResourcePath(
                tempfile.mkdtemp(prefix="butler-"), forceDirectory=True, isTemporary=True
            )
            log.debug("Creating temporary cache directory at %s", self._cache_directory)
            # Remove when we no longer need it.
            atexit.register(remove_cache_directory, self._cache_directory.ospath)
        return self._cache_directory

    @property
    def _temp_exempt_directory(self) -> ResourcePath:
        """Return the directory in which to store temporary cache files that
        should not be expired.
        """
        return self.cache_directory.join(self._temp_exemption_prefix)

    @property
    def cache_size(self) -> int:
        return self._cache_entries.cache_size

    @property
    def file_count(self) -> int:
        return len(self._cache_entries)

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        # Docstring inherited
        matchName: Union[LookupKey, str] = "{} (via default)".format(entity)
        should_cache = self._caching_default

        for key in entity._lookupNames():
            if key in self._lut:
                should_cache = bool(self._lut[key])
                matchName = key
                break

        if not isinstance(should_cache, bool):
            raise TypeError(
                f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
            )

        log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
        return should_cache


    def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath:
        """Construct the name to use for this dataset in the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to look up in or write to the cache.
        extension : `str`
            File extension to use for this file. Should include the
            leading "``.``".

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI to use for this dataset in the cache.
        """
        return _construct_cache_path(self.cache_directory, ref, extension)

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        # Docstring inherited
        if ref.id is None:
            raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})")

        if not self.should_be_cached(ref):
            return None

        # Write the file using the id of the dataset ref and the file
        # extension.
        cached_location = self._construct_cache_name(ref, uri.getExtension())

        # Run cache expiry to ensure that we have room for this
        # item.
        self._expire_cache()

        # Move into the cache. Given that multiple processes might be
        # sharing a single cache directory, and the file we need might have
        # been copied in whilst we were checking, allow overwrite without
        # complaint. Even for a private cache directory it is possible that
        # a second butler in a subprocess could be writing to it.
        cached_location.transfer_from(uri, transfer="move", overwrite=True)
        log.debug("Cached dataset %s to %s", ref, cached_location)

        self._register_cache_entry(cached_location)

        return cached_location

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        # Docstring inherited
        # Short circuit this if the cache directory has not been created yet.
        if self._cache_directory is None:
            yield None
            return

        cached_location = self._construct_cache_name(ref, extension)
        if cached_location.exists():
            log.debug("Found cached file %s for dataset %s.", cached_location, ref)

            # The cached file could be removed by another process doing
            # cache expiration so we need to protect against that by making
            # a copy in a different tree. Use hardlinks to ensure that
            # we either have the cached file or we don't. This is robust
            # against race conditions that can be caused by using soft links
            # and the other end of the link being deleted just after it
            # is created.
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"
            temp_location: Optional[ResourcePath] = self._temp_exempt_directory.join(path_in_cache)
            try:
                if temp_location is not None:
                    temp_location.transfer_from(cached_location, transfer="hardlink")
            except Exception as e:
                log.debug("Detected error creating hardlink for dataset %s: %s", ref, e)
                # Any failure will be treated as if the file was not
                # in the cache. Yielding the original cache location
                # is too dangerous.
                temp_location = None

            try:
                log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
                yield temp_location
            finally:
                try:
                    if temp_location:
                        temp_location.remove()
                except FileNotFoundError:
                    pass
            return

        log.debug("Dataset %s not found in cache.", ref)
        yield None
        return


    def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        # Docstring inherited.

        # Stop early if there are no cache entries anyhow.
        if len(self._cache_entries) == 0:
            return

        if isinstance(refs, DatasetRef):
            refs = [refs]

        # Create a set of all the IDs
        all_ids = {ref.getCheckedId() for ref in refs}

        keys_to_remove = []
        for key, entry in self._cache_entries.items():
            if entry.ref in all_ids:
                keys_to_remove.append(key)
        self._remove_from_cache(keys_to_remove)

    def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str:
        """Record the file in the cache registry.

        Parameters
        ----------
        cached_location : `lsst.resources.ResourcePath`
            Location of the file to be registered.
        can_exist : `bool`, optional
            If `True` the item being registered can already be listed.
            This can allow a cache refresh to run without checking the
            file again. If `False` it is an error for the registry to
            already know about this file.

        Returns
        -------
        cache_key : `str`
            The key used in the registry for this file.
        """
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache is None:
            raise ValueError(
                f"Can not register cached file {cached_location} that is not within"
                f" the cache directory at {self.cache_directory}."
            )
        if path_in_cache in self._cache_entries:
            if can_exist:
                return path_in_cache
            else:
                raise ValueError(
                    f"Cached file {cached_location} is already known to the registry"
                    " but this was expected to be a new file."
                )
        details = CacheEntry.from_file(cached_location, root=self.cache_directory)
        self._cache_entries[path_in_cache] = details
        return path_in_cache

    def scan_cache(self) -> None:
        """Scan the cache directory and record information about files."""
        found = set()
        for file in ResourcePath.findFileResources([self.cache_directory]):
            assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"

            # Skip any that are found in an exempt part of the hierarchy
            # since they should not be part of the registry.
            if file.relative_to(self._temp_exempt_directory) is not None:
                continue

            path_in_cache = self._register_cache_entry(file, can_exist=True)
            found.add(path_in_cache)

        # Find any files that were recorded in the cache but are no longer
        # on disk. (something else cleared them out?)
        known_to_cache = set(self._cache_entries)
        missing = known_to_cache - found

        if missing:
            log.debug(
                "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing)
            )
            for path_in_cache in missing:
                self._cache_entries.pop(path_in_cache, None)


    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise. If the dataset refers to a component and
            an extension is given then only that component is checked.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.

        This method does not force the cache to be re-scanned and so can miss
        cached datasets that have recently been written by other processes.
        """
        if self._cache_directory is None:
            return False
        if self.file_count == 0:
            return False

        if extension is None:
            # Look solely for matching dataset ref ID and not specific
            # components.
            cached_paths = self._cache_entries.get_dataset_keys(ref.id)
            return True if cached_paths else False
        else:
            # Extension is known so we can do an explicit look up for the
            # cache entry.
            cached_location = self._construct_cache_name(ref, extension)
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None  # For mypy
            return path_in_cache in self._cache_entries


    def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
        """Remove the specified cache entries from cache.

        Parameters
        ----------
        cache_entries : iterable of `str`
            The entries to remove from the cache. The values are the path
            within the cache.
        """
        for entry in cache_entries:
            path = self.cache_directory.join(entry)

            self._cache_entries.pop(entry, None)
            log.debug("Removing file from cache: %s", path)
            try:
                path.remove()
            except FileNotFoundError:
                pass

    def _expire_cache(self) -> None:
        """Expire the files in the cache.

        Notes
        -----
        The expiration modes are defined by the config or can be overridden.
        Available options:

        * ``files``: Number of files.
        * ``datasets``: Number of datasets.
        * ``size``: Total size of files.
        * ``age``: Age of files.

        The first three remove the oldest entries first when the threshold
        is exceeded. Number of files is complicated by the possibility of
        disassembled composites where 10 small files can be created for
        each dataset.

        Additionally there is a use case for an external user to explicitly
        state the dataset refs that should be cached and then when to
        remove them, overriding any global configuration.
        """
        if self._expiration_mode is None:
            # Expiration has been disabled.
            return

        # mypy can't be sure we have set a threshold properly
        if self._expiration_threshold is None:
            log.warning(
                "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode
            )
            return

        # Sync up cache. There is no file locking involved so for a shared
        # cache multiple processes may be racing to delete files. Deleting
        # a file that no longer exists is not an error.
        self.scan_cache()

        if self._expiration_mode == "files":
            n_files = len(self._cache_entries)
            n_over = n_files - self._expiration_threshold
            if n_over > 0:
                sorted_keys = self._sort_cache()
                keys_to_remove = sorted_keys[:n_over]
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "datasets":
            # Count the datasets, in ascending timestamp order,
            # so that oldest turn up first.
            datasets = defaultdict(list)
            for key in self._sort_cache():
                entry = self._cache_entries[key]
                datasets[entry.ref].append(key)

            n_datasets = len(datasets)
            n_over = n_datasets - self._expiration_threshold
            if n_over > 0:
                # Keys will be read out in insert order which
                # will be date order so oldest ones are removed.
                ref_ids = list(datasets.keys())[:n_over]
                keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "size":
            if self.cache_size > self._expiration_threshold:
                for key in self._sort_cache():
                    self._remove_from_cache([key])
                    if self.cache_size <= self._expiration_threshold:
                        break
            return

        if self._expiration_mode == "age":
            now = datetime.datetime.utcnow()
            for key in self._sort_cache():
                delta = now - self._cache_entries[key].ctime
                if delta.seconds > self._expiration_threshold:
                    self._remove_from_cache([key])
                else:
                    # We're already in date order.
                    break
            return

        raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")

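    # Illustrative expiry settings for the modes handled above (invented
    # values; the units follow from the comparisons in _expire_cache):
    #
    #     files=1000        keep at most 1000 files
    #     datasets=5        keep at most 5 datasets
    #     size=1000000000   keep the total size at or below ~1 GB (bytes)
    #     age=3600          expire entries older than 3600 seconds
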

    def _sort_cache(self) -> List[str]:
        """Sort the cache entries by time and return the sorted keys.

        Returns
        -------
        sorted : `list` of `str`
            Keys into the cache, sorted by time with oldest first.
        """

        def sort_by_time(key: str) -> datetime.datetime:
            """Sorter key function using cache entry details."""
            return self._cache_entries[key].ctime

        return sorted(self._cache_entries, key=sort_by_time)

    def __str__(self) -> str:
        cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
        return (
            f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
            f"default={self._caching_default}) "
            f"n_files={self.file_count}, n_bytes={self.cache_size}"
        )


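# A minimal end-to-end sketch of how a datastore might drive this class
# (illustrative only; ``config_file``, ``universe``, ``local_uri``, ``ref``
# and ``use_local_copy`` are assumed to come from the calling datastore):
#
#     cache_manager = DatastoreCacheManager(config_file, universe=universe)
#     cached = cache_manager.move_to_cache(local_uri, ref)
#     ...
#     with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
#         if cached_uri is not None:
#             use_local_copy(cached_uri)
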

class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
    """A variant of the datastore cache where no cache is enabled.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        return

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        Always returns `False`.
        """
        return False

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move dataset to cache but always refuse and return `None`."""
        return None

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Never finds a file.
        """
        yield None

    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove datasets from cache.

        Always does nothing.
        """
        return

    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if a dataset is known to the cache.

        Always returns `False`.
        """
        return False

    def __str__(self) -> str:
        return f"{type(self).__name__}()"