Coverage for python/lsst/daf/butler/datastore/cache_manager.py: 28%

393 statements  

coverage.py v7.4.1, created at 2024-02-01 11:20 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Cache management for a datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "AbstractDatastoreCacheManager", 

34 "DatastoreDisabledCacheManager", 

35 "DatastoreCacheManager", 

36 "DatastoreCacheManagerConfig", 

37) 

38 

39import atexit 

40import contextlib 

41import datetime 

42import itertools 

43import logging 

44import os 

45import shutil 

46import tempfile 

47import uuid 

48from abc import ABC, abstractmethod 

49from collections import defaultdict 

50from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView 

51from random import Random 

52from typing import TYPE_CHECKING 

53 

54from lsst.resources import ResourcePath 

55from pydantic import BaseModel, PrivateAttr 

56 

57from .._config import ConfigSubset 

58from .._config_support import processLookupConfigs 

59from .._dataset_ref import DatasetId, DatasetRef 

60 

61if TYPE_CHECKING: 

62 from .._config_support import LookupKey 

63 from .._dataset_type import DatasetType 

64 from .._storage_class import StorageClass 

65 from ..dimensions import DimensionUniverse 

66 

67log = logging.getLogger(__name__) 

68 

69 

70def remove_cache_directory(directory: str) -> None: 

71 """Remove the specified directory and all its contents. 

72 

73 Parameters 

74 ---------- 

75 directory : `str` 

76 Directory to remove. 

77 """ 

78 log.debug("Removing temporary cache directory %s", directory) 

79 shutil.rmtree(directory, ignore_errors=True) 

80 

81 

82def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath: 

83 """Construct the full path to use for this dataset in the cache. 

84 

85 Parameters 

86 ---------- 

87 root : `lsst.resources.ResourcePath` 

88 The root of the cache. 

89 ref : `DatasetRef` 

90 The dataset to look up in or write to the cache. 

91 extension : `str` 

92 File extension to use for this file. Should include the 

93 leading "``.``". 

94 

95 Returns 

96 ------- 

97 uri : `lsst.resources.ResourcePath` 

98 URI to use for this dataset in the cache. 

99 """ 

100 # Dataset type component is needed in the name if composite 

101 # disassembly is happening since the ID is shared for all components. 

102 component = ref.datasetType.component() 

103 component = f"_{component}" if component else "" 

104 return root.join(f"{ref.id}{component}{extension}") 

105 

106 

107def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]: 

108 """For a given cache name, return its component parts. 

109 

110 Changes to ``_construct_cache_path()`` should be reflected here. 

111 

112 Parameters 

113 ---------- 

114 cached_location : `str` 

115 The name of the file within the cache. 

116 

117 Returns 

118 ------- 

119 id : `uuid.UUID` 

120 The dataset ID. 

121 component : `str` or `None` 

122 The name of the component, if present. 

123 extension : `str` or `None` 

124 The file extension, if present. 
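
Examples
--------
A minimal sketch of the naming convention produced by
``_construct_cache_path()`` and parsed here (the UUID and component
name are illustrative, not from real data):

>>> _parse_cache_name("00000000-0000-0000-0000-000000000000_wcs.fits.gz")
(UUID('00000000-0000-0000-0000-000000000000'), 'wcs', '.fits.gz')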

125 """ 

126 # Assume first dot is the extension and so allow .fits.gz 

127 root_ext = cached_location.split(".", maxsplit=1) 

128 root = root_ext.pop(0) 

129 ext = "." + root_ext.pop(0) if root_ext else None 

130 

131 parts = root.split("_") 

132 id_ = uuid.UUID(parts.pop(0)) 

133 component = parts.pop(0) if parts else None 

134 return id_, component, ext 

135 

136 

137class CacheEntry(BaseModel): 

138 """Represent an entry in the cache.""" 

139 

140 name: str 

141 """Name of the file.""" 

142 

143 size: int 

144 """Size of the file in bytes.""" 

145 

146 ctime: datetime.datetime 

147 """Creation time of the file.""" 

148 

149 ref: DatasetId 

150 """ID of this dataset.""" 

151 

152 component: str | None = None 

153 """Component for this disassembled composite (optional).""" 

154 

155 @classmethod 

156 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry: 

157 """Construct an object from a file name. 

158 

159 Parameters 

160 ---------- 

161 file : `lsst.resources.ResourcePath` 

162 Path to the file. 

163 root : `lsst.resources.ResourcePath` 

164 Cache root directory. 

165 """ 

166 file_in_cache = file.relative_to(root) 

167 if file_in_cache is None: 

168 raise ValueError(f"Supplied file {file} is not inside root {root}") 

169 id_, component, _ = _parse_cache_name(file_in_cache) 

170 

171 stat = os.stat(file.ospath) 

172 return cls.model_construct( 

173 name=file_in_cache, 

174 size=stat.st_size, 

175 ref=id_, 

176 component=component, 

177 ctime=datetime.datetime.fromtimestamp(stat.st_ctime, datetime.UTC), 

178 ) 

179 

180 

181class _MarkerEntry(CacheEntry): 

182 pass 

183 

184 

185class CacheRegistry(BaseModel): 

186 """Collection of cache entries.""" 

187 

188 _size: int = PrivateAttr(0) 

189 """Size of the cache.""" 

190 

191 _entries: dict[str, CacheEntry] = PrivateAttr({}) 

192 """Internal collection of cache entries.""" 

193 

194 _ref_map: dict[DatasetId, list[str]] = PrivateAttr({}) 

195 """Mapping of DatasetID to corresponding keys in cache registry.""" 

196 

197 @property 

198 def cache_size(self) -> int: 

199 return self._size 

200 

201 def __getitem__(self, key: str) -> CacheEntry: 

202 return self._entries[key] 

203 

204 def __setitem__(self, key: str, entry: CacheEntry) -> None: 

205 self._size += entry.size 

206 self._entries[key] = entry 

207 

208 # Update the mapping from ref to path. 

209 if entry.ref not in self._ref_map: 

210 self._ref_map[entry.ref] = [] 

211 self._ref_map[entry.ref].append(key) 

212 

213 def __delitem__(self, key: str) -> None: 

214 entry = self._entries.pop(key) 

215 self._decrement(entry) 

216 self._ref_map[entry.ref].remove(key) 

217 

218 def _decrement(self, entry: CacheEntry | None) -> None: 

219 if entry: 

220 self._size -= entry.size 

221 if self._size < 0: 

222 log.warning("Cache size has gone negative. Inconsistent cache records...") 

223 self._size = 0 

224 

225 def __contains__(self, key: str) -> bool: 

226 return key in self._entries 

227 

228 def __len__(self) -> int: 

229 return len(self._entries) 

230 

231 def __iter__(self) -> Iterator[str]: # type: ignore 

232 return iter(self._entries) 

233 

234 def keys(self) -> KeysView[str]: 

235 return self._entries.keys() 

236 

237 def values(self) -> ValuesView[CacheEntry]: 

238 return self._entries.values() 

239 

240 def items(self) -> ItemsView[str, CacheEntry]: 

241 return self._entries.items() 

242 

243 # A private marker to indicate that pop() should raise if no default 

244 # is given. 

245 __marker = _MarkerEntry( 

246 name="marker", 

247 size=0, 

248 ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), 

249 ctime=datetime.datetime.fromtimestamp(0, datetime.UTC), 

250 ) 

251 

252 def pop(self, key: str, default: CacheEntry | None = __marker) -> CacheEntry | None: 

253 # The marker for dict.pop is not the same as our marker. 

254 if default is self.__marker: 

255 entry = self._entries.pop(key) 

256 else: 

257 entry = self._entries.pop(key, self.__marker) 

258 # Should not attempt to correct for this entry being removed 

259 # if we got the default value. 

260 if entry is self.__marker: 

261 return default 

262 

263 self._decrement(entry) 

264 # The default entry given to this method may not even be in the cache. 

265 if entry and entry.ref in self._ref_map: 

266 keys = self._ref_map[entry.ref] 

267 if key in keys: 

268 keys.remove(key) 

269 return entry 

270 

271 def get_dataset_keys(self, dataset_id: DatasetId | None) -> list[str] | None: 

272 """Retrieve all keys associated with the given dataset ID. 

273 

274 Parameters 

275 ---------- 

276 dataset_id : `DatasetId` or `None` 

277 The dataset ID to look up. Returns `None` if the ID is `None`. 

278 

279 Returns 

280 ------- 

281 keys : `list` [`str`] 

282 Keys associated with this dataset. These keys can be used to lookup 

283 the cache entry information in the `CacheRegistry`. Returns 

284 `None` if the dataset is not known to the cache. 
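
Examples
--------
A hypothetical sketch (``registry`` is a populated `CacheRegistry` and
``dataset_id`` is an ID already known to it; both names are assumptions
made for illustration):

>>> for key in registry.get_dataset_keys(dataset_id) or []:
...     print(registry[key].size)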

285 """ 

286 if dataset_id not in self._ref_map: 

287 return None 

288 keys = self._ref_map[dataset_id] 

289 if not keys: 

290 return None 

291 return keys 

292 

293 

294class DatastoreCacheManagerConfig(ConfigSubset): 

295 """Configuration information for `DatastoreCacheManager`.""" 

296 

297 component = "cached" 

298 requiredKeys = ("cacheable",) 

299 

300 

301class AbstractDatastoreCacheManager(ABC): 

302 """An abstract base class for managing caching in a Datastore. 

303 

304 Parameters 

305 ---------- 

306 config : `str` or `DatastoreCacheManagerConfig` 

307 Configuration to control caching. 

308 universe : `DimensionUniverse` 

309 Set of all known dimensions, used to expand and validate any used 

310 in lookup keys. 

311 """ 

312 

313 @property 

314 def cache_size(self) -> int: 

315 """Size of the cache in bytes.""" 

316 return 0 

317 

318 @property 

319 def file_count(self) -> int: 

320 """Return number of cached files tracked by registry.""" 

321 return 0 

322 

323 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

324 if not isinstance(config, DatastoreCacheManagerConfig): 

325 config = DatastoreCacheManagerConfig(config) 

326 assert isinstance(config, DatastoreCacheManagerConfig) 

327 self.config = config 

328 

329 @abstractmethod 

330 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

331 """Indicate whether the entity should be added to the cache. 

332 

333 This is relevant when reading or writing. 

334 

335 Parameters 

336 ---------- 

337 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

338 Thing to test against the configuration. The ``name`` property 

339 is used to determine a match. A `DatasetType` will first check 

340 its name, before checking its `StorageClass`. If there are no 

341 matches the default will be returned. 

342 

343 Returns 

344 ------- 

345 should_cache : `bool` 

346 Returns `True` if the dataset should be cached; `False` otherwise. 
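
Examples
--------
A hypothetical sketch (``cache_manager`` and ``ref`` are assumed to
exist; this only illustrates the call, not any particular policy):

>>> if cache_manager.should_be_cached(ref):
...     log.debug("%s is configured as cacheable.", ref)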

347 """ 

348 raise NotImplementedError() 

349 

350 @abstractmethod 

351 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

352 """Report if the dataset is known to the cache. 

353 

354 Parameters 

355 ---------- 

356 ref : `DatasetRef` 

357 Dataset to check for in the cache. 

358 extension : `str`, optional 

359 File extension expected. Should include the leading "``.``". 

360 If `None` the extension is ignored and the dataset ID alone is 

361 used to check in the cache. The extension must be defined if 

362 a specific component is being checked. 

363 

364 Returns 

365 ------- 

366 known : `bool` 

367 Returns `True` if the dataset is currently known to the cache 

368 and `False` otherwise. 

369 

370 Notes 

371 ----- 

372 This method can only report if the dataset is known to the cache 

373 in this specific instant and does not indicate whether the file 

374 can be read from the cache later. `find_in_cache()` should be called 

375 if the cached file is to be used. 

376 """ 

377 raise NotImplementedError() 

378 

379 @abstractmethod 

380 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

381 """Move a file to the cache. 

382 

383 Move the given file into the cache, using the supplied DatasetRef 

384 for naming. A call is made to `should_be_cached()` and if the 

385 DatasetRef should not be accepted `None` will be returned. 

386 

387 Cache expiry can occur during this. 

388 

389 Parameters 

390 ---------- 

391 uri : `lsst.resources.ResourcePath` 

392 Location of the file to be relocated to the cache. Will be moved. 

393 ref : `DatasetRef` 

394 Ref associated with this file. Will be used to determine the name 

395 of the file within the cache. 

396 

397 Returns 

398 ------- 

399 new : `lsst.resources.ResourcePath` or `None` 

400 URI to the file within the cache, or `None` if the dataset 

401 was not accepted by the cache. 
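
Examples
--------
A hypothetical sketch (``cache_manager``, ``local_uri`` and ``ref`` are
assumed to exist; the file at ``local_uri`` is consumed on success):

>>> cached = cache_manager.move_to_cache(local_uri, ref)
>>> if cached is None:
...     log.debug("Dataset %s was not accepted by the cache.", ref)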

402 """ 

403 raise NotImplementedError() 

404 

405 @abstractmethod 

406 @contextlib.contextmanager 

407 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

408 """Look for a dataset in the cache and return its location. 

409 

410 Parameters 

411 ---------- 

412 ref : `DatasetRef` 

413 Dataset to locate in the cache. 

414 extension : `str` 

415 File extension expected. Should include the leading "``.``". 

416 

417 Yields 

418 ------ 

419 uri : `lsst.resources.ResourcePath` or `None` 

420 The URI to the cached file, or `None` if the file has not been 

421 cached. 

422 

423 Notes 

424 ----- 

425 Should be used as a context manager in order to prevent this 

426 file from being removed from the cache for that context. 
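
Examples
--------
A hypothetical sketch (``cache_manager`` and ``ref`` are assumed to
exist; ``.fits`` is an illustrative extension):

>>> with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
...     if cached_uri is not None:
...         payload = cached_uri.read()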

427 """ 

428 raise NotImplementedError() 

429 

430 @abstractmethod 

431 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None: 

432 """Remove the specified datasets from the cache. 

433 

434 It is not an error for these datasets to be missing from the cache. 

435 

436 Parameters 

437 ---------- 

438 ref : `DatasetRef` or iterable of `DatasetRef` 

439 The datasets to remove from the cache. 

440 """ 

441 raise NotImplementedError() 

442 

443 @abstractmethod 

444 def __str__(self) -> str: 

445 raise NotImplementedError() 

446 

447 

448class DatastoreCacheManager(AbstractDatastoreCacheManager): 

449 """A class for managing caching in a Datastore using local files. 

450 

451 Parameters 

452 ---------- 

453 config : `str` or `DatastoreCacheManagerConfig` 

454 Configuration to control caching. 

455 universe : `DimensionUniverse` 

456 Set of all known dimensions, used to expand and validate any used 

457 in lookup keys. 

458 

459 Notes 

460 ----- 

461 Two environment variables can be used to override the cache directory 

462 and expiration configuration: 

463 

464 * ``$DAF_BUTLER_CACHE_DIRECTORY`` 

465 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` 

466 

467 The expiration mode should take the form ``mode=threshold``; for 

468 example, to limit the cache directory to 5 datasets the value 

469 would be ``datasets=5``. 

470 

471 Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment 

472 variable can be used to indicate that this directory should be used 

473 if no explicit directory has been specified from configuration or from 

474 the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable. 
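
Examples
--------
Hypothetical overrides using the environment variables described above
(the path and threshold are illustrative only):

>>> import os
>>> os.environ["DAF_BUTLER_CACHE_DIRECTORY"] = "/tmp/my-butler-cache"
>>> os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=5"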

475 """ 

476 

477 _temp_exemption_prefix = "exempt/" 

478 _tmpdir_prefix = "butler-cache-dir-" 

479 

480 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

481 super().__init__(config, universe) 

482 

483 # Set cache directory if it pre-exists, else defer creation until 

484 # requested. Allow external override from environment. 

485 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root") 

486 

487 # Allow the execution environment to override the default values 

488 # so long as no default value has been set from the line above. 

489 if root is None: 

490 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET") 

491 

492 self._cache_directory = ( 

493 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None 

494 ) 

495 

496 if self._cache_directory: 

497 if not self._cache_directory.isLocal: 

498 raise ValueError( 

499 f"Cache directory must be on a local file system. Got: {self._cache_directory}" 

500 ) 

501 # Ensure that the cache directory is created. We assume that 

502 # someone specifying a permanent cache directory will be expecting 

503 # it to always be there. This will also trigger an error 

504 # early rather than waiting until the cache is needed. 

505 self._cache_directory.mkdir() 

506 

507 # Calculate the caching lookup table. 

508 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe) 

509 

510 # Default decision for whether a dataset should be cached. 

511 self._caching_default = self.config.get("default", False) 

512 

513 # Expiration mode. Read from config but allow override from 

514 # the environment. 

515 expiration_mode = self.config.get(("expiry", "mode")) 

516 threshold = self.config.get(("expiry", "threshold")) 

517 

518 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE") 

519 if external_mode and "=" in external_mode: 

520 expiration_mode, expiration_threshold = external_mode.split("=", 1) 

521 threshold = int(expiration_threshold) 

522 if expiration_mode is None: 

523 # Force to None to avoid confusion. 

524 threshold = None 

525 

526 self._expiration_mode: str | None = expiration_mode 

527 self._expiration_threshold: int | None = threshold 

528 if self._expiration_threshold is None and self._expiration_mode is not None: 

529 raise ValueError( 

530 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}" 

531 ) 

532 

533 log.debug( 

534 "Cache configuration:\n- root: %s\n- expiration mode: %s", 

535 self._cache_directory if self._cache_directory else "tmpdir", 

536 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled", 

537 ) 

538 

539 # Files in cache, indexed by path within the cache directory. 

540 self._cache_entries = CacheRegistry() 

541 

542 @property 

543 def cache_directory(self) -> ResourcePath: 

544 if self._cache_directory is None: 

545 # Create on demand. Allow the override environment variable 

546 # to be used in case it got set after this object was created 

547 # but before a cache was used. 

548 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

549 # Someone else will clean this up. 

550 isTemporary = False 

551 msg = "deferred fallback" 

552 else: 

553 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix) 

554 isTemporary = True 

555 msg = "temporary" 

556 

557 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary) 

558 log.debug("Using %s cache directory at %s", msg, self._cache_directory) 

559 

560 # Remove when we no longer need it. 

561 if isTemporary: 

562 atexit.register(remove_cache_directory, self._cache_directory.ospath) 

563 return self._cache_directory 

564 

565 @property 

566 def _temp_exempt_directory(self) -> ResourcePath: 

567 """Return the directory in which to store temporary cache files that 

568 should not be expired. 

569 """ 

570 return self.cache_directory.join(self._temp_exemption_prefix) 

571 

572 @property 

573 def cache_size(self) -> int: 

574 return self._cache_entries.cache_size 

575 

576 @property 

577 def file_count(self) -> int: 

578 return len(self._cache_entries) 

579 

580 @classmethod 

581 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]: 

582 """Define a fallback cache directory if a fallback not set already. 

583 

584 Returns 

585 ------- 

586 defined : `bool` 

587 `True` if the fallback directory was newly-defined in this method. 

588 `False` if it had already been set. 

589 cache_dir : `str` 

590 Returns the path to the cache directory that will be used if it's 

591 needed. This can allow the caller to run a directory cleanup 

592 when it's no longer needed (something that the cache manager 

593 can not do because forks should not clean up directories defined 

594 by the parent process). 

595 

596 Notes 

597 ----- 

598 The fallback directory will not be defined if one has already been 

599 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` 

600 environment variable only if a value has not previously been stored 

601 in that environment variable. Setting the environment variable allows 

602 this value to survive into spawned subprocesses. Calling this method 

603 will lead to all subsequently created cache managers sharing the same 

604 cache. 
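
Examples
--------
A minimal sketch of the intended call pattern; cleanup of the returned
directory is the caller's responsibility (the ``shutil`` usage is
illustrative):

>>> defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
>>> # ... spawn subprocesses that may populate the cache ...
>>> if defined:
...     import shutil
...     shutil.rmtree(cache_dir, ignore_errors=True)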

605 """ 

606 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

607 # A value has already been set. 

608 return (False, cache_dir) 

609 

610 # As a class method, we do not know at this point whether a cache 

611 # directory will be needed so it would be impolite to create a 

612 # directory that will never be used. 

613 

614 # Construct our own temp name -- 16 characters should have a fairly 

615 # low chance of clashing when combined with the process ID. 

616 characters = "abcdefghijklmnopqrstuvwxyz0123456789_" 

617 rng = Random() 

618 tempchars = "".join(rng.choice(characters) for _ in range(16)) 

619 

620 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}" 

621 

622 cache_dir = os.path.join(tempfile.gettempdir(), tempname) 

623 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir 

624 return (True, cache_dir) 

625 

626 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

627 # Docstring inherited 

628 matchName: LookupKey | str = f"{entity} (via default)" 

629 should_cache = self._caching_default 

630 

631 for key in entity._lookupNames(): 

632 if key in self._lut: 

633 should_cache = bool(self._lut[key]) 

634 matchName = key 

635 break 

636 

637 if not isinstance(should_cache, bool): 

638 raise TypeError( 

639 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool." 

640 ) 

641 

642 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not") 

643 return should_cache 

644 

645 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath: 

646 """Construct the name to use for this dataset in the cache. 

647 

648 Parameters 

649 ---------- 

650 ref : `DatasetRef` 

651 The dataset to look up in or write to the cache. 

652 extension : `str` 

653 File extension to use for this file. Should include the 

654 leading "``.``". 

655 

656 Returns 

657 ------- 

658 uri : `lsst.resources.ResourcePath` 

659 URI to use for this dataset in the cache. 

660 """ 

661 return _construct_cache_path(self.cache_directory, ref, extension) 

662 

663 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

664 # Docstring inherited 

665 if not self.should_be_cached(ref): 

666 return None 

667 

668 # Write the file using the id of the dataset ref and the file 

669 # extension. 

670 cached_location = self._construct_cache_name(ref, uri.getExtension()) 

671 

672 # Run cache expiry to ensure that we have room for this 

673 # item. 

674 self._expire_cache() 

675 

676 # The above resets the in-memory cache status. It's entirely possible 

677 # that another process has just cached this file (if multiple 

678 # processes are caching on read), so check our in-memory cache 

679 # before attempting to cache the dataset. 

680 path_in_cache = cached_location.relative_to(self.cache_directory) 

681 if path_in_cache and path_in_cache in self._cache_entries: 

682 return cached_location 

683 

684 # Move into the cache. Given that multiple processes might be 

685 # sharing a single cache directory, and the file we need might have 

686 # been copied in whilst we were checking, allow overwrite without 

687 # complaint. Even for a private cache directory it is possible that 

688 # a second butler in a subprocess could be writing to it. 

689 cached_location.transfer_from(uri, transfer="move", overwrite=True) 

690 log.debug("Cached dataset %s to %s", ref, cached_location) 

691 

692 self._register_cache_entry(cached_location) 

693 

694 return cached_location 

695 

696 @contextlib.contextmanager 

697 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

698 # Docstring inherited 

699 # Short circuit this if the cache directory has not been created yet. 

700 if self._cache_directory is None: 

701 yield None 

702 return 

703 

704 cached_location = self._construct_cache_name(ref, extension) 

705 if cached_location.exists(): 

706 log.debug("Found cached file %s for dataset %s.", cached_location, ref) 

707 

708 # The cached file could be removed by another process doing 

709 # cache expiration so we need to protect against that by making 

710 # a copy in a different tree. Use hardlinks to ensure that 

711 # we either have the cached file or we don't. This is robust 

712 # against race conditions that can be caused by using soft links 

713 # and the other end of the link being deleted just after it 

714 # is created. 

715 path_in_cache = cached_location.relative_to(self.cache_directory) 

716 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory" 

717 

718 # Need to use a unique file name for the temporary location to 

719 # ensure that two different processes can read the file 

720 # simultaneously without one of them deleting it when it's in 

721 # use elsewhere. Retain the original filename for easier debugging. 

722 random = str(uuid.uuid4())[:8] 

723 basename = cached_location.basename() 

724 filename = f"{random}-{basename}" 

725 

726 temp_location: ResourcePath | None = self._temp_exempt_directory.join(filename) 

727 try: 

728 if temp_location is not None: 

729 temp_location.transfer_from(cached_location, transfer="hardlink") 

730 except Exception as e: 

731 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e) 

732 # Any failure will be treated as if the file was not 

733 # in the cache. Yielding the original cache location 

734 # is too dangerous. 

735 temp_location = None 

736 

737 try: 

738 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref) 

739 yield temp_location 

740 finally: 

741 try: 

742 if temp_location: 

743 temp_location.remove() 

744 except FileNotFoundError: 

745 pass 

746 return 

747 

748 log.debug("Dataset %s not found in cache.", ref) 

749 yield None 

750 return 

751 

752 def remove_from_cache(self, refs: DatasetRef | Iterable[DatasetRef]) -> None: 

753 # Docstring inherited. 

754 

755 # Stop early if there are no cache entries anyhow. 

756 if len(self._cache_entries) == 0: 

757 return 

758 

759 if isinstance(refs, DatasetRef): 

760 refs = [refs] 

761 

762 # Create a set of all the IDs 

763 all_ids = {ref.id for ref in refs} 

764 

765 keys_to_remove = [] 

766 for key, entry in self._cache_entries.items(): 

767 if entry.ref in all_ids: 

768 keys_to_remove.append(key) 

769 self._remove_from_cache(keys_to_remove) 

770 

771 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str | None: 

772 """Record the file in the cache registry. 

773 

774 Parameters 

775 ---------- 

776 cached_location : `lsst.resources.ResourcePath` 

777 Location of the file to be registered. 

778 can_exist : `bool`, optional 

779 If `True` the item being registered can already be listed. 

780 This can allow a cache refresh to run without checking the 

781 file again. If `False` it is an error for the registry to 

782 already know about this file. 

783 

784 Returns 

785 ------- 

786 cache_key : `str` or `None` 

787 The key used in the registry for this file. `None` if the file 

788 no longer exists (it could have been expired by another process). 

789 """ 

790 path_in_cache = cached_location.relative_to(self.cache_directory) 

791 if path_in_cache is None: 

792 raise ValueError( 

793 f"Can not register cached file {cached_location} that is not within" 

794 f" the cache directory at {self.cache_directory}." 

795 ) 

796 if path_in_cache in self._cache_entries: 

797 if can_exist: 

798 return path_in_cache 

799 else: 

800 raise ValueError( 

801 f"Cached file {cached_location} is already known to the registry" 

802 " but this was expected to be a new file." 

803 ) 

804 try: 

805 details = CacheEntry.from_file(cached_location, root=self.cache_directory) 

806 except FileNotFoundError: 

807 return None 

808 self._cache_entries[path_in_cache] = details 

809 return path_in_cache 

810 

811 def scan_cache(self) -> None: 

812 """Scan the cache directory and record information about files.""" 

813 found = set() 

814 for file in ResourcePath.findFileResources([self.cache_directory]): 

815 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator" 

816 

817 # Skip any that are found in an exempt part of the hierarchy 

818 # since they should not be part of the registry. 

819 if file.relative_to(self._temp_exempt_directory) is not None: 

820 continue 

821 

822 path_in_cache = self._register_cache_entry(file, can_exist=True) 

823 if path_in_cache: 

824 found.add(path_in_cache) 

825 

826 # Find any files that were recorded in the cache but are no longer 

827 # on disk (something else may have cleared them out). 

828 known_to_cache = set(self._cache_entries) 

829 missing = known_to_cache - found 

830 

831 if missing: 

832 log.debug( 

833 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing) 

834 ) 

835 for path_in_cache in missing: 

836 self._cache_entries.pop(path_in_cache, None) 

837 

838 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

839 """Report if the dataset is known to the cache. 

840 

841 Parameters 

842 ---------- 

843 ref : `DatasetRef` 

844 Dataset to check for in the cache. 

845 extension : `str`, optional 

846 File extension expected. Should include the leading "``.``". 

847 If `None` the extension is ignored and the dataset ID alone is 

848 used to check in the cache. The extension must be defined if 

849 a specific component is being checked. 

850 

851 Returns 

852 ------- 

853 known : `bool` 

854 Returns `True` if the dataset is currently known to the cache 

855 and `False` otherwise. If the dataset refers to a component and 

856 an extension is given then only that component is checked. 

857 

858 Notes 

859 ----- 

860 This method can only report if the dataset is known to the cache 

861 in this specific instant and does not indicate whether the file 

862 can be read from the cache later. `find_in_cache()` should be called 

863 if the cached file is to be used. 

864 

865 This method does not force the cache to be re-scanned and so can miss 

866 cached datasets that have recently been written by other processes. 
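
Examples
--------
A hypothetical sketch (``cache_manager`` and ``ref`` are assumed to
exist; the extension is illustrative):

>>> if cache_manager.known_to_cache(ref, extension=".fits"):
...     log.debug("Dataset %s may be available in the cache.", ref)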

867 """ 

868 if self._cache_directory is None: 

869 return False 

870 if self.file_count == 0: 

871 return False 

872 

873 if extension is None: 

874 # Look solely for matching dataset ref ID and not specific 

875 # components. 

876 cached_paths = self._cache_entries.get_dataset_keys(ref.id) 

877 return bool(cached_paths) 

878 

879 else: 

880 # Extension is known so we can do an explicit look up for the 

881 # cache entry. 

882 cached_location = self._construct_cache_name(ref, extension) 

883 path_in_cache = cached_location.relative_to(self.cache_directory) 

884 assert path_in_cache is not None # For mypy 

885 return path_in_cache in self._cache_entries 

886 

887 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None: 

888 """Remove the specified cache entries from cache. 

889 

890 Parameters 

891 ---------- 

892 cache_entries : iterable of `str` 

893 The entries to remove from the cache. The values are the path 

894 within the cache. 

895 """ 

896 for entry in cache_entries: 

897 path = self.cache_directory.join(entry) 

898 

899 self._cache_entries.pop(entry, None) 

900 log.debug("Removing file from cache: %s", path) 

901 with contextlib.suppress(FileNotFoundError): 

902 path.remove() 

903 

904 def _expire_cache(self) -> None: 

905 """Expire the files in the cache. 

906 

907 Notes 

908 ----- 

909 The expiration modes are defined by the config or can be overridden. 

910 Available options: 

911 

912 * ``files``: Number of files. 

913 * ``datasets``: Number of datasets 

914 * ``size``: Total size of files. 

915 * ``age``: Age of files. 

916 

917 The first three modes remove the oldest files first. 

918 Number of files is complicated by the possibility of disassembled 

919 composites where 10 small files can be created for each dataset. 

920 

921 Additionally there is a use case for an external user to explicitly 

922 state the dataset refs that should be cached and when to remove 

923 them, overriding any global configuration. 

924 """ 

925 if self._expiration_mode is None: 

926 # Expiration has been disabled. 

927 return 

928 

929 # mypy can't be sure we have set a threshold properly 

930 if self._expiration_threshold is None: 

931 log.warning( 

932 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode 

933 ) 

934 return 

935 

936 # Sync up cache. There is no file locking involved so for a shared 

937 # cache multiple processes may be racing to delete files. Deleting 

938 # a file that no longer exists is not an error. 

939 self.scan_cache() 

940 

941 if self._expiration_mode == "files": 

942 n_files = len(self._cache_entries) 

943 n_over = n_files - self._expiration_threshold 

944 if n_over > 0: 

945 sorted_keys = self._sort_cache() 

946 keys_to_remove = sorted_keys[:n_over] 

947 self._remove_from_cache(keys_to_remove) 

948 return 

949 

950 if self._expiration_mode == "datasets": 

951 # Count the datasets, in ascending timestamp order, 

952 # so that oldest turn up first. 

953 datasets = defaultdict(list) 

954 for key in self._sort_cache(): 

955 entry = self._cache_entries[key] 

956 datasets[entry.ref].append(key) 

957 

958 n_datasets = len(datasets) 

959 n_over = n_datasets - self._expiration_threshold 

960 if n_over > 0: 

961 # Keys will be read out in insert order which 

962 # will be date order so oldest ones are removed. 

963 ref_ids = list(datasets.keys())[:n_over] 

964 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids)) 

965 self._remove_from_cache(keys_to_remove) 

966 return 

967 

968 if self._expiration_mode == "size": 

969 if self.cache_size > self._expiration_threshold: 

970 for key in self._sort_cache(): 

971 self._remove_from_cache([key]) 

972 if self.cache_size <= self._expiration_threshold: 

973 break 

974 return 

975 

976 if self._expiration_mode == "age": 

977 now = datetime.datetime.now(datetime.UTC) 

978 for key in self._sort_cache(): 

979 delta = now - self._cache_entries[key].ctime 

980 if delta.seconds > self._expiration_threshold: 

981 self._remove_from_cache([key]) 

982 else: 

983 # We're already in date order. 

984 break 

985 return 

986 

987 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}") 

988 

989 def _sort_cache(self) -> list[str]: 

990 """Sort the cache entries by time and return the sorted keys. 

991 

992 Returns 

993 ------- 

994 sorted : `list` of `str` 

995 Keys into the cache, sorted by time with oldest first. 

996 """ 

997 

998 def _sort_by_time(key: str) -> datetime.datetime: 

999 """Sorter key function using cache entry details.""" 

1000 return self._cache_entries[key].ctime 

1001 

1002 return sorted(self._cache_entries, key=_sort_by_time) 

1003 

1004 def __str__(self) -> str: 

1005 cachedir = self._cache_directory if self._cache_directory else "<tempdir>" 

1006 return ( 

1007 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," 

1008 f"default={self._caching_default}) " 

1009 f"n_files={self.file_count}, n_bytes={self.cache_size}" 

1010 ) 

1011 

1012 

1013class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): 

1014 """A variant of the datastore cache where no cache is enabled. 

1015 

1016 Parameters 

1017 ---------- 

1018 config : `str` or `DatastoreCacheManagerConfig` 

1019 Configuration to control caching. 

1020 universe : `DimensionUniverse` 

1021 Set of all known dimensions, used to expand and validate any used 

1022 in lookup keys. 

1023 """ 

1024 

1025 def __init__( 

1026 self, 

1027 config: str | DatastoreCacheManagerConfig | None = None, 

1028 universe: DimensionUniverse | None = None, 

1029 ): 

1030 return 

1031 

1032 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

1033 """Indicate whether the entity should be added to the cache. 

1034 

1035 Parameters 

1036 ---------- 

1037 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

1038 Thing to test against the configuration. The ``name`` property 

1039 is used to determine a match. A `DatasetType` will first check 

1040 its name, before checking its `StorageClass`. If there are no 

1041 matches the default will be returned. 

1042 

1043 Returns 

1044 ------- 

1045 should_cache : `bool` 

1046 Always returns `False`. 

1047 """ 

1048 return False 

1049 

1050 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

1051 """Move dataset to cache. 

1052 

1053 Parameters 

1054 ---------- 

1055 uri : `lsst.resources.ResourcePath` 

1056 Location of the file to be relocated to the cache. Will be moved. 

1057 ref : `DatasetRef` 

1058 Ref associated with this file. Will be used to determine the name 

1059 of the file within the cache. 

1060 

1061 Returns 

1062 ------- 

1063 new : `lsst.resources.ResourcePath` or `None` 

1064 Always refuses and returns `None`. 

1065 """ 

1066 return None 

1067 

1068 @contextlib.contextmanager 

1069 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

1070 """Look for a dataset in the cache and return its location. 

1071 

1072 Parameters 

1073 ---------- 

1074 ref : `DatasetRef` 

1075 Dataset to locate in the cache. 

1076 extension : `str` 

1077 File extension expected. Should include the leading "``.``". 

1078 

1079 Yields 

1080 ------ 

1081 uri : `lsst.resources.ResourcePath` or `None` 

1082 Never finds a file. Always returns `None`. 

1083 """ 

1084 yield None 

1085 

1086 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None: 

1087 """Remove datasets from cache. 

1088 

1089 Parameters 

1090 ---------- 

1091 ref : `DatasetRef` or iterable of `DatasetRef` 

1092 The datasets to remove from the cache. Always does nothing. 

1093 """ 

1094 return 

1095 

1096 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

1097 """Report if a dataset is known to the cache. 

1098 

1099 Parameters 

1100 ---------- 

1101 ref : `DatasetRef` 

1102 Dataset to check for in the cache. 

1103 extension : `str`, optional 

1104 File extension expected. Should include the leading "``.``". 

1105 If `None` the extension is ignored and the dataset ID alone is 

1106 used to check in the cache. The extension must be defined if 

1107 a specific component is being checked. 

1108 

1109 Returns 

1110 ------- 

1111 known : `bool` 

1112 Always returns `False`. 

1113 """ 

1114 return False 

1115 

1116 def __str__(self) -> str: 

1117 return f"{type(self).__name__}()"