Coverage for python/lsst/daf/butler/datastore/cache_manager.py: 29%

394 statements  

coverage.py v7.3.2, created at 2023-12-05 11:07 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Cache management for a datastore.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "AbstractDatastoreCacheManager", 

34 "DatastoreDisabledCacheManager", 

35 "DatastoreCacheManager", 

36 "DatastoreCacheManagerConfig", 

37) 

38 

39import atexit 

40import contextlib 

41import datetime 

42import itertools 

43import logging 

44import os 

45import shutil 

46import tempfile 

47import uuid 

48from abc import ABC, abstractmethod 

49from collections import defaultdict 

50from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView 

51from random import Random 

52from typing import TYPE_CHECKING 

53 

54from lsst.daf.butler._compat import _BaseModelCompat 

55from lsst.resources import ResourcePath 

56from pydantic import PrivateAttr 

57 

58from .._config import ConfigSubset 

59from .._config_support import processLookupConfigs 

60from .._dataset_ref import DatasetId, DatasetRef 

61 

62if TYPE_CHECKING: 

63 from .._config_support import LookupKey 

64 from .._dataset_type import DatasetType 

65 from .._storage_class import StorageClass 

66 from ..dimensions import DimensionUniverse 

67 

68log = logging.getLogger(__name__) 

69 

70 

71def remove_cache_directory(directory: str) -> None: 

72 """Remove the specified directory and all its contents.""" 

73 log.debug("Removing temporary cache directory %s", directory) 

74 shutil.rmtree(directory, ignore_errors=True) 

75 

76 

77def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath: 

78 """Construct the full path to use for this dataset in the cache. 

79 

80 Parameters 

81 ---------- 

82 root : `lsst.resources.ResourcePath`
Root directory of the cache.
ref : `DatasetRef`

83 The dataset to look up in or write to the cache. 

84 extension : `str` 

85 File extension to use for this file. Should include the 

86 leading "``.``". 

87 

88 Returns 

89 ------- 

90 uri : `lsst.resources.ResourcePath` 

91 URI to use for this dataset in the cache. 

92 """ 

93 # Dataset type component is needed in the name if composite 

94 # disassembly is happening since the ID is shared for all components. 

95 component = ref.datasetType.component() 

96 component = f"_{component}" if component else "" 

97 return root.join(f"{ref.id}{component}{extension}") 

98 

99 

100def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]: 

101 """For a given cache name, return its component parts. 

102 

103 Changes to ``_construct_cache_path()`` should be reflected here. 

104 

105 Parameters 

106 ---------- 

107 cached_location : `str` 

108 The name of the file within the cache. 

109 

110 Returns 

111 ------- 

112 id : `uuid.UUID` 

113 The dataset ID. 

114 component : `str` or `None` 

115 The name of the component, if present. 

116 extension : `str` or `None`

117 The file extension, if present. 

118 """ 

119 # Assume first dot is the extension and so allow .fits.gz 

120 root_ext = cached_location.split(".", maxsplit=1) 

121 root = root_ext.pop(0) 

122 ext = "." + root_ext.pop(0) if root_ext else None 

123 

124 parts = root.split("_") 

125 id_ = uuid.UUID(parts.pop(0)) 

126 component = parts.pop(0) if parts else None 

127 return id_, component, ext 

128 
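Taken together, the two helpers above define the cache naming scheme ``<dataset id>[_<component>]<extension>``. A minimal sketch of the round trip, assuming the module is importable under the path shown in the report header; the UUID and component are invented for illustration:

import uuid

from lsst.daf.butler.datastore.cache_manager import _parse_cache_name

# A hypothetical disassembled-composite entry with a two-part extension.
dataset_id = uuid.uuid4()
cached_name = f"{dataset_id}_wcs.fits.gz"

parsed_id, component, extension = _parse_cache_name(cached_name)
assert parsed_id == dataset_id
assert component == "wcs"
assert extension == ".fits.gz"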

129 

130class CacheEntry(_BaseModelCompat): 

131 """Represent an entry in the cache.""" 

132 

133 name: str 

134 """Name of the file.""" 

135 

136 size: int 

137 """Size of the file in bytes.""" 

138 

139 ctime: datetime.datetime 

140 """Creation time of the file.""" 

141 

142 ref: DatasetId 

143 """ID of this dataset.""" 

144 

145 component: str | None = None 

146 """Component for this disassembled composite (optional).""" 

147 

148 @classmethod 

149 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry: 

150 """Construct an object from a file name. 

151 

152 Parameters 

153 ---------- 

154 file : `lsst.resources.ResourcePath` 

155 Path to the file. 

156 root : `lsst.resources.ResourcePath` 

157 Cache root directory. 

158 """ 

159 file_in_cache = file.relative_to(root) 

160 if file_in_cache is None: 

161 raise ValueError(f"Supplied file {file} is not inside root {root}") 

162 id_, component, _ = _parse_cache_name(file_in_cache) 

163 

164 stat = os.stat(file.ospath) 

165 return cls( 

166 name=file_in_cache, 

167 size=stat.st_size, 

168 ref=id_, 

169 component=component, 

170 ctime=datetime.datetime.fromtimestamp(stat.st_ctime, datetime.UTC), 

171 ) 

172 

173 

174class _MarkerEntry(CacheEntry): 

175 pass 

176 

177 

178class CacheRegistry(_BaseModelCompat): 

179 """Collection of cache entries.""" 

180 

181 _size: int = PrivateAttr(0) 

182 """Size of the cache.""" 

183 

184 _entries: dict[str, CacheEntry] = PrivateAttr({}) 

185 """Internal collection of cache entries.""" 

186 

187 _ref_map: dict[DatasetId, list[str]] = PrivateAttr({}) 

188 """Mapping of DatasetID to corresponding keys in cache registry.""" 

189 

190 @property 

191 def cache_size(self) -> int: 

192 return self._size 

193 

194 def __getitem__(self, key: str) -> CacheEntry: 

195 return self._entries[key] 

196 

197 def __setitem__(self, key: str, entry: CacheEntry) -> None: 

198 self._size += entry.size 

199 self._entries[key] = entry 

200 

201 # Update the mapping from ref to path. 

202 if entry.ref not in self._ref_map: 

203 self._ref_map[entry.ref] = [] 

204 self._ref_map[entry.ref].append(key) 

205 

206 def __delitem__(self, key: str) -> None: 

207 entry = self._entries.pop(key) 

208 self._decrement(entry) 

209 self._ref_map[entry.ref].remove(key) 

210 

211 def _decrement(self, entry: CacheEntry | None) -> None: 

212 if entry: 

213 self._size -= entry.size 

214 if self._size < 0: 

215 log.warning("Cache size has gone negative. Inconsistent cache records...") 

216 self._size = 0 

217 

218 def __contains__(self, key: str) -> bool: 

219 return key in self._entries 

220 

221 def __len__(self) -> int: 

222 return len(self._entries) 

223 

224 def __iter__(self) -> Iterator[str]: # type: ignore 

225 return iter(self._entries) 

226 

227 def keys(self) -> KeysView[str]: 

228 return self._entries.keys() 

229 

230 def values(self) -> ValuesView[CacheEntry]: 

231 return self._entries.values() 

232 

233 def items(self) -> ItemsView[str, CacheEntry]: 

234 return self._entries.items() 

235 

236 # A private marker to indicate that pop() should raise if no default

237 # is given. 

238 __marker = _MarkerEntry( 

239 name="marker", 

240 size=0, 

241 ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), 

242 ctime=datetime.datetime.fromtimestamp(0, datetime.UTC), 

243 ) 

244 

245 def pop(self, key: str, default: CacheEntry | None = __marker) -> CacheEntry | None: 

246 # The marker for dict.pop is not the same as our marker. 

247 if default is self.__marker: 

248 entry = self._entries.pop(key) 

249 else: 

250 entry = self._entries.pop(key, self.__marker) 

251 # Should not attempt to correct for this entry being removed 

252 # if we got the default value. 

253 if entry is self.__marker: 

254 return default 

255 

256 self._decrement(entry) 

257 # The default entry given to this method may not even be in the cache. 

258 if entry and entry.ref in self._ref_map: 

259 keys = self._ref_map[entry.ref] 

260 if key in keys: 

261 keys.remove(key) 

262 return entry 

263 

264 def get_dataset_keys(self, dataset_id: DatasetId | None) -> list[str] | None: 

265 """Retrieve all keys associated with the given dataset ID. 

266 

267 Parameters 

268 ---------- 

269 dataset_id : `DatasetId` or `None` 

270 The dataset ID to look up. Returns `None` if the ID is `None`. 

271 

272 Returns 

273 ------- 

274 keys : `list` [`str`] 

274 Keys associated with this dataset. These keys can be used to look up

276 the cache entry information in the `CacheRegistry`. Returns 

277 `None` if the dataset is not known to the cache. 

278 """ 

279 if dataset_id not in self._ref_map: 

280 return None 

281 keys = self._ref_map[dataset_id] 

282 if not keys: 

283 return None 

284 return keys 

285 
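A short sketch of the bookkeeping ``CacheRegistry`` performs; the entry values are invented and the entry is constructed directly rather than via ``CacheEntry.from_file``:

import datetime
import uuid

registry = CacheRegistry()
entry = CacheEntry(
    name="example.fits",            # hypothetical path within the cache
    size=1024,
    ref=uuid.uuid4(),
    ctime=datetime.datetime.now(datetime.UTC),
)
registry["example.fits"] = entry

assert registry.cache_size == 1024
assert registry.get_dataset_keys(entry.ref) == ["example.fits"]

registry.pop("example.fits")        # removing an entry updates the running size
assert registry.cache_size == 0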

286 

287class DatastoreCacheManagerConfig(ConfigSubset): 

288 """Configuration information for `DatastoreCacheManager`.""" 

289 

290 component = "cached" 

291 requiredKeys = ("cacheable",) 

292 

293 

294class AbstractDatastoreCacheManager(ABC): 

295 """An abstract base class for managing caching in a Datastore. 

296 

297 Parameters 

298 ---------- 

299 config : `str` or `DatastoreCacheManagerConfig` 

300 Configuration to control caching. 

301 universe : `DimensionUniverse` 

302 Set of all known dimensions, used to expand and validate any used 

303 in lookup keys. 

304 """ 

305 

306 @property 

307 def cache_size(self) -> int: 

308 """Size of the cache in bytes.""" 

309 return 0 

310 

311 @property 

312 def file_count(self) -> int: 

313 """Return number of cached files tracked by registry.""" 

314 return 0 

315 

316 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

317 if not isinstance(config, DatastoreCacheManagerConfig): 

318 config = DatastoreCacheManagerConfig(config) 

319 assert isinstance(config, DatastoreCacheManagerConfig) 

320 self.config = config 

321 

322 @abstractmethod 

323 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

324 """Indicate whether the entity should be added to the cache. 

325 

326 This is relevant when reading or writing. 

327 

328 Parameters 

329 ---------- 

330 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

331 Thing to test against the configuration. The ``name`` property 

332 is used to determine a match. A `DatasetType` will first check 

333 its name, before checking its `StorageClass`. If there are no 

334 matches the default will be returned. 

335 

336 Returns 

337 ------- 

338 should_cache : `bool` 

339 Returns `True` if the dataset should be cached; `False` otherwise. 

340 """ 

341 raise NotImplementedError() 

342 

343 @abstractmethod 

344 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

345 """Report if the dataset is known to the cache. 

346 

347 Parameters 

348 ---------- 

349 ref : `DatasetRef` 

350 Dataset to check for in the cache. 

351 extension : `str`, optional 

352 File extension expected. Should include the leading "``.``". 

353 If `None` the extension is ignored and the dataset ID alone is 

354 used to check in the cache. The extension must be defined if 

355 a specific component is being checked. 

356 

357 Returns 

358 ------- 

359 known : `bool` 

360 Returns `True` if the dataset is currently known to the cache 

361 and `False` otherwise. 

362 

363 Notes 

364 ----- 

365 This method can only report if the dataset is known to the cache 

366 in this specific instant and does not indicate whether the file 

367 can be read from the cache later. `find_in_cache()` should be called 

368 if the cached file is to be used. 

369 """ 

370 raise NotImplementedError() 

371 

372 @abstractmethod 

373 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

374 """Move a file to the cache. 

375 

376 Move the given file into the cache, using the supplied DatasetRef 

377 for naming. A call is made to `should_be_cached()` and if the 

378 DatasetRef should not be accepted `None` will be returned. 

379 

380 Cache expiry can occur during this call.

381 

382 Parameters 

383 ---------- 

384 uri : `lsst.resources.ResourcePath` 

385 Location of the file to be relocated to the cache. Will be moved. 

386 ref : `DatasetRef` 

387 Ref associated with this file. Will be used to determine the name 

388 of the file within the cache. 

389 

390 Returns 

391 ------- 

392 new : `lsst.resources.ResourcePath` or `None` 

393 URI to the file within the cache, or `None` if the dataset 

394 was not accepted by the cache. 

395 """ 

396 raise NotImplementedError() 

397 

398 @abstractmethod 

399 @contextlib.contextmanager 

400 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

401 """Look for a dataset in the cache and return its location. 

402 

403 Parameters 

404 ---------- 

405 ref : `DatasetRef` 

406 Dataset to locate in the cache. 

407 extension : `str` 

408 File extension expected. Should include the leading "``.``". 

409 

410 Yields 

411 ------ 

412 uri : `lsst.resources.ResourcePath` or `None` 

413 The URI to the cached file, or `None` if the file has not been 

414 cached. 

415 

416 Notes 

417 ----- 

418 Should be used as a context manager in order to prevent this 

419 file from being removed from the cache for that context. 

420 """ 

421 raise NotImplementedError() 

422 

423 @abstractmethod 

424 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None: 

425 """Remove the specified datasets from the cache. 

426 

427 It is not an error for these datasets to be missing from the cache. 

428 

429 Parameters 

430 ---------- 

431 ref : `DatasetRef` or iterable of `DatasetRef` 

432 The datasets to remove from the cache. 

433 """ 

434 raise NotImplementedError() 

435 

436 @abstractmethod 

437 def __str__(self) -> str: 

438 raise NotImplementedError() 

439 

440 

441class DatastoreCacheManager(AbstractDatastoreCacheManager): 

442 """A class for managing caching in a Datastore using local files. 

443 

444 Parameters 

445 ---------- 

446 config : `str` or `DatastoreCacheManagerConfig` 

447 Configuration to control caching. 

448 universe : `DimensionUniverse` 

449 Set of all known dimensions, used to expand and validate any used 

450 in lookup keys. 

451 

452 Notes 

453 ----- 

454 Two environment variables can be used to override the cache directory 

455 and expiration configuration: 

456 

457 * ``$DAF_BUTLER_CACHE_DIRECTORY`` 

458 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` 

459 

460 The expiration mode should take the form ``mode=threshold``; for

461 example, to limit the cache directory to 5 datasets the value would

462 be ``datasets=5``.

463 

464 Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment 

465 variable can be used to indicate that this directory should be used 

466 if no explicit directory has been specified from configuration or from 

467 the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable. 

468 """ 
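For example, the overrides described above could be set in the environment before a cache manager is constructed; the directory and threshold below are illustrative only:

import os

# Place the cache in an explicit local directory rather than a temporary one.
os.environ["DAF_BUTLER_CACHE_DIRECTORY"] = "/tmp/my-butler-cache"
# Expire the oldest datasets once more than 5 are cached.
os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=5"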

469 

470 _temp_exemption_prefix = "exempt/" 

471 _tmpdir_prefix = "butler-cache-dir-" 

472 

473 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

474 super().__init__(config, universe) 

475 

476 # Set the cache directory now if one is specified, else defer creation until

477 # requested. Allow external override from environment. 

478 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root") 

479 

480 # Allow the execution environment to override the default values 

481 # so long as no default value has been set from the line above. 

482 if root is None: 

483 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET") 

484 

485 self._cache_directory = ( 

486 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None 

487 ) 

488 

489 if self._cache_directory: 

490 if not self._cache_directory.isLocal: 

491 raise ValueError( 

492 f"Cache directory must be on a local file system. Got: {self._cache_directory}" 

493 ) 

494 # Ensure that the cache directory is created. We assume that 

495 # someone specifying a permanent cache directory will be expecting 

496 # it to always be there. This will also trigger an error 

497 # early rather than waiting until the cache is needed. 

498 self._cache_directory.mkdir() 

499 

500 # Calculate the caching lookup table. 

501 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe) 

502 

503 # Default decision for whether a dataset should be cached.

504 self._caching_default = self.config.get("default", False) 

505 

506 # Expiration mode. Read from config but allow override from 

507 # the environment. 

508 expiration_mode = self.config.get(("expiry", "mode")) 

509 threshold = self.config.get(("expiry", "threshold")) 

510 

511 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE") 

512 if external_mode and "=" in external_mode: 

513 expiration_mode, expiration_threshold = external_mode.split("=", 1) 

514 threshold = int(expiration_threshold) 

515 if expiration_mode is None: 

516 # Force to None to avoid confusion. 

517 threshold = None 

518 

519 self._expiration_mode: str | None = expiration_mode 

520 self._expiration_threshold: int | None = threshold 

521 if self._expiration_threshold is None and self._expiration_mode is not None: 

522 raise ValueError( 

523 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}" 

524 ) 

525 

526 log.debug( 

527 "Cache configuration:\n- root: %s\n- expiration mode: %s", 

528 self._cache_directory if self._cache_directory else "tmpdir", 

529 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled", 

530 ) 

531 

532 # Files in cache, indexed by path within the cache directory. 

533 self._cache_entries = CacheRegistry() 

534 

535 @property 

536 def cache_directory(self) -> ResourcePath: 

537 if self._cache_directory is None: 

538 # Create on demand. Allow the override environment variable 

539 # to be used in case it got set after this object was created 

540 # but before a cache was used. 

541 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

542 # Someone else will clean this up. 

543 isTemporary = False 

544 msg = "deferred fallback" 

545 else: 

546 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix) 

547 isTemporary = True 

548 msg = "temporary" 

549 

550 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary) 

551 log.debug("Using %s cache directory at %s", msg, self._cache_directory) 

552 

553 # Remove when we no longer need it. 

554 if isTemporary: 

555 atexit.register(remove_cache_directory, self._cache_directory.ospath) 

556 return self._cache_directory 

557 

558 @property 

559 def _temp_exempt_directory(self) -> ResourcePath: 

560 """Return the directory in which to store temporary cache files that 

561 should not be expired. 

562 """ 

563 return self.cache_directory.join(self._temp_exemption_prefix) 

564 

565 @property 

566 def cache_size(self) -> int: 

567 return self._cache_entries.cache_size 

568 

569 @property 

570 def file_count(self) -> int: 

571 return len(self._cache_entries) 

572 

573 @classmethod 

574 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]: 

575 """Define a fallback cache directory if a fallback is not already set.

576 

577 Returns 

578 ------- 

579 defined : `bool` 

580 `True` if the fallback directory was newly-defined in this method. 

581 `False` if it had already been set. 

582 cache_dir : `str` 

583 Returns the path to the cache directory that will be used if it's 

584 needed. This can allow the caller to run a directory cleanup 

585 when it's no longer needed (something that the cache manager 

586 can not do because forks should not clean up directories defined 

587 by the parent process). 

588 

589 Notes 

590 ----- 

591 The fallback directory will not be defined if one has already been 

592 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` 

593 environment variable only if a value has not previously been stored 

594 in that environment variable. Setting the environment variable allows 

595 this value to survive into spawned subprocesses. Calling this method 

596 will lead to all subsequently created cache managers sharing the same 

597 cache. 

598 """ 

599 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

600 # A value has already been set. 

601 return (False, cache_dir) 

602 

603 # As a class method, we do not know at this point whether a cache 

604 # directory will be needed so it would be impolite to create a 

605 # directory that will never be used. 

606 

607 # Construct our own temp name -- 16 characters should have a fairly 

608 # low chance of clashing when combined with the process ID. 

609 characters = "abcdefghijklmnopqrstuvwxyz0123456789_" 

610 rng = Random() 

611 tempchars = "".join(rng.choice(characters) for _ in range(16)) 

612 

613 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}" 

614 

615 cache_dir = os.path.join(tempfile.gettempdir(), tempname) 

616 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir 

617 return (True, cache_dir) 

618 
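A sketch of the calling pattern the docstring above describes for a parent process; the subprocess orchestration is hypothetical, and cleanup reuses this module's ``remove_cache_directory``:

defined, fallback_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
try:
    ...  # spawn subprocesses; any cache manager they create without an
    # explicit directory will share this fallback location.
finally:
    if defined:
        # Only the process that defined the fallback cleans it up; the cache
        # manager itself will not remove a directory defined by its parent.
        remove_cache_directory(fallback_dir)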

619 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

620 # Docstring inherited 

621 matchName: LookupKey | str = f"{entity} (via default)" 

622 should_cache = self._caching_default 

623 

624 for key in entity._lookupNames(): 

625 if key in self._lut: 

626 should_cache = bool(self._lut[key]) 

627 matchName = key 

628 break 

629 

630 if not isinstance(should_cache, bool): 

631 raise TypeError( 

632 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool." 

633 ) 

634 

635 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not") 

636 return should_cache 

637 

638 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath: 

639 """Construct the name to use for this dataset in the cache. 

640 

641 Parameters 

642 ---------- 

643 ref : `DatasetRef` 

644 The dataset to look up in or write to the cache. 

645 extension : `str` 

646 File extension to use for this file. Should include the 

647 leading "``.``". 

648 

649 Returns 

650 ------- 

651 uri : `lsst.resources.ResourcePath` 

652 URI to use for this dataset in the cache. 

653 """ 

654 return _construct_cache_path(self.cache_directory, ref, extension) 

655 

656 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

657 # Docstring inherited 

658 if not self.should_be_cached(ref): 

659 return None 

660 

661 # Write the file using the id of the dataset ref and the file 

662 # extension. 

663 cached_location = self._construct_cache_name(ref, uri.getExtension()) 

664 

665 # Run cache expiry to ensure that we have room for this 

666 # item. 

667 self._expire_cache() 

668 

669 # The above reset the in-memory cache status. It's entirely possible 

670 # that another process has just cached this file (if multiple 

671 # processes are caching on read), so check our in-memory cache 

672 # before attempting to cache the dataset. 

673 path_in_cache = cached_location.relative_to(self.cache_directory) 

674 if path_in_cache and path_in_cache in self._cache_entries: 

675 return cached_location 

676 

677 # Move into the cache. Given that multiple processes might be 

678 # sharing a single cache directory, and the file we need might have 

679 # been copied in whilst we were checking, allow overwrite without 

680 # complaint. Even for a private cache directory it is possible that 

681 # a second butler in a subprocess could be writing to it. 

682 cached_location.transfer_from(uri, transfer="move", overwrite=True) 

683 log.debug("Cached dataset %s to %s", ref, cached_location) 

684 

685 self._register_cache_entry(cached_location) 

686 

687 return cached_location 

688 

689 @contextlib.contextmanager 

690 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

691 # Docstring inherited 

692 # Short circuit this if the cache directory has not been created yet. 

693 if self._cache_directory is None: 

694 yield None 

695 return 

696 

697 cached_location = self._construct_cache_name(ref, extension) 

698 if cached_location.exists(): 

699 log.debug("Found cached file %s for dataset %s.", cached_location, ref) 

700 

701 # The cached file could be removed by another process doing 

702 # cache expiration so we need to protect against that by making 

703 # a copy in a different tree. Use hardlinks to ensure that 

704 # we either have the cached file or we don't. This is robust 

705 # against race conditions that can be caused by using soft links 

706 # and the other end of the link being deleted just after it 

707 # is created. 

708 path_in_cache = cached_location.relative_to(self.cache_directory) 

709 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory" 

710 

711 # Need to use a unique file name for the temporary location to 

712 # ensure that two different processes can read the file 

713 # simultaneously without one of them deleting it when it's in 

714 # use elsewhere. Retain the original filename for easier debugging. 

715 random = str(uuid.uuid4())[:8] 

716 basename = cached_location.basename() 

717 filename = f"{random}-{basename}" 

718 

719 temp_location: ResourcePath | None = self._temp_exempt_directory.join(filename) 

720 try: 

721 if temp_location is not None: 

722 temp_location.transfer_from(cached_location, transfer="hardlink") 

723 except Exception as e: 

724 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e) 

725 # Any failure will be treated as if the file was not 

726 # in the cache. Yielding the original cache location 

727 # is too dangerous. 

728 temp_location = None 

729 

730 try: 

731 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref) 

732 yield temp_location 

733 finally: 

734 try: 

735 if temp_location: 

736 temp_location.remove() 

737 except FileNotFoundError: 

738 pass 

739 return 

740 

741 log.debug("Dataset %s not found in cache.", ref) 

742 yield None 

743 return 

744 
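Put together, a datastore would typically cache a file when it is written and later read it back under the protection of the context manager. A sketch, where ``cache_manager``, ``ref`` and ``local_uri`` are assumed to already exist in the caller:

# Returns the in-cache location, or None if the configuration refused the dataset.
cached = cache_manager.move_to_cache(local_uri, ref)

# Later, possibly in another process sharing the same cache directory.
with cache_manager.find_in_cache(ref, extension=".fits") as cached_file:
    if cached_file is not None:
        data = cached_file.read()   # a hardlinked copy, safe from concurrent expiry
    else:
        ...  # fall back to reading from the datastore itself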

745 def remove_from_cache(self, refs: DatasetRef | Iterable[DatasetRef]) -> None: 

746 # Docstring inherited. 

747 

748 # Stop early if there are no cache entries anyhow. 

749 if len(self._cache_entries) == 0: 

750 return 

751 

752 if isinstance(refs, DatasetRef): 

753 refs = [refs] 

754 

755 # Create a set of all the IDs 

756 all_ids = {ref.id for ref in refs} 

757 

758 keys_to_remove = [] 

759 for key, entry in self._cache_entries.items(): 

760 if entry.ref in all_ids: 

761 keys_to_remove.append(key) 

762 self._remove_from_cache(keys_to_remove) 

763 

764 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str | None: 

765 """Record the file in the cache registry. 

766 

767 Parameters 

768 ---------- 

769 cached_location : `lsst.resources.ResourcePath` 

770 Location of the file to be registered. 

771 can_exist : `bool`, optional 

772 If `True`, the item being registered may already be present in the registry.

773 This can allow a cache refresh to run without checking the 

774 file again. If `False` it is an error for the registry to 

775 already know about this file. 

776 

777 Returns 

778 ------- 

779 cache_key : `str` or `None` 

780 The key used in the registry for this file. `None` if the file 

781 no longer exists (it could have been expired by another process). 

782 """ 

783 path_in_cache = cached_location.relative_to(self.cache_directory) 

784 if path_in_cache is None: 

785 raise ValueError( 

786 f"Can not register cached file {cached_location} that is not within" 

787 f" the cache directory at {self.cache_directory}." 

788 ) 

789 if path_in_cache in self._cache_entries: 

790 if can_exist: 

791 return path_in_cache 

792 else: 

793 raise ValueError( 

794 f"Cached file {cached_location} is already known to the registry" 

795 " but this was expected to be a new file." 

796 ) 

797 try: 

798 details = CacheEntry.from_file(cached_location, root=self.cache_directory) 

799 except FileNotFoundError: 

800 return None 

801 self._cache_entries[path_in_cache] = details 

802 return path_in_cache 

803 

804 def scan_cache(self) -> None: 

805 """Scan the cache directory and record information about files.""" 

806 found = set() 

807 for file in ResourcePath.findFileResources([self.cache_directory]): 

808 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator" 

809 

810 # Skip any that are found in an exempt part of the hierarchy 

811 # since they should not be part of the registry. 

812 if file.relative_to(self._temp_exempt_directory) is not None: 

813 continue 

814 

815 path_in_cache = self._register_cache_entry(file, can_exist=True) 

816 if path_in_cache: 

817 found.add(path_in_cache) 

818 

819 # Find any files that were recorded in the cache but are no longer 

820 # on disk. (something else cleared them out?) 

821 known_to_cache = set(self._cache_entries) 

822 missing = known_to_cache - found 

823 

824 if missing: 

825 log.debug( 

826 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing) 

827 ) 

828 for path_in_cache in missing: 

829 self._cache_entries.pop(path_in_cache, None) 

830 

831 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

832 """Report if the dataset is known to the cache. 

833 

834 Parameters 

835 ---------- 

836 ref : `DatasetRef` 

837 Dataset to check for in the cache. 

838 extension : `str`, optional 

839 File extension expected. Should include the leading "``.``". 

840 If `None` the extension is ignored and the dataset ID alone is 

841 used to check in the cache. The extension must be defined if 

842 a specific component is being checked. 

843 

844 Returns 

845 ------- 

846 known : `bool` 

847 Returns `True` if the dataset is currently known to the cache 

848 and `False` otherwise. If the dataset refers to a component and 

849 an extension is given then only that component is checked. 

850 

851 Notes 

852 ----- 

853 This method can only report if the dataset is known to the cache 

854 in this specific instant and does not indicate whether the file 

855 can be read from the cache later. `find_in_cache()` should be called 

856 if the cached file is to be used. 

857 

858 This method does not force the cache to be re-scanned and so can miss 

859 cached datasets that have recently been written by other processes. 

860 """ 

861 if self._cache_directory is None: 

862 return False 

863 if self.file_count == 0: 

864 return False 

865 

866 if extension is None: 

867 # Look solely for matching dataset ref ID and not specific 

868 # components. 

869 cached_paths = self._cache_entries.get_dataset_keys(ref.id) 

870 return bool(cached_paths) 

871 

872 else: 

873 # Extension is known so we can do an explicit look up for the 

874 # cache entry. 

875 cached_location = self._construct_cache_name(ref, extension) 

876 path_in_cache = cached_location.relative_to(self.cache_directory) 

877 assert path_in_cache is not None # For mypy 

878 return path_in_cache in self._cache_entries 

879 

880 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None: 

881 """Remove the specified cache entries from cache. 

882 

883 Parameters 

884 ---------- 

885 cache_entries : iterable of `str` 

886 The entries to remove from the cache. The values are the path 

887 within the cache. 

888 """ 

889 for entry in cache_entries: 

890 path = self.cache_directory.join(entry) 

891 

892 self._cache_entries.pop(entry, None) 

893 log.debug("Removing file from cache: %s", path) 

894 with contextlib.suppress(FileNotFoundError): 

895 path.remove() 

896 

897 def _expire_cache(self) -> None: 

898 """Expire the files in the cache. 

899 

900 Notes 

901 ----- 

902 The expiration modes are defined by the config or can be overridden. 

903 Available options: 

904 

905 * ``files``: Number of files. 

906 * ``datasets``: Number of datasets.

907 * ``size``: Total size of files. 

908 * ``age``: Age of files. 

909 

910 The first three modes remove the oldest entries first.

911 Number of files is complicated by the possibility of disassembled 

912 composites where 10 small files can be created for each dataset. 

913 

914 Additionally there is a use case for an external user to explicitly 

915 state the dataset refs that should be cached and then when to 

916 remove them, overriding any global configuration.

917 """ 

918 if self._expiration_mode is None: 

919 # Expiration has been disabled. 

920 return 

921 

922 # mypy can't be sure we have set a threshold properly 

923 if self._expiration_threshold is None: 

924 log.warning( 

925 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode 

926 ) 

927 return 

928 

929 # Sync up cache. There is no file locking involved so for a shared 

930 # cache multiple processes may be racing to delete files. Deleting 

931 # a file that no longer exists is not an error. 

932 self.scan_cache() 

933 

934 if self._expiration_mode == "files": 

935 n_files = len(self._cache_entries) 

936 n_over = n_files - self._expiration_threshold 

937 if n_over > 0: 

938 sorted_keys = self._sort_cache() 

939 keys_to_remove = sorted_keys[:n_over] 

940 self._remove_from_cache(keys_to_remove) 

941 return 

942 

943 if self._expiration_mode == "datasets": 

944 # Count the datasets, in ascending timestamp order, 

945 # so that oldest turn up first. 

946 datasets = defaultdict(list) 

947 for key in self._sort_cache(): 

948 entry = self._cache_entries[key] 

949 datasets[entry.ref].append(key) 

950 

951 n_datasets = len(datasets) 

952 n_over = n_datasets - self._expiration_threshold 

953 if n_over > 0: 

954 # Keys will be read out in insert order which 

955 # will be date order so oldest ones are removed. 

956 ref_ids = list(datasets.keys())[:n_over] 

957 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids)) 

958 self._remove_from_cache(keys_to_remove) 

959 return 

960 

961 if self._expiration_mode == "size": 

962 if self.cache_size > self._expiration_threshold: 

963 for key in self._sort_cache(): 

964 self._remove_from_cache([key]) 

965 if self.cache_size <= self._expiration_threshold: 

966 break 

967 return 

968 

969 if self._expiration_mode == "age": 

970 now = datetime.datetime.now(datetime.UTC) 

971 for key in self._sort_cache(): 

972 delta = now - self._cache_entries[key].ctime 

973 if delta.total_seconds() > self._expiration_threshold:

974 self._remove_from_cache([key]) 

975 else: 

976 # We're already in date order. 

977 break 

978 return 

979 

980 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}") 

981 
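For reference, the ``mode=threshold`` forms this method understands can come from configuration or from the environment variable read in ``__init__``; with the code above, ``size`` is compared against the total cache size in bytes and ``age`` against file age in seconds. The particular numbers below are illustrative:

import os

os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "files=100"          # at most 100 cached files
# os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=20"      # at most 20 datasets
# os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "size=1000000000"  # roughly 1 GB of files
# os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "age=3600"         # files older than one hour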

982 def _sort_cache(self) -> list[str]: 

983 """Sort the cache entries by time and return the sorted keys. 

984 

985 Returns 

986 ------- 

987 sorted : `list` of `str` 

988 Keys into the cache, sorted by time with oldest first. 

989 """ 

990 

991 def sort_by_time(key: str) -> datetime.datetime: 

992 """Sorter key function using cache entry details.""" 

993 return self._cache_entries[key].ctime 

994 

995 return sorted(self._cache_entries, key=sort_by_time) 

996 

997 def __str__(self) -> str: 

998 cachedir = self._cache_directory if self._cache_directory else "<tempdir>" 

999 return ( 

1000 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," 

1001 f"default={self._caching_default}) " 

1002 f"n_files={self.file_count}, n_bytes={self.cache_size}" 

1003 ) 

1004 

1005 

1006class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): 

1007 """A variant of the datastore cache where no cache is enabled. 

1008 

1009 Parameters 

1010 ---------- 

1011 config : `str` or `DatastoreCacheManagerConfig` 

1012 Configuration to control caching. 

1013 universe : `DimensionUniverse` 

1014 Set of all known dimensions, used to expand and validate any used 

1015 in lookup keys. 

1016 """ 

1017 

1018 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): 

1019 return 

1020 

1021 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: 

1022 """Indicate whether the entity should be added to the cache. 

1023 

1024 Always returns `False`. 

1025 """ 

1026 return False 

1027 

1028 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None: 

1029 """Move dataset to cache but always refuse and return `None`."""

1030 return None 

1031 

1032 @contextlib.contextmanager 

1033 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]: 

1034 """Look for a dataset in the cache and return its location. 

1035 

1036 Never finds a file. 

1037 """ 

1038 yield None 

1039 

1040 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None: 

1041 """Remove datasets from cache. 

1042 

1043 Always does nothing. 

1044 """ 

1045 return 

1046 

1047 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool: 

1048 """Report if a dataset is known to the cache. 

1049 

1050 Always returns `False`. 

1051 """ 

1052 return False 

1053 

1054 def __str__(self) -> str: 

1055 return f"{type(self).__name__}()"