Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 29%

396 statements  

coverage.py v7.2.7, created at 2023-07-21 09:55 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Cache management for a datastore."""

from __future__ import annotations

__all__ = (
    "AbstractDatastoreCacheManager",
    "DatastoreDisabledCacheManager",
    "DatastoreCacheManager",
    "DatastoreCacheManagerConfig",
)

import atexit
import contextlib
import datetime
import itertools
import logging
import os
import shutil
import tempfile
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict
from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView
from random import Random
from typing import TYPE_CHECKING

from lsst.daf.butler._compat import _BaseModelCompat
from lsst.resources import ResourcePath
from pydantic import PrivateAttr

from .config import ConfigSubset
from .configSupport import processLookupConfigs
from .datasets import DatasetId, DatasetRef

if TYPE_CHECKING:
    from .configSupport import LookupKey
    from .datasets import DatasetType
    from .dimensions import DimensionUniverse
    from .storageClass import StorageClass

log = logging.getLogger(__name__)
def remove_cache_directory(directory: str) -> None:
    """Remove the specified directory and all its contents."""
    log.debug("Removing temporary cache directory %s", directory)
    shutil.rmtree(directory, ignore_errors=True)


def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath:
    """Construct the full path to use for this dataset in the cache.

    Parameters
    ----------
    root : `lsst.resources.ResourcePath`
        Root of the cache directory.
    ref : `DatasetRef`
        The dataset to look up in or write to the cache.
    extension : `str`
        File extension to use for this file. Should include the
        leading "``.``".

    Returns
    -------
    uri : `lsst.resources.ResourcePath`
        URI to use for this dataset in the cache.
    """
    # Dataset type component is needed in the name if composite
    # disassembly is happening since the ID is shared for all components.
    component = ref.datasetType.component()
    component = f"_{component}" if component else ""
    return root.join(f"{ref.id}{component}{extension}")
def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]:
    """For a given cache name, return its component parts.

    Changes to ``_construct_cache_path()`` should be reflected here.

    Parameters
    ----------
    cached_location : `str`
        The name of the file within the cache.

    Returns
    -------
    id : `uuid.UUID`
        The dataset ID.
    component : `str` or `None`
        The name of the component, if present.
    extension : `str` or `None`
        The file extension, if present.
    """
    # Assume first dot is the extension and so allow .fits.gz
    root_ext = cached_location.split(".", maxsplit=1)
    root = root_ext.pop(0)
    ext = "." + root_ext.pop(0) if root_ext else None

    parts = root.split("_")
    id_ = uuid.UUID(parts.pop(0))
    component = parts.pop(0) if parts else None
    return id_, component, ext
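
# --- Editor's note: illustrative sketch, not part of the original module. ---
# The two helpers above define the cache file naming convention: the dataset
# ID, an optional "_<component>" suffix for disassembled composites, and the
# file extension (which may itself contain a dot, e.g. ".fits.gz"). A round
# trip with an arbitrary, hypothetical UUID looks like this:
#
#     example_id = uuid.uuid4()
#     example_name = f"{example_id}_wcs.fits.gz"
#     assert _parse_cache_name(example_name) == (example_id, "wcs", ".fits.gz")
# ----------------------------------------------------------------------------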



class CacheEntry(_BaseModelCompat):
    """Represent an entry in the cache."""

    name: str
    """Name of the file."""

    size: int
    """Size of the file in bytes."""

    ctime: datetime.datetime
    """Creation time of the file."""

    ref: DatasetId
    """ID of this dataset."""

    component: str | None = None
    """Component for this disassembled composite (optional)."""

    @classmethod
    def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry:
        """Construct an object from a file name.

        Parameters
        ----------
        file : `lsst.resources.ResourcePath`
            Path to the file.
        root : `lsst.resources.ResourcePath`
            Cache root directory.
        """
        file_in_cache = file.relative_to(root)
        if file_in_cache is None:
            raise ValueError(f"Supplied file {file} is not inside root {root}")
        id_, component, _ = _parse_cache_name(file_in_cache)

        stat = os.stat(file.ospath)
        return cls(
            name=file_in_cache,
            size=stat.st_size,
            ref=id_,
            component=component,
            ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime),
        )


class _MarkerEntry(CacheEntry):
    pass


class CacheRegistry(_BaseModelCompat):
    """Collection of cache entries."""

    _size: int = PrivateAttr(0)
    """Size of the cache."""

    _entries: dict[str, CacheEntry] = PrivateAttr({})
    """Internal collection of cache entries."""

    _ref_map: dict[DatasetId, list[str]] = PrivateAttr({})
    """Mapping of DatasetID to corresponding keys in cache registry."""

    @property
    def cache_size(self) -> int:
        return self._size

    def __getitem__(self, key: str) -> CacheEntry:
        return self._entries[key]

    def __setitem__(self, key: str, entry: CacheEntry) -> None:
        self._size += entry.size
        self._entries[key] = entry

        # Update the mapping from ref to path.
        if entry.ref not in self._ref_map:
            self._ref_map[entry.ref] = []
        self._ref_map[entry.ref].append(key)

    def __delitem__(self, key: str) -> None:
        entry = self._entries.pop(key)
        self._decrement(entry)
        self._ref_map[entry.ref].remove(key)

    def _decrement(self, entry: CacheEntry | None) -> None:
        if entry:
            self._size -= entry.size
            if self._size < 0:
                log.warning("Cache size has gone negative. Inconsistent cache records...")
                self._size = 0

    def __contains__(self, key: str) -> bool:
        return key in self._entries

    def __len__(self) -> int:
        return len(self._entries)

    def __iter__(self) -> Iterator[str]:  # type: ignore
        return iter(self._entries)

    def keys(self) -> KeysView[str]:
        return self._entries.keys()

    def values(self) -> ValuesView[CacheEntry]:
        return self._entries.values()

    def items(self) -> ItemsView[str, CacheEntry]:
        return self._entries.items()

    # A private marker to indicate that pop() should raise if no default
    # is given.
    __marker = _MarkerEntry(
        name="marker",
        size=0,
        ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"),
        ctime=datetime.datetime.utcfromtimestamp(0),
    )

    def pop(self, key: str, default: CacheEntry | None = __marker) -> CacheEntry | None:
        # The marker for dict.pop is not the same as our marker.
        if default is self.__marker:
            entry = self._entries.pop(key)
        else:
            entry = self._entries.pop(key, self.__marker)
            # Should not attempt to correct for this entry being removed
            # if we got the default value.
            if entry is self.__marker:
                return default

        self._decrement(entry)
        # The default entry given to this method may not even be in the cache.
        if entry and entry.ref in self._ref_map:
            keys = self._ref_map[entry.ref]
            if key in keys:
                keys.remove(key)
        return entry

    def get_dataset_keys(self, dataset_id: DatasetId | None) -> list[str] | None:
        """Retrieve all keys associated with the given dataset ID.

        Parameters
        ----------
        dataset_id : `DatasetId` or `None`
            The dataset ID to look up. Returns `None` if the ID is `None`.

        Returns
        -------
        keys : `list` [`str`]
            Keys associated with this dataset. These keys can be used to
            look up the cache entry information in the `CacheRegistry`.
            Returns `None` if the dataset is not known to the cache.
        """
        if dataset_id not in self._ref_map:
            return None
        keys = self._ref_map[dataset_id]
        if not keys:
            return None
        return keys
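
    # --- Editor's note: illustrative sketch, not part of the original module. ---
    # CacheRegistry keeps three views consistent: the per-key entries, the
    # running byte total, and the DatasetId -> [keys] map behind
    # get_dataset_keys(). A minimal, hypothetical exercise of that bookkeeping
    # (all values are arbitrary):
    #
    #     entry = CacheEntry(
    #         name="example.fits",
    #         size=1024,
    #         ref=uuid.uuid4(),
    #         ctime=datetime.datetime.utcnow(),
    #     )
    #     registry = CacheRegistry()
    #     registry["example.fits"] = entry
    #     assert registry.cache_size == 1024
    #     assert registry.get_dataset_keys(entry.ref) == ["example.fits"]
    #     registry.pop("example.fits")
    #     assert registry.cache_size == 0
    # ----------------------------------------------------------------------------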



class DatastoreCacheManagerConfig(ConfigSubset):
    """Configuration information for `DatastoreCacheManager`."""

    component = "cached"
    requiredKeys = ("cacheable",)


class AbstractDatastoreCacheManager(ABC):
    """An abstract base class for managing caching in a Datastore.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    @property
    def cache_size(self) -> int:
        """Size of the cache in bytes."""
        return 0

    @property
    def file_count(self) -> int:
        """Return number of cached files tracked by registry."""
        return 0

    def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
        if not isinstance(config, DatastoreCacheManagerConfig):
            config = DatastoreCacheManagerConfig(config)
        assert isinstance(config, DatastoreCacheManagerConfig)
        self.config = config

    @abstractmethod
    def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
        """Indicate whether the entity should be added to the cache.

        This is relevant when reading or writing.

        Parameters
        ----------
        entity : `StorageClass` or `DatasetType` or `DatasetRef`
            Thing to test against the configuration. The ``name`` property
            is used to determine a match. A `DatasetType` will first check
            its name, before checking its `StorageClass`. If there are no
            matches the default will be returned.

        Returns
        -------
        should_cache : `bool`
            Returns `True` if the dataset should be cached; `False` otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.
        """
        raise NotImplementedError()

    @abstractmethod
    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
        """Move a file to the cache.

        Move the given file into the cache, using the supplied DatasetRef
        for naming. A call is made to `should_be_cached()`; if the
        DatasetRef is not accepted, `None` will be returned.

        Cache expiry can occur during this operation.

        Parameters
        ----------
        uri : `lsst.resources.ResourcePath`
            Location of the file to be relocated to the cache. Will be moved.
        ref : `DatasetRef`
            Ref associated with this file. Will be used to determine the name
            of the file within the cache.

        Returns
        -------
        new : `lsst.resources.ResourcePath` or `None`
            URI to the file within the cache, or `None` if the dataset
            was not accepted by the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
        """Look for a dataset in the cache and return its location.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to locate in the cache.
        extension : `str`
            File extension expected. Should include the leading "``.``".

        Yields
        ------
        uri : `lsst.resources.ResourcePath` or `None`
            The URI to the cached file, or `None` if the file has not been
            cached.

        Notes
        -----
        Should be used as a context manager in order to prevent this
        file from being removed from the cache for that context.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None:
        """Remove the specified datasets from the cache.

        It is not an error for these datasets to be missing from the cache.

        Parameters
        ----------
        ref : `DatasetRef` or iterable of `DatasetRef`
            The datasets to remove from the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    def __str__(self) -> str:
        raise NotImplementedError()


class DatastoreCacheManager(AbstractDatastoreCacheManager):
    """A class for managing caching in a Datastore using local files.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.

    Notes
    -----
    Two environment variables can be used to override the cache directory
    and expiration configuration:

    * ``$DAF_BUTLER_CACHE_DIRECTORY``
    * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``

    The expiration mode should take the form ``mode=threshold``; for example,
    to configure expiration to limit the cache directory to 5 datasets the
    value would be ``datasets=5``.

    Additionally, the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment
    variable can be used to indicate that this directory should be used
    if no explicit directory has been specified from configuration or from
    the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable.
    """

    _temp_exemption_prefix = "exempt/"
    _tmpdir_prefix = "butler-cache-dir-"

    def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
        super().__init__(config, universe)

        # Set cache directory if it pre-exists, else defer creation until
        # requested. Allow external override from environment.
        root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")

        # Allow the execution environment to override the default values
        # so long as no default value has been set from the line above.
        if root is None:
            root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET")

        self._cache_directory = (
            ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None
        )

        if self._cache_directory:
            if not self._cache_directory.isLocal:
                raise ValueError(
                    f"Cache directory must be on a local file system. Got: {self._cache_directory}"
                )
            # Ensure that the cache directory is created. We assume that
            # someone specifying a permanent cache directory will be expecting
            # it to always be there. This will also trigger an error
            # early rather than waiting until the cache is needed.
            self._cache_directory.mkdir()

        # Calculate the caching lookup table.
        self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)

        # Default decision for whether a dataset should be cached.
        self._caching_default = self.config.get("default", False)

        # Expiration mode. Read from config but allow override from
        # the environment.
        expiration_mode = self.config.get(("expiry", "mode"))
        threshold = self.config.get(("expiry", "threshold"))

        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
        if external_mode and "=" in external_mode:
            expiration_mode, expiration_threshold = external_mode.split("=", 1)
            threshold = int(expiration_threshold)
        if expiration_mode is None:
            # Force to None to avoid confusion.
            threshold = None

        self._expiration_mode: str | None = expiration_mode
        self._expiration_threshold: int | None = threshold
        if self._expiration_threshold is None and self._expiration_mode is not None:
            raise ValueError(
                f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
            )

        log.debug(
            "Cache configuration:\n- root: %s\n- expiration mode: %s",
            self._cache_directory if self._cache_directory else "tmpdir",
            f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
        )

        # Files in cache, indexed by path within the cache directory.
        self._cache_entries = CacheRegistry()

    @property
    def cache_directory(self) -> ResourcePath:
        if self._cache_directory is None:
            # Create on demand. Allow the override environment variable
            # to be used in case it got set after this object was created
            # but before a cache was used.
            if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
                # Someone else will clean this up.
                isTemporary = False
                msg = "deferred fallback"
            else:
                cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix)
                isTemporary = True
                msg = "temporary"

            self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary)
            log.debug("Using %s cache directory at %s", msg, self._cache_directory)

            # Remove when we no longer need it.
            if isTemporary:
                atexit.register(remove_cache_directory, self._cache_directory.ospath)
        return self._cache_directory

    @property
    def _temp_exempt_directory(self) -> ResourcePath:
        """Return the directory in which to store temporary cache files that
        should not be expired.
        """
        return self.cache_directory.join(self._temp_exemption_prefix)

    @property
    def cache_size(self) -> int:
        return self._cache_entries.cache_size

    @property
    def file_count(self) -> int:
        return len(self._cache_entries)

    @classmethod
    def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]:
        """Define a fallback cache directory if a fallback is not already set.

        Returns
        -------
        defined : `bool`
            `True` if the fallback directory was newly-defined in this method.
            `False` if it had already been set.
        cache_dir : `str`
            Returns the path to the cache directory that will be used if it's
            needed. This can allow the caller to run a directory cleanup
            when it's no longer needed (something that the cache manager
            can not do because forks should not clean up directories defined
            by the parent process).

        Notes
        -----
        The fallback directory will not be defined if one has already been
        defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET``
        environment variable only if a value has not previously been stored
        in that environment variable. Setting the environment variable allows
        this value to survive into spawned subprocesses. Calling this method
        will lead to all subsequently created cache managers sharing the same
        cache.
        """
        if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
            # A value has already been set.
            return (False, cache_dir)

        # As a class method, we do not know at this point whether a cache
        # directory will be needed so it would be impolite to create a
        # directory that will never be used.

        # Construct our own temp name -- 16 characters should have a fairly
        # low chance of clashing when combined with the process ID.
        characters = "abcdefghijklmnopqrstuvwxyz0123456789_"
        rng = Random()
        tempchars = "".join(rng.choice(characters) for _ in range(16))

        tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}"

        cache_dir = os.path.join(tempfile.gettempdir(), tempname)
        os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir
        return (True, cache_dir)
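
    # --- Editor's note: illustrative sketch, not part of the original module. ---
    # The environment hooks above can be combined: a parent process pins a shared
    # fallback directory before spawning workers and overrides the expiration
    # policy, assuming no explicit cache root is configured elsewhere. Values are
    # arbitrary examples.
    #
    #     os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=5"
    #     newly_defined, cache_dir = (
    #         DatastoreCacheManager.set_fallback_cache_directory_if_unset()
    #     )
    #     # ... spawn subprocesses; cache managers created afterwards share
    #     # cache_dir and expire the cache down to 5 datasets ...
    #     if newly_defined:
    #         shutil.rmtree(cache_dir, ignore_errors=True)  # caller cleans up
    # ----------------------------------------------------------------------------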


    def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
        # Docstring inherited
        matchName: LookupKey | str = f"{entity} (via default)"
        should_cache = self._caching_default

        for key in entity._lookupNames():
            if key in self._lut:
                should_cache = bool(self._lut[key])
                matchName = key
                break

        if not isinstance(should_cache, bool):
            raise TypeError(
                f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
            )

        log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
        return should_cache

    def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath:
        """Construct the name to use for this dataset in the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to look up in or write to the cache.
        extension : `str`
            File extension to use for this file. Should include the
            leading "``.``".

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI to use for this dataset in the cache.
        """
        return _construct_cache_path(self.cache_directory, ref, extension)

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
        # Docstring inherited
        if not self.should_be_cached(ref):
            return None

        # Write the file using the id of the dataset ref and the file
        # extension.
        cached_location = self._construct_cache_name(ref, uri.getExtension())

        # Run cache expiry to ensure that we have room for this
        # item.
        self._expire_cache()

        # The expiry above re-synced the in-memory cache status. It's entirely
        # possible that another process has just cached this file (if multiple
        # processes are caching on read), so check our in-memory cache
        # before attempting to cache the dataset.
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache and path_in_cache in self._cache_entries:
            return cached_location

        # Move into the cache. Given that multiple processes might be
        # sharing a single cache directory, and the file we need might have
        # been copied in whilst we were checking, allow overwrite without
        # complaint. Even for a private cache directory it is possible that
        # a second butler in a subprocess could be writing to it.
        cached_location.transfer_from(uri, transfer="move", overwrite=True)
        log.debug("Cached dataset %s to %s", ref, cached_location)

        self._register_cache_entry(cached_location)

        return cached_location

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
        # Docstring inherited
        # Short circuit this if the cache directory has not been created yet.
        if self._cache_directory is None:
            yield None
            return

        cached_location = self._construct_cache_name(ref, extension)
        if cached_location.exists():
            log.debug("Found cached file %s for dataset %s.", cached_location, ref)

            # The cached file could be removed by another process doing
            # cache expiration so we need to protect against that by making
            # a copy in a different tree. Use hardlinks to ensure that
            # we either have the cached file or we don't. This is robust
            # against race conditions that can be caused by using soft links
            # and the other end of the link being deleted just after it
            # is created.
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"

            # Need to use a unique file name for the temporary location to
            # ensure that two different processes can read the file
            # simultaneously without one of them deleting it when it's in
            # use elsewhere. Retain the original filename for easier debugging.
            random = str(uuid.uuid4())[:8]
            basename = cached_location.basename()
            filename = f"{random}-{basename}"

            temp_location: ResourcePath | None = self._temp_exempt_directory.join(filename)
            try:
                if temp_location is not None:
                    temp_location.transfer_from(cached_location, transfer="hardlink")
            except Exception as e:
                log.debug("Detected error creating hardlink for dataset %s: %s", ref, e)
                # Any failure will be treated as if the file was not
                # in the cache. Yielding the original cache location
                # is too dangerous.
                temp_location = None

            try:
                log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
                yield temp_location
            finally:
                try:
                    if temp_location:
                        temp_location.remove()
                except FileNotFoundError:
                    pass
            return

        log.debug("Dataset %s not found in cache.", ref)
        yield None
        return
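
    # --- Editor's note: illustrative sketch, not part of the original module. ---
    # find_in_cache() is a context manager: the hardlinked copy it yields is only
    # guaranteed to exist inside the ``with`` block, so reads should happen there.
    # ``cache_manager``, ``ref`` and ``read_file`` are hypothetical names.
    #
    #     with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
    #         if cached_uri is not None:
    #             data = read_file(cached_uri.ospath)  # cache hit
    #     # outside the block the temporary hardlink may already be removed
    # ----------------------------------------------------------------------------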


    def remove_from_cache(self, refs: DatasetRef | Iterable[DatasetRef]) -> None:
        # Docstring inherited.

        # Stop early if there are no cache entries anyhow.
        if len(self._cache_entries) == 0:
            return

        if isinstance(refs, DatasetRef):
            refs = [refs]

        # Create a set of all the IDs
        all_ids = {ref.id for ref in refs}

        keys_to_remove = []
        for key, entry in self._cache_entries.items():
            if entry.ref in all_ids:
                keys_to_remove.append(key)
        self._remove_from_cache(keys_to_remove)

    def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str | None:
        """Record the file in the cache registry.

        Parameters
        ----------
        cached_location : `lsst.resources.ResourcePath`
            Location of the file to be registered.
        can_exist : `bool`, optional
            If `True` the item being registered can already be listed.
            This can allow a cache refresh to run without checking the
            file again. If `False` it is an error for the registry to
            already know about this file.

        Returns
        -------
        cache_key : `str` or `None`
            The key used in the registry for this file. `None` if the file
            no longer exists (it could have been expired by another process).
        """
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache is None:
            raise ValueError(
                f"Can not register cached file {cached_location} that is not within"
                f" the cache directory at {self.cache_directory}."
            )
        if path_in_cache in self._cache_entries:
            if can_exist:
                return path_in_cache
            else:
                raise ValueError(
                    f"Cached file {cached_location} is already known to the registry"
                    " but this was expected to be a new file."
                )
        try:
            details = CacheEntry.from_file(cached_location, root=self.cache_directory)
        except FileNotFoundError:
            return None
        self._cache_entries[path_in_cache] = details
        return path_in_cache

    def scan_cache(self) -> None:
        """Scan the cache directory and record information about files."""
        found = set()
        for file in ResourcePath.findFileResources([self.cache_directory]):
            assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"

            # Skip any that are found in an exempt part of the hierarchy
            # since they should not be part of the registry.
            if file.relative_to(self._temp_exempt_directory) is not None:
                continue

            path_in_cache = self._register_cache_entry(file, can_exist=True)
            if path_in_cache:
                found.add(path_in_cache)

        # Find any files that were recorded in the cache but are no longer
        # on disk. (something else cleared them out?)
        known_to_cache = set(self._cache_entries)
        missing = known_to_cache - found

        if missing:
            log.debug(
                "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing)
            )
            for path_in_cache in missing:
                self._cache_entries.pop(path_in_cache, None)

    def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise. If the dataset refers to a component and
            an extension is given then only that component is checked.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.

        This method does not force the cache to be re-scanned and so can miss
        cached datasets that have recently been written by other processes.
        """
        if self._cache_directory is None:
            return False
        if self.file_count == 0:
            return False

        if extension is None:
            # Look solely for matching dataset ref ID and not specific
            # components.
            cached_paths = self._cache_entries.get_dataset_keys(ref.id)
            return True if cached_paths else False

        else:
            # Extension is known so we can do an explicit look up for the
            # cache entry.
            cached_location = self._construct_cache_name(ref, extension)
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None  # For mypy
            return path_in_cache in self._cache_entries

    def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
        """Remove the specified cache entries from cache.

        Parameters
        ----------
        cache_entries : iterable of `str`
            The entries to remove from the cache. The values are the path
            within the cache.
        """
        for entry in cache_entries:
            path = self.cache_directory.join(entry)

            self._cache_entries.pop(entry, None)
            log.debug("Removing file from cache: %s", path)
            try:
                path.remove()
            except FileNotFoundError:
                pass

    def _expire_cache(self) -> None:
        """Expire the files in the cache.

        Notes
        -----
        The expiration modes are defined by the config or can be overridden.
        Available options:

        * ``files``: Number of files.
        * ``datasets``: Number of datasets.
        * ``size``: Total size of files.
        * ``age``: Age of files.

        The first three remove the oldest entries first.
        Number of files is complicated by the possibility of disassembled
        composites where 10 small files can be created for each dataset.

        Additionally there is a use case for an external user to explicitly
        state the dataset refs that should be cached and then when to
        remove them, overriding any global configuration.
        """
        if self._expiration_mode is None:
            # Expiration has been disabled.
            return

        # mypy can't be sure we have set a threshold properly
        if self._expiration_threshold is None:
            log.warning(
                "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode
            )
            return

        # Sync up cache. There is no file locking involved so for a shared
        # cache multiple processes may be racing to delete files. Deleting
        # a file that no longer exists is not an error.
        self.scan_cache()

        if self._expiration_mode == "files":
            n_files = len(self._cache_entries)
            n_over = n_files - self._expiration_threshold
            if n_over > 0:
                sorted_keys = self._sort_cache()
                keys_to_remove = sorted_keys[:n_over]
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "datasets":
            # Count the datasets, in ascending timestamp order,
            # so that the oldest turn up first.
            datasets = defaultdict(list)
            for key in self._sort_cache():
                entry = self._cache_entries[key]
                datasets[entry.ref].append(key)

            n_datasets = len(datasets)
            n_over = n_datasets - self._expiration_threshold
            if n_over > 0:
                # Keys will be read out in insert order, which
                # will be date order, so the oldest ones are removed.
                ref_ids = list(datasets.keys())[:n_over]
                keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "size":
            if self.cache_size > self._expiration_threshold:
                for key in self._sort_cache():
                    self._remove_from_cache([key])
                    if self.cache_size <= self._expiration_threshold:
                        break
            return

        if self._expiration_mode == "age":
            now = datetime.datetime.utcnow()
            for key in self._sort_cache():
                delta = now - self._cache_entries[key].ctime
                # Compare the full elapsed time in seconds; timedelta.seconds
                # alone would ignore any whole days in the age.
                if delta.total_seconds() > self._expiration_threshold:
                    self._remove_from_cache([key])
                else:
                    # We're already in date order.
                    break
            return

        raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")

    def _sort_cache(self) -> list[str]:
        """Sort the cache entries by time and return the sorted keys.

        Returns
        -------
        sorted : `list` of `str`
            Keys into the cache, sorted by time with oldest first.
        """

        def sort_by_time(key: str) -> datetime.datetime:
            """Sorter key function using cache entry details."""
            return self._cache_entries[key].ctime

        return sorted(self._cache_entries, key=sort_by_time)

    def __str__(self) -> str:
        cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
        return (
            f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
            f"default={self._caching_default}) "
            f"n_files={self.file_count}, n_bytes={self.cache_size}"
        )


class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
    """A variant of the datastore cache where no cache is enabled.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
        return

    def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
        """Indicate whether the entity should be added to the cache.

        Always returns `False`.
        """
        return False

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
        """Move dataset to cache but always refuse and return `None`."""
        return None

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
        """Look for a dataset in the cache and return its location.

        Never finds a file.
        """
        yield None

    def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None:
        """Remove datasets from cache.

        Always does nothing.
        """
        return

    def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
        """Report if a dataset is known to the cache.

        Always returns `False`.
        """
        return False

    def __str__(self) -> str:
        return f"{type(self).__name__}()"