Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 23%

396 statements  

coverage.py v6.5.0, created at 2023-04-01 02:05 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Cache management for a datastore.""" 

25 

26__all__ = ( 

27 "AbstractDatastoreCacheManager", 

28 "DatastoreDisabledCacheManager", 

29 "DatastoreCacheManager", 

30 "DatastoreCacheManagerConfig", 

31) 

32 

33import atexit 

34import contextlib 

35import datetime 

36import itertools 

37import logging 

38import os 

39import shutil 

40import tempfile 

41import uuid 

42from abc import ABC, abstractmethod 

43from collections import defaultdict 

44from random import Random 

45from typing import ( 

46 TYPE_CHECKING, 

47 Dict, 

48 ItemsView, 

49 Iterable, 

50 Iterator, 

51 KeysView, 

52 List, 

53 Optional, 

54 Union, 

55 ValuesView, 

56) 

57 

58from lsst.resources import ResourcePath 

59from pydantic import BaseModel, PrivateAttr 

60 

61from .config import ConfigSubset 

62from .configSupport import processLookupConfigs 

63from .datasets import DatasetId, DatasetRef 

64 

65if TYPE_CHECKING: 

66 from .configSupport import LookupKey 

67 from .datasets import DatasetType 

68 from .dimensions import DimensionUniverse 

69 from .storageClass import StorageClass 

70 

71log = logging.getLogger(__name__) 

72 

73 

74def remove_cache_directory(directory: str) -> None: 

75 """Remove the specified directory and all its contents.""" 

76 log.debug("Removing temporary cache directory %s", directory) 

77 shutil.rmtree(directory, ignore_errors=True) 

78 

79 

80def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath: 

81 """Construct the full path to use for this dataset in the cache. 

82 

83 Parameters 

84 ---------- 

85 ref : `DatasetRef` 

86 The dataset to look up in or write to the cache. 

87 extension : `str` 

88 File extension to use for this file. Should include the 

89 leading "``.``". 

90 

91 Returns 

92 ------- 

93 uri : `lsst.resources.ResourcePath` 

94 URI to use for this dataset in the cache. 

95 """ 

96 # Dataset type component is needed in the name if composite 

97 # disassembly is happening since the ID is shared for all components. 

98 component = ref.datasetType.component() 

99 component = f"_{component}" if component else "" 

100 return root.join(f"{ref.id}{component}{extension}") 

101 

102 

103def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]: 

104 """For a given cache name, return its component parts. 

105 

106 Changes to ``_construct_cache_path()`` should be reflected here. 

107 

108 Parameters 

109 ---------- 

110 cached_location : `str` 

111 The name of the file within the cache. 

112 

113 Returns 

114 ------- 

115 id : `uuid.UUID` 

116 The dataset ID. 

117 component : `str` or `None` 

118 The name of the component, if present. 

119 extension : `str` or `None` 

120 The file extension, if present. 

121 """ 

122 # Assume everything after the first dot is the extension, so .fits.gz works. 

123 root_ext = cached_location.split(".", maxsplit=1) 

124 root = root_ext.pop(0) 

125 ext = "." + root_ext.pop(0) if root_ext else None 

126 

127 parts = root.split("_") 

128 id_ = uuid.UUID(parts.pop(0)) 

129 component = parts.pop(0) if parts else None 

130 return id_, component, ext 

131 
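A minimal sketch of the naming convention implemented by ``_construct_cache_path()`` and ``_parse_cache_name()`` above: a cached component file is named ``<dataset id>_<component><extension>``. The UUID and component values below are made up purely for illustration.

import uuid

# Hypothetical cache file name for the "wcs" component of a dataset,
# cached with a ".fits" extension.
example_name = "5f47c3c5-7f40-4a62-9f4b-0f1d0e6c9c3a_wcs.fits"

dataset_id, component, extension = _parse_cache_name(example_name)
assert dataset_id == uuid.UUID("5f47c3c5-7f40-4a62-9f4b-0f1d0e6c9c3a")
assert component == "wcs"
assert extension == ".fits"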

132 

133class CacheEntry(BaseModel): 

134 """Represent an entry in the cache.""" 

135 

136 name: str 

137 """Name of the file.""" 

138 

139 size: int 

140 """Size of the file in bytes.""" 

141 

142 ctime: datetime.datetime 

143 """Creation time of the file.""" 

144 

145 ref: DatasetId 

146 """ID of this dataset.""" 

147 

148 component: Optional[str] 

149 """Component for this disassembled composite (optional).""" 

150 

151 @classmethod 

152 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry: 

153 """Construct an object from a file name. 

154 

155 Parameters 

156 ---------- 

157 file : `lsst.resources.ResourcePath` 

158 Path to the file. 

159 root : `lsst.resources.ResourcePath` 

160 Cache root directory. 

161 """ 

162 file_in_cache = file.relative_to(root) 

163 if file_in_cache is None: 

164 raise ValueError(f"Supplied file {file} is not inside root {root}") 

165 id_, component, _ = _parse_cache_name(file_in_cache) 

166 

167 stat = os.stat(file.ospath) 

168 return cls( 

169 name=file_in_cache, 

170 size=stat.st_size, 

171 ref=id_, 

172 component=component, 

173 ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime), 

174 ) 

175 

176 

177class _MarkerEntry(CacheEntry): 

178 pass 

179 

180 

181class CacheRegistry(BaseModel): 

182 """Collection of cache entries.""" 

183 

184 _size: int = PrivateAttr(0) 

185 """Size of the cache.""" 

186 

187 _entries: Dict[str, CacheEntry] = PrivateAttr({}) 

188 """Internal collection of cache entries.""" 

189 

190 _ref_map: Dict[DatasetId, List[str]] = PrivateAttr({}) 

191 """Mapping of DatasetID to corresponding keys in cache registry.""" 

192 

193 @property 

194 def cache_size(self) -> int: 

195 return self._size 

196 

197 def __getitem__(self, key: str) -> CacheEntry: 

198 return self._entries[key] 

199 

200 def __setitem__(self, key: str, entry: CacheEntry) -> None: 

201 self._size += entry.size 

202 self._entries[key] = entry 

203 

204 # Update the mapping from ref to path. 

205 if entry.ref not in self._ref_map: 

206 self._ref_map[entry.ref] = [] 

207 self._ref_map[entry.ref].append(key) 

208 

209 def __delitem__(self, key: str) -> None: 

210 entry = self._entries.pop(key) 

211 self._decrement(entry) 

212 self._ref_map[entry.ref].remove(key) 

213 

214 def _decrement(self, entry: Optional[CacheEntry]) -> None: 

215 if entry: 

216 self._size -= entry.size 

217 if self._size < 0: 

218 log.warning("Cache size has gone negative. Inconsistent cache records...") 

219 self._size = 0 

220 

221 def __contains__(self, key: str) -> bool: 

222 return key in self._entries 

223 

224 def __len__(self) -> int: 

225 return len(self._entries) 

226 

227 def __iter__(self) -> Iterator[str]: # type: ignore 

228 return iter(self._entries) 

229 

230 def keys(self) -> KeysView[str]: 

231 return self._entries.keys() 

232 

233 def values(self) -> ValuesView[CacheEntry]: 

234 return self._entries.values() 

235 

236 def items(self) -> ItemsView[str, CacheEntry]: 

237 return self._entries.items() 

238 

239 # A private marker to indicate that pop() should raise if no default 

240 # is given. 

241 __marker = _MarkerEntry( 

242 name="marker", 

243 size=0, 

244 ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), 

245 ctime=datetime.datetime.utcfromtimestamp(0), 

246 ) 

247 

248 def pop(self, key: str, default: Optional[CacheEntry] = __marker) -> Optional[CacheEntry]: 

249 # The marker for dict.pop is not the same as our marker. 

250 if default is self.__marker: 

251 entry = self._entries.pop(key) 

252 else: 

253 entry = self._entries.pop(key, self.__marker) 

254 # Should not attempt to correct for this entry being removed 

255 # if we got the default value. 

256 if entry is self.__marker: 

257 return default 

258 

259 self._decrement(entry) 

260 # The default entry given to this method may not even be in the cache. 

261 if entry and entry.ref in self._ref_map: 

262 keys = self._ref_map[entry.ref] 

263 if key in keys: 

264 keys.remove(key) 

265 return entry 

266 

267 def get_dataset_keys(self, dataset_id: Optional[DatasetId]) -> Optional[List[str]]: 

268 """Retrieve all keys associated with the given dataset ID. 

269 

270 Parameters 

271 ---------- 

272 dataset_id : `DatasetId` or `None` 

273 The dataset ID to look up. Returns `None` if the ID is `None`. 

274 

275 Returns 

276 ------- 

277 keys : `list` [`str`] 

278 Keys associated with this dataset. These keys can be used to look up 

279 the cache entry information in the `CacheRegistry`. Returns 

280 `None` if the dataset is not known to the cache. 

281 """ 

282 if dataset_id not in self._ref_map: 

283 return None 

284 keys = self._ref_map[dataset_id] 

285 if not keys: 

286 return None 

287 return keys 

288 
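A minimal sketch of how the registry above tracks entries, using made-up values; in normal operation entries are created via ``CacheEntry.from_file()`` and registered by the cache manager.

import datetime
import uuid

registry = CacheRegistry()
dataset_id = uuid.uuid4()

# Hypothetical entry; the name and size are illustrative only.
entry = CacheEntry(
    name="example.fits",
    size=1024,
    ref=dataset_id,
    ctime=datetime.datetime.utcnow(),
)
registry["example.fits"] = entry

assert registry.cache_size == 1024
assert registry.get_dataset_keys(dataset_id) == ["example.fits"]

# Removing the entry updates the running size total.
registry.pop("example.fits")
assert registry.cache_size == 0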

289 

290class DatastoreCacheManagerConfig(ConfigSubset): 

291 """Configuration information for `DatastoreCacheManager`.""" 

292 

293 component = "cached" 

294 requiredKeys = ("cacheable",) 

295 

296 

297class AbstractDatastoreCacheManager(ABC): 

298 """An abstract base class for managing caching in a Datastore. 

299 

300 Parameters 

301 ---------- 

302 config : `str` or `DatastoreCacheManagerConfig` 

303 Configuration to control caching. 

304 universe : `DimensionUniverse` 

305 Set of all known dimensions, used to expand and validate any used 

306 in lookup keys. 

307 """ 

308 

309 @property 

310 def cache_size(self) -> int: 

311 """Size of the cache in bytes.""" 

312 return 0 

313 

314 @property 

315 def file_count(self) -> int: 

316 """Return number of cached files tracked by registry.""" 

317 return 0 

318 

319 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

320 if not isinstance(config, DatastoreCacheManagerConfig): 

321 config = DatastoreCacheManagerConfig(config) 

322 assert isinstance(config, DatastoreCacheManagerConfig) 

323 self.config = config 

324 

325 @abstractmethod 

326 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

327 """Indicate whether the entity should be added to the cache. 

328 

329 This is relevant when reading or writing. 

330 

331 Parameters 

332 ---------- 

333 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

334 Thing to test against the configuration. The ``name`` property 

335 is used to determine a match. A `DatasetType` will first check 

336 its name, before checking its `StorageClass`. If there are no 

337 matches the default will be returned. 

338 

339 Returns 

340 ------- 

341 should_cache : `bool` 

342 Returns `True` if the dataset should be cached; `False` otherwise. 

343 """ 

344 raise NotImplementedError() 

345 

346 @abstractmethod 

347 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

348 """Report if the dataset is known to the cache. 

349 

350 Parameters 

351 ---------- 

352 ref : `DatasetRef` 

353 Dataset to check for in the cache. 

354 extension : `str`, optional 

355 File extension expected. Should include the leading "``.``". 

356 If `None` the extension is ignored and the dataset ID alone is 

357 used to check in the cache. The extension must be defined if 

358 a specific component is being checked. 

359 

360 Returns 

361 ------- 

362 known : `bool` 

363 Returns `True` if the dataset is currently known to the cache 

364 and `False` otherwise. 

365 

366 Notes 

367 ----- 

368 This method can only report if the dataset is known to the cache 

369 in this specific instant and does not indicate whether the file 

370 can be read from the cache later. `find_in_cache()` should be called 

371 if the cached file is to be used. 

372 """ 

373 raise NotImplementedError() 

374 

375 @abstractmethod 

376 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

377 """Move a file to the cache. 

378 

379 Move the given file into the cache, using the supplied DatasetRef 

380 for naming. A call is made to `should_be_cached()` and if the 

381 DatasetRef should not be accepted `None` will be returned. 

382 

383 Cache expiry can occur during this. 

384 

385 Parameters 

386 ---------- 

387 uri : `lsst.resources.ResourcePath` 

388 Location of the file to be relocated to the cache. Will be moved. 

389 ref : `DatasetRef` 

390 Ref associated with this file. Will be used to determine the name 

391 of the file within the cache. 

392 

393 Returns 

394 ------- 

395 new : `lsst.resources.ResourcePath` or `None` 

396 URI to the file within the cache, or `None` if the dataset 

397 was not accepted by the cache. 

398 """ 

399 raise NotImplementedError() 

400 

401 @abstractmethod 

402 @contextlib.contextmanager 

403 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

404 """Look for a dataset in the cache and return its location. 

405 

406 Parameters 

407 ---------- 

408 ref : `DatasetRef` 

409 Dataset to locate in the cache. 

410 extension : `str` 

411 File extension expected. Should include the leading "``.``". 

412 

413 Yields 

414 ------ 

415 uri : `lsst.resources.ResourcePath` or `None` 

416 The URI to the cached file, or `None` if the file has not been 

417 cached. 

418 

419 Notes 

420 ----- 

421 Should be used as a context manager in order to prevent this 

422 file from being removed from the cache for that context. 

423 """ 

424 raise NotImplementedError() 

425 

426 @abstractmethod 

427 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

428 """Remove the specified datasets from the cache. 

429 

430 It is not an error for these datasets to be missing from the cache. 

431 

432 Parameters 

433 ---------- 

434 ref : `DatasetRef` or iterable of `DatasetRef` 

435 The datasets to remove from the cache. 

436 """ 

437 raise NotImplementedError() 

438 

439 @abstractmethod 

440 def __str__(self) -> str: 

441 raise NotImplementedError() 

442 

443 

444class DatastoreCacheManager(AbstractDatastoreCacheManager): 

445 """A class for managing caching in a Datastore using local files. 

446 

447 Parameters 

448 ---------- 

449 config : `str` or `DatastoreCacheManagerConfig` 

450 Configuration to control caching. 

451 universe : `DimensionUniverse` 

452 Set of all known dimensions, used to expand and validate any used 

453 in lookup keys. 

454 

455 Notes 

456 ----- 

457 Two environment variables can be used to override the cache directory 

458 and expiration configuration: 

459 

460 * ``$DAF_BUTLER_CACHE_DIRECTORY`` 

461 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` 

462 

463 The expiration mode should take the form ``mode=threshold``; for 

464 example, to limit the cache directory to 5 datasets the value 

465 would be ``datasets=5``. 

466 

467 Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment 

468 variable can be used to indicate that this directory should be used 

469 if no explicit directory has been specified from configuration or from 

470 the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable. 

471 """ 

472 

473 _temp_exemption_prefix = "exempt/" 

474 _tmpdir_prefix = "butler-cache-dir-" 

475 

476 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

477 super().__init__(config, universe) 

478 

479 # Set cache directory if it pre-exists, else defer creation until 

480 # requested. Allow external override from environment. 

481 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root") 

482 

483 # Allow the execution environment to override the default values 

484 # so long as no default value has been set from the line above. 

485 if root is None: 

486 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET") 

487 

488 self._cache_directory = ( 

489 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None 

490 ) 

491 

492 if self._cache_directory: 

493 if not self._cache_directory.isLocal: 

494 raise ValueError( 

495 f"Cache directory must be on a local file system. Got: {self._cache_directory}" 

496 ) 

497 # Ensure that the cache directory is created. We assume that 

498 # someone specifying a permanent cache directory will be expecting 

499 # it to always be there. This will also trigger an error 

500 # early rather than waiting until the cache is needed. 

501 self._cache_directory.mkdir() 

502 

503 # Calculate the caching lookup table. 

504 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe) 

505 

506 # Default decision for whether a dataset should be cached. 

507 self._caching_default = self.config.get("default", False) 

508 

509 # Expiration mode. Read from config but allow override from 

510 # the environment. 

511 expiration_mode = self.config.get(("expiry", "mode")) 

512 threshold = self.config.get(("expiry", "threshold")) 

513 

514 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE") 

515 if external_mode and "=" in external_mode: 

516 expiration_mode, expiration_threshold = external_mode.split("=", 1) 

517 threshold = int(expiration_threshold) 

518 if expiration_mode is None: 

519 # Force to None to avoid confusion. 

520 threshold = None 

521 

522 self._expiration_mode: Optional[str] = expiration_mode 

523 self._expiration_threshold: Optional[int] = threshold 

524 if self._expiration_threshold is None and self._expiration_mode is not None: 

525 raise ValueError( 

526 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}" 

527 ) 

528 

529 log.debug( 

530 "Cache configuration:\n- root: %s\n- expiration mode: %s", 

531 self._cache_directory if self._cache_directory else "tmpdir", 

532 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled", 

533 ) 

534 

535 # Files in cache, indexed by path within the cache directory. 

536 self._cache_entries = CacheRegistry() 

537 

538 @property 

539 def cache_directory(self) -> ResourcePath: 

540 if self._cache_directory is None: 

541 # Create on demand. Allow the override environment variable 

542 # to be used in case it got set after this object was created 

543 # but before a cache was used. 

544 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

545 # Someone else will clean this up. 

546 isTemporary = False 

547 msg = "deferred fallback" 

548 else: 

549 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix) 

550 isTemporary = True 

551 msg = "temporary" 

552 

553 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary) 

554 log.debug("Using %s cache directory at %s", msg, self._cache_directory) 

555 

556 # Remove when we no longer need it. 

557 if isTemporary: 

558 atexit.register(remove_cache_directory, self._cache_directory.ospath) 

559 return self._cache_directory 

560 

561 @property 

562 def _temp_exempt_directory(self) -> ResourcePath: 

563 """Return the directory in which to store temporary cache files that 

564 should not be expired. 

565 """ 

566 return self.cache_directory.join(self._temp_exemption_prefix) 

567 

568 @property 

569 def cache_size(self) -> int: 

570 return self._cache_entries.cache_size 

571 

572 @property 

573 def file_count(self) -> int: 

574 return len(self._cache_entries) 

575 

576 @classmethod 

577 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]: 

578 """Defines a fallback cache directory if a fallback not set already. 

579 

580 Returns 

581 ------- 

582 defined : `bool` 

583 `True` if the fallback directory was newly-defined in this method. 

584 `False` if it had already been set. 

585 cache_dir : `str` 

586 Returns the path to the cache directory that will be used if it's 

587 needed. This can allow the caller to run a directory cleanup 

588 when it's no longer needed (something that the cache manager 

589 can not do because forks should not clean up directories defined 

590 by the parent process). 

591 

592 Notes 

593 ----- 

594 The fallback directory will not be defined if one has already been 

595 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` 

596 environment variable only if a value has not previously been stored 

597 in that environment variable. Setting the environment variable allows 

598 this value to survive into spawned subprocesses. Calling this method 

599 will lead to all subsequently created cache managers sharing the same 

600 cache. 

601 """ 

602 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

603 # A value has already been set. 

604 return (False, cache_dir) 

605 

606 # As a class method, we do not know at this point whether a cache 

607 # directory will be needed so it would be impolite to create a 

608 # directory that will never be used. 

609 

610 # Construct our own temp name -- 16 characters should have a fairly 

611 # low chance of clashing when combined with the process ID. 

612 characters = "abcdefghijklmnopqrstuvwxyz0123456789_" 

613 rng = Random() 

614 tempchars = "".join(rng.choice(characters) for _ in range(16)) 

615 

616 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}" 

617 

618 cache_dir = os.path.join(tempfile.gettempdir(), tempname) 

619 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir 

620 return (True, cache_dir) 

621 
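One possible calling pattern suggested by the notes above; this is a sketch, not part of the class. A parent process defines the fallback before spawning subprocesses and, if it was the one to create the setting, cleans the directory up itself at exit.

import atexit
import shutil

defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
if defined:
    # This process introduced the fallback setting, so it owns the cleanup.
    # The directory may never be created, hence ignore_errors=True.
    atexit.register(shutil.rmtree, cache_dir, ignore_errors=True)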

622 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

623 # Docstring inherited 

624 matchName: Union[LookupKey, str] = "{} (via default)".format(entity) 

625 should_cache = self._caching_default 

626 

627 for key in entity._lookupNames(): 

628 if key in self._lut: 

629 should_cache = bool(self._lut[key]) 

630 matchName = key 

631 break 

632 

633 if not isinstance(should_cache, bool): 

634 raise TypeError( 

635 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool." 

636 ) 

637 

638 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not") 

639 return should_cache 

640 

641 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath: 

642 """Construct the name to use for this dataset in the cache. 

643 

644 Parameters 

645 ---------- 

646 ref : `DatasetRef` 

647 The dataset to look up in or write to the cache. 

648 extension : `str` 

649 File extension to use for this file. Should include the 

650 leading "``.``". 

651 

652 Returns 

653 ------- 

654 uri : `lsst.resources.ResourcePath` 

655 URI to use for this dataset in the cache. 

656 """ 

657 return _construct_cache_path(self.cache_directory, ref, extension) 

658 

659 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

660 # Docstring inherited 

661 if ref.id is None: 

662 raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})") 

663 

664 if not self.should_be_cached(ref): 

665 return None 

666 

667 # Write the file using the id of the dataset ref and the file 

668 # extension. 

669 cached_location = self._construct_cache_name(ref, uri.getExtension()) 

670 

671 # Run cache expiry to ensure that we have room for this 

672 # item. 

673 self._expire_cache() 

674 

675 # The above reset the in-memory cache status. It's entirely possible 

676 # that another process has just cached this file (if multiple 

677 # processes are caching on read), so check our in-memory cache 

678 # before attempting to cache the dataset. 

679 path_in_cache = cached_location.relative_to(self.cache_directory) 

680 if path_in_cache and path_in_cache in self._cache_entries: 

681 return cached_location 

682 

683 # Move into the cache. Given that multiple processes might be 

684 # sharing a single cache directory, and the file we need might have 

685 # been copied in whilst we were checking, allow overwrite without 

686 # complaint. Even for a private cache directory it is possible that 

687 # a second butler in a subprocess could be writing to it. 

688 cached_location.transfer_from(uri, transfer="move", overwrite=True) 

689 log.debug("Cached dataset %s to %s", ref, cached_location) 

690 

691 self._register_cache_entry(cached_location) 

692 

693 return cached_location 

694 

695 @contextlib.contextmanager 

696 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

697 # Docstring inherited 

698 # Short circuit this if the cache directory has not been created yet. 

699 if self._cache_directory is None: 

700 yield None 

701 return 

702 

703 cached_location = self._construct_cache_name(ref, extension) 

704 if cached_location.exists(): 

705 log.debug("Found cached file %s for dataset %s.", cached_location, ref) 

706 

707 # The cached file could be removed by another process doing 

708 # cache expiration so we need to protect against that by making 

709 # a copy in a different tree. Use hardlinks to ensure that 

710 # we either have the cached file or we don't. This is robust 

711 # against race conditions that can be caused by using soft links 

712 # and the other end of the link being deleted just after it 

713 # is created. 

714 path_in_cache = cached_location.relative_to(self.cache_directory) 

715 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory" 

716 

717 # Need to use a unique file name for the temporary location to 

718 # ensure that two different processes can read the file 

719 # simultaneously without one of them deleting it when it's in 

720 # use elsewhere. Retain the original filename for easier debugging. 

721 random = str(uuid.uuid4())[:8] 

722 basename = cached_location.basename() 

723 filename = f"{random}-{basename}" 

724 

725 temp_location: Optional[ResourcePath] = self._temp_exempt_directory.join(filename) 

726 try: 

727 if temp_location is not None: 

728 temp_location.transfer_from(cached_location, transfer="hardlink") 

729 except Exception as e: 

730 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e) 

731 # Any failure will be treated as if the file was not 

732 # in the cache. Yielding the original cache location 

733 # is too dangerous. 

734 temp_location = None 

735 

736 try: 

737 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref) 

738 yield temp_location 

739 finally: 

740 try: 

741 if temp_location: 

742 temp_location.remove() 

743 except FileNotFoundError: 

744 pass 

745 return 

746 

747 log.debug("Dataset %s not found in cache.", ref) 

748 yield None 

749 return 

750 

751 def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

752 # Docstring inherited. 

753 

754 # Stop early if there are no cache entries anyhow. 

755 if len(self._cache_entries) == 0: 

756 return 

757 

758 if isinstance(refs, DatasetRef): 

759 refs = [refs] 

760 

761 # Create a set of all the IDs 

762 all_ids = {ref.getCheckedId() for ref in refs} 

763 

764 keys_to_remove = [] 

765 for key, entry in self._cache_entries.items(): 

766 if entry.ref in all_ids: 

767 keys_to_remove.append(key) 

768 self._remove_from_cache(keys_to_remove) 

769 

770 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> Optional[str]: 

771 """Record the file in the cache registry. 

772 

773 Parameters 

774 ---------- 

775 cached_location : `lsst.resources.ResourcePath` 

776 Location of the file to be registered. 

777 can_exist : `bool`, optional 

778 If `True` the item being registered can already be listed. 

779 This can allow a cache refresh to run without checking the 

780 file again. If `False` it is an error for the registry to 

781 already know about this file. 

782 

783 Returns 

784 ------- 

785 cache_key : `str` or `None` 

786 The key used in the registry for this file. `None` if the file 

787 no longer exists (it could have been expired by another process). 

788 """ 

789 path_in_cache = cached_location.relative_to(self.cache_directory) 

790 if path_in_cache is None: 

791 raise ValueError( 

792 f"Can not register cached file {cached_location} that is not within" 

793 f" the cache directory at {self.cache_directory}." 

794 ) 

795 if path_in_cache in self._cache_entries: 

796 if can_exist: 

797 return path_in_cache 

798 else: 

799 raise ValueError( 

800 f"Cached file {cached_location} is already known to the registry" 

801 " but this was expected to be a new file." 

802 ) 

803 try: 

804 details = CacheEntry.from_file(cached_location, root=self.cache_directory) 

805 except FileNotFoundError: 

806 return None 

807 self._cache_entries[path_in_cache] = details 

808 return path_in_cache 

809 

810 def scan_cache(self) -> None: 

811 """Scan the cache directory and record information about files.""" 

812 found = set() 

813 for file in ResourcePath.findFileResources([self.cache_directory]): 

814 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator" 

815 

816 # Skip any that are found in an exempt part of the hierarchy 

817 # since they should not be part of the registry. 

818 if file.relative_to(self._temp_exempt_directory) is not None: 

819 continue 

820 

821 path_in_cache = self._register_cache_entry(file, can_exist=True) 

822 if path_in_cache: 

823 found.add(path_in_cache) 

824 

825 # Find any files that were recorded in the cache but are no longer 

826 # on disk. (something else cleared them out?) 

827 known_to_cache = set(self._cache_entries) 

828 missing = known_to_cache - found 

829 

830 if missing: 

831 log.debug( 

832 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing) 

833 ) 

834 for path_in_cache in missing: 

835 self._cache_entries.pop(path_in_cache, None) 

836 

837 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

838 """Report if the dataset is known to the cache. 

839 

840 Parameters 

841 ---------- 

842 ref : `DatasetRef` 

843 Dataset to check for in the cache. 

844 extension : `str`, optional 

845 File extension expected. Should include the leading "``.``". 

846 If `None` the extension is ignored and the dataset ID alone is 

847 used to check in the cache. The extension must be defined if 

848 a specific component is being checked. 

849 

850 Returns 

851 ------- 

852 known : `bool` 

853 Returns `True` if the dataset is currently known to the cache 

854 and `False` otherwise. If the dataset refers to a component and 

855 an extension is given then only that component is checked. 

856 

857 Notes 

858 ----- 

859 This method can only report if the dataset is known to the cache 

860 in this specific instant and does not indicate whether the file 

861 can be read from the cache later. `find_in_cache()` should be called 

862 if the cached file is to be used. 

863 

864 This method does not force the cache to be re-scanned and so can miss 

865 cached datasets that have recently been written by other processes. 

866 """ 

867 if self._cache_directory is None: 

868 return False 

869 if self.file_count == 0: 

870 return False 

871 

872 if extension is None: 

873 # Look solely for matching dataset ref ID and not specific 

874 # components. 

875 cached_paths = self._cache_entries.get_dataset_keys(ref.id) 

876 return True if cached_paths else False 

877 

878 else: 

879 # Extension is known so we can do an explicit look up for the 

880 # cache entry. 

881 cached_location = self._construct_cache_name(ref, extension) 

882 path_in_cache = cached_location.relative_to(self.cache_directory) 

883 assert path_in_cache is not None # For mypy 

884 return path_in_cache in self._cache_entries 

885 

886 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None: 

887 """Remove the specified cache entries from cache. 

888 

889 Parameters 

890 ---------- 

891 cache_entries : iterable of `str` 

892 The entries to remove from the cache. The values are the path 

893 within the cache. 

894 """ 

895 for entry in cache_entries: 

896 path = self.cache_directory.join(entry) 

897 

898 self._cache_entries.pop(entry, None) 

899 log.debug("Removing file from cache: %s", path) 

900 try: 

901 path.remove() 

902 except FileNotFoundError: 

903 pass 

904 

905 def _expire_cache(self) -> None: 

906 """Expire the files in the cache. 

907 

908 Notes 

909 ----- 

910 The expiration modes are defined by the config or can be overridden. 

911 Available options: 

912 

913 * ``files``: Number of files. 

914 * ``datasets``: Number of datasets. 

915 * ``size``: Total size of files. 

916 * ``age``: Age of files. 

917 

918 The first three modes remove the oldest entries first. 

919 Number of files is complicated by the possibility of disassembled 

920 composites where 10 small files can be created for each dataset. 

921 

922 Additionally, there is a use case for an external user to explicitly 

923 state the dataset refs that should be cached and then when to 

924 remove them, overriding any global configuration. 

925 """ 

926 if self._expiration_mode is None: 

927 # Expiration has been disabled. 

928 return 

929 

930 # mypy can't be sure we have set a threshold properly 

931 if self._expiration_threshold is None: 

932 log.warning( 

933 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode 

934 ) 

935 return 

936 

937 # Sync up cache. There is no file locking involved so for a shared 

938 # cache multiple processes may be racing to delete files. Deleting 

939 # a file that no longer exists is not an error. 

940 self.scan_cache() 

941 

942 if self._expiration_mode == "files": 

943 n_files = len(self._cache_entries) 

944 n_over = n_files - self._expiration_threshold 

945 if n_over > 0: 

946 sorted_keys = self._sort_cache() 

947 keys_to_remove = sorted_keys[:n_over] 

948 self._remove_from_cache(keys_to_remove) 

949 return 

950 

951 if self._expiration_mode == "datasets": 

952 # Count the datasets, in ascending timestamp order, 

953 # so that oldest turn up first. 

954 datasets = defaultdict(list) 

955 for key in self._sort_cache(): 

956 entry = self._cache_entries[key] 

957 datasets[entry.ref].append(key) 

958 

959 n_datasets = len(datasets) 

960 n_over = n_datasets - self._expiration_threshold 

961 if n_over > 0: 

962 # Keys will be read out in insertion order, which 

963 # will be date order, so the oldest ones are removed. 

964 ref_ids = list(datasets.keys())[:n_over] 

965 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids)) 

966 self._remove_from_cache(keys_to_remove) 

967 return 

968 

969 if self._expiration_mode == "size": 

970 if self.cache_size > self._expiration_threshold: 

971 for key in self._sort_cache(): 

972 self._remove_from_cache([key]) 

973 if self.cache_size <= self._expiration_threshold: 

974 break 

975 return 

976 

977 if self._expiration_mode == "age": 

978 now = datetime.datetime.utcnow() 

979 for key in self._sort_cache(): 

980 delta = now - self._cache_entries[key].ctime 

981 if delta.seconds > self._expiration_threshold: 

982 self._remove_from_cache([key]) 

983 else: 

984 # We're already in date order. 

985 break 

986 return 

987 

988 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}") 

989 

990 def _sort_cache(self) -> List[str]: 

991 """Sort the cache entries by time and return the sorted keys. 

992 

993 Returns 

994 ------- 

995 sorted : `list` of `str` 

996 Keys into the cache, sorted by time with oldest first. 

997 """ 

998 

999 def sort_by_time(key: str) -> datetime.datetime: 

1000 """Sorter key function using cache entry details.""" 

1001 return self._cache_entries[key].ctime 

1002 

1003 return sorted(self._cache_entries, key=sort_by_time) 

1004 

1005 def __str__(self) -> str: 

1006 cachedir = self._cache_directory if self._cache_directory else "<tempdir>" 

1007 return ( 

1008 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," 

1009 f"default={self._caching_default}) " 

1010 f"n_files={self.file_count}, n_bytes={self.cache_size}" 

1011 ) 

1012 
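A brief usage sketch for the class above. ``butler_config``, ``universe``, and ``ref`` are placeholders for a real ``DatastoreCacheManagerConfig``, ``DimensionUniverse``, and ``DatasetRef``; the environment variables and the ``datasets=5`` expiration form come from the class notes.

import os

# Configure the cache location and expiration before the manager is built,
# since both settings are read in __init__. Paths and values are illustrative.
os.environ["DAF_BUTLER_CACHE_DIRECTORY"] = "/tmp/example-butler-cache"
os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "datasets=5"

cache_manager = DatastoreCacheManager(butler_config, universe=universe)

# Reads should go through the context manager so the file is protected
# from concurrent cache expiration while it is in use.
with cache_manager.find_in_cache(ref, extension=".fits") as cached_uri:
    if cached_uri is not None:
        payload = cached_uri.read()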

1013 

1014class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): 

1015 """A variant of the datastore cache where no cache is enabled. 

1016 

1017 Parameters 

1018 ---------- 

1019 config : `str` or `DatastoreCacheManagerConfig` 

1020 Configuration to control caching. 

1021 universe : `DimensionUniverse` 

1022 Set of all known dimensions, used to expand and validate any used 

1023 in lookup keys. 

1024 """ 

1025 

1026 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

1027 return 

1028 

1029 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

1030 """Indicate whether the entity should be added to the cache. 

1031 

1032 Always returns `False`. 

1033 """ 

1034 return False 

1035 

1036 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

1037 """Move dataset to cache but always refuse and returns `None`.""" 

1038 return None 

1039 

1040 @contextlib.contextmanager 

1041 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

1042 """Look for a dataset in the cache and return its location. 

1043 

1044 Never finds a file. 

1045 """ 

1046 yield None 

1047 

1048 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

1049 """Remove datasets from cache. 

1050 

1051 Always does nothing. 

1052 """ 

1053 return 

1054 

1055 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

1056 """Report if a dataset is known to the cache. 

1057 

1058 Always returns `False`. 

1059 """ 

1060 return False 

1061 

1062 def __str__(self) -> str: 

1063 return f"{type(self).__name__}()"