Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 23%

407 statements  

coverage.py v6.5.0, created at 2023-01-11 02:31 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Cache management for a datastore.""" 

25 

26__all__ = ( 

27 "AbstractDatastoreCacheManager", 

28 "DatastoreDisabledCacheManager", 

29 "DatastoreCacheManager", 

30 "DatastoreCacheManagerConfig", 

31) 

32 

33import atexit 

34import contextlib 

35import datetime 

36import itertools 

37import logging 

38import os 

39import shutil 

40import tempfile 

41import uuid 

42from abc import ABC, abstractmethod 

43from collections import defaultdict 

44from random import Random 

45from typing import ( 

46 TYPE_CHECKING, 

47 Dict, 

48 ItemsView, 

49 Iterable, 

50 Iterator, 

51 KeysView, 

52 List, 

53 Optional, 

54 Union, 

55 ValuesView, 

56) 

57 

58from lsst.resources import ResourcePath 

59from pydantic import BaseModel, PrivateAttr 

60 

61from .config import ConfigSubset 

62from .configSupport import processLookupConfigs 

63from .datasets import DatasetId, DatasetRef 

64 

65if TYPE_CHECKING: 

66 from .configSupport import LookupKey 

67 from .datasets import DatasetType 

68 from .dimensions import DimensionUniverse 

69 from .storageClass import StorageClass 

70 

71log = logging.getLogger(__name__) 

72 

73 

74def remove_cache_directory(directory: str) -> None: 

75 """Remove the specified directory and all its contents.""" 

76 log.debug("Removing temporary cache directory %s", directory) 

77 shutil.rmtree(directory, ignore_errors=True) 

78 

79 

80def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath: 

81 """Construct the full path to use for this dataset in the cache. 

82 

83 Parameters 

84 ---------- 

85 ref : `DatasetRef` 

86 The dataset to look up in or write to the cache. 

87 extension : `str` 

88 File extension to use for this file. Should include the 

89 leading "``.``". 

90 

91 Returns 

92 ------- 

93 uri : `lsst.resources.ResourcePath` 

94 URI to use for this dataset in the cache. 

95 """ 

96 # Dataset type component is needed in the name if composite 

97 # disassembly is happening since the ID is shared for all components. 

98 component = ref.datasetType.component() 

99 component = f"_{component}" if component else "" 

100 return root.join(f"{ref.id}{component}{extension}") 

101 

102 

103def _parse_cache_name(cached_location: str) -> Dict[str, Optional[str]]: 

104 """For a given cache name, return its component parts. 

105 

106 Changes to ``_construct_cache_path()`` should be reflected here. 

107 

108 Parameters 

109 ---------- 

110 cached_location : `str` 

111 The name of the file within the cache. 

112 

113 Returns 

114 ------- 

115 parsed : `dict` of `str`, `str` 

116 Parsed components of the file. These include: 

117 - "id": The dataset ID, 

118 - "component": The name of the component (can be `None`), 

119 - "extension": File extension (can be `None`). 

120 """ 

121 # Assume first dot is the extension and so allow .fits.gz 

122 root_ext = cached_location.split(".", maxsplit=1) 

123 root = root_ext.pop(0) 

124 ext = "." + root_ext.pop(0) if root_ext else None 

125 

126 parts = root.split("_") 

127 id_ = parts.pop(0) 

128 component = parts.pop(0) if parts else None 

129 return {"id": id_, "component": component, "extension": ext} 

130 
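# Worked example of the naming scheme implemented by the two helpers above.
# The UUID and component are invented values; a disassembled "psf" component
# shares its dataset ID with its sibling components, so the component name is
# folded into the file name and recovered again when parsing:
#
#     _construct_cache_path(root, ref_to_psf_component, ".fits.gz")
#         -> root.join("5cc2a0a1-8c24-4e7b-9f33-0d7c0e6f1a2b_psf.fits.gz")
#     _parse_cache_name("5cc2a0a1-8c24-4e7b-9f33-0d7c0e6f1a2b_psf.fits.gz")
#         -> {"id": "5cc2a0a1-8c24-4e7b-9f33-0d7c0e6f1a2b",
#             "component": "psf", "extension": ".fits.gz"}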

131 

132class CacheEntry(BaseModel): 

133 """Represent an entry in the cache.""" 

134 

135 name: str 

136 """Name of the file.""" 

137 

138 size: int 

139 """Size of the file in bytes.""" 

140 

141 ctime: datetime.datetime 

142 """Creation time of the file.""" 

143 

144 ref: DatasetId 

145 """ID of this dataset.""" 

146 

147 component: Optional[str] 

148 """Component for this disassembled composite (optional).""" 

149 

150 @classmethod 

151 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry: 

152 """Construct an object from a file name. 

153 

154 Parameters 

155 ---------- 

156 file : `lsst.resources.ResourcePath` 

157 Path to the file. 

158 root : `lsst.resources.ResourcePath` 

159 Cache root directory. 

160 """ 

161 file_in_cache = file.relative_to(root) 

162 if file_in_cache is None: 

163 raise ValueError(f"Supplied file {file} is not inside root {root}") 

164 parts = _parse_cache_name(file_in_cache) 

165 

166 stat = os.stat(file.ospath) 

167 return cls( 

168 name=file_in_cache, 

169 size=stat.st_size, 

170 ref=parts["id"], 

171 component=parts["component"], 

172 ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime), 

173 ) 

174 

175 

176class _MarkerEntry(CacheEntry): 

177 pass 

178 

179 

180class CacheRegistry(BaseModel): 

181 """Collection of cache entries.""" 

182 

183 _size: int = PrivateAttr(0) 

184 """Size of the cache.""" 

185 

186 _entries: Dict[str, CacheEntry] = PrivateAttr({}) 

187 """Internal collection of cache entries.""" 

188 

189 _ref_map: Dict[DatasetId, List[str]] = PrivateAttr({}) 

190 """Mapping of DatasetID to corresponding keys in cache registry.""" 

191 

192 @property 

193 def cache_size(self) -> int: 

194 return self._size 

195 

196 def __getitem__(self, key: str) -> CacheEntry: 

197 return self._entries[key] 

198 

199 def __setitem__(self, key: str, entry: CacheEntry) -> None: 

200 self._size += entry.size 

201 self._entries[key] = entry 

202 

203 # Update the mapping from ref to path. 

204 if entry.ref not in self._ref_map: 

205 self._ref_map[entry.ref] = [] 

206 self._ref_map[entry.ref].append(key) 

207 

208 def __delitem__(self, key: str) -> None: 

209 entry = self._entries.pop(key) 

210 self._decrement(entry) 

211 self._ref_map[entry.ref].remove(key) 

212 

213 def _decrement(self, entry: Optional[CacheEntry]) -> None: 

214 if entry: 

215 self._size -= entry.size 

216 if self._size < 0: 

217 log.warning("Cache size has gone negative. Inconsistent cache records...") 

218 self._size = 0 

219 

220 def __contains__(self, key: str) -> bool: 

221 return key in self._entries 

222 

223 def __len__(self) -> int: 

224 return len(self._entries) 

225 

226 def __iter__(self) -> Iterator[str]: # type: ignore 

227 return iter(self._entries) 

228 

229 def keys(self) -> KeysView[str]: 

230 return self._entries.keys() 

231 

232 def values(self) -> ValuesView[CacheEntry]: 

233 return self._entries.values() 

234 

235 def items(self) -> ItemsView[str, CacheEntry]: 

236 return self._entries.items() 

237 

238 # A private marker to indicate that pop() should raise if no default 

239 # is given. 

240 __marker = _MarkerEntry(name="marker", size=0, ref=0, ctime=datetime.datetime.utcfromtimestamp(0)) 

241 

242 def pop(self, key: str, default: Optional[CacheEntry] = __marker) -> Optional[CacheEntry]: 

243 # The marker for dict.pop is not the same as our marker. 

244 if default is self.__marker: 

245 entry = self._entries.pop(key) 

246 else: 

247 entry = self._entries.pop(key, self.__marker) 

248 # Should not attempt to correct for this entry being removed 

249 # if we got the default value. 

250 if entry is self.__marker: 

251 return default 

252 

253 self._decrement(entry) 

254 # The default entry given to this method may not even be in the cache. 

255 if entry and entry.ref in self._ref_map: 

256 keys = self._ref_map[entry.ref] 

257 if key in keys: 

258 keys.remove(key) 

259 return entry 

260 

261 def get_dataset_keys(self, dataset_id: Optional[DatasetId]) -> Optional[List[str]]: 

262 """Retrieve all keys associated with the given dataset ID. 

263 

264 Parameters 

265 ---------- 

266 dataset_id : `DatasetId` or `None` 

267 The dataset ID to look up. Returns `None` if the ID is `None`. 

268 

269 Returns 

270 ------- 

271 keys : `list` [`str`] 

272 Keys associated with this dataset. These keys can be used to look up 

273 the cache entry information in the `CacheRegistry`. Returns 

274 `None` if the dataset is not known to the cache. 

275 """ 

276 if dataset_id not in self._ref_map: 

277 return None 

278 keys = self._ref_map[dataset_id] 

279 if not keys: 

280 return None 

281 return keys 

282 
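# Illustrative sketch of the bookkeeping performed by CacheRegistry, using
# invented values (``dataset_id`` is a placeholder `DatasetId`): setting an
# entry grows ``cache_size`` and records the key under its dataset ID, while
# ``pop()`` and ``__delitem__`` undo both.
#
#     registry = CacheRegistry()
#     registry["abc_psf.fits"] = CacheEntry(
#         name="abc_psf.fits", size=1024, ref=dataset_id, component="psf",
#         ctime=datetime.datetime.utcnow(),
#     )
#     registry.cache_size                     # -> 1024
#     registry.get_dataset_keys(dataset_id)   # -> ["abc_psf.fits"]
#     registry.pop("abc_psf.fits")            # cache_size drops back to 0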

283 

284class DatastoreCacheManagerConfig(ConfigSubset): 

285 """Configuration information for `DatastoreCacheManager`.""" 

286 

287 component = "cached" 

288 requiredKeys = ("cacheable",) 

289 

290 

291class AbstractDatastoreCacheManager(ABC): 

292 """An abstract base class for managing caching in a Datastore. 

293 

294 Parameters 

295 ---------- 

296 config : `str` or `DatastoreCacheManagerConfig` 

297 Configuration to control caching. 

298 universe : `DimensionUniverse` 

299 Set of all known dimensions, used to expand and validate any used 

300 in lookup keys. 

301 """ 

302 

303 @property 

304 def cache_size(self) -> int: 

305 """Size of the cache in bytes.""" 

306 return 0 

307 

308 @property 

309 def file_count(self) -> int: 

310 """Return number of cached files tracked by registry.""" 

311 return 0 

312 

313 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

314 if not isinstance(config, DatastoreCacheManagerConfig): 

315 config = DatastoreCacheManagerConfig(config) 

316 assert isinstance(config, DatastoreCacheManagerConfig) 

317 self.config = config 

318 

319 @abstractmethod 

320 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

321 """Indicate whether the entity should be added to the cache. 

322 

323 This is relevant when reading or writing. 

324 

325 Parameters 

326 ---------- 

327 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

328 Thing to test against the configuration. The ``name`` property 

329 is used to determine a match. A `DatasetType` will first check 

330 its name, before checking its `StorageClass`. If there are no 

331 matches the default will be returned. 

332 

333 Returns 

334 ------- 

335 should_cache : `bool` 

336 Returns `True` if the dataset should be cached; `False` otherwise. 

337 """ 

338 raise NotImplementedError() 

339 

340 @abstractmethod 

341 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

342 """Report if the dataset is known to the cache. 

343 

344 Parameters 

345 ---------- 

346 ref : `DatasetRef` 

347 Dataset to check for in the cache. 

348 extension : `str`, optional 

349 File extension expected. Should include the leading "``.``". 

350 If `None` the extension is ignored and the dataset ID alone is 

351 used to check in the cache. The extension must be defined if 

352 a specific component is being checked. 

353 

354 Returns 

355 ------- 

356 known : `bool` 

357 Returns `True` if the dataset is currently known to the cache 

358 and `False` otherwise. 

359 

360 Notes 

361 ----- 

362 This method can only report if the dataset is known to the cache 

363 in this specific instant and does not indicate whether the file 

364 can be read from the cache later. `find_in_cache()` should be called 

365 if the cached file is to be used. 

366 """ 

367 raise NotImplementedError() 

368 

369 @abstractmethod 

370 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

371 """Move a file to the cache. 

372 

373 Move the given file into the cache, using the supplied DatasetRef 

374 for naming. A call is made to `should_be_cached()` and if the 

375 DatasetRef should not be accepted `None` will be returned. 

376 

377 Cache expiry can occur during this. 

378 

379 Parameters 

380 ---------- 

381 uri : `lsst.resources.ResourcePath` 

382 Location of the file to be relocated to the cache. Will be moved. 

383 ref : `DatasetRef` 

384 Ref associated with this file. Will be used to determine the name 

385 of the file within the cache. 

386 

387 Returns 

388 ------- 

389 new : `lsst.resources.ResourcePath` or `None` 

390 URI to the file within the cache, or `None` if the dataset 

391 was not accepted by the cache. 

392 """ 

393 raise NotImplementedError() 

394 

395 @abstractmethod 

396 @contextlib.contextmanager 

397 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

398 """Look for a dataset in the cache and return its location. 

399 

400 Parameters 

401 ---------- 

402 ref : `DatasetRef` 

403 Dataset to locate in the cache. 

404 extension : `str` 

405 File extension expected. Should include the leading "``.``". 

406 

407 Yields 

408 ------ 

409 uri : `lsst.resources.ResourcePath` or `None` 

410 The URI to the cached file, or `None` if the file has not been 

411 cached. 

412 

413 Notes 

414 ----- 

415 Should be used as a context manager in order to prevent this 

416 file from being removed from the cache for that context. 

417 """ 

418 raise NotImplementedError() 

419 

420 @abstractmethod 

421 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

422 """Remove the specified datasets from the cache. 

423 

424 It is not an error for these datasets to be missing from the cache. 

425 

426 Parameters 

427 ---------- 

428 ref : `DatasetRef` or iterable of `DatasetRef` 

429 The datasets to remove from the cache. 

430 """ 

431 raise NotImplementedError() 

432 

433 @abstractmethod 

434 def __str__(self) -> str: 

435 raise NotImplementedError() 

436 

437 

438class DatastoreCacheManager(AbstractDatastoreCacheManager): 

439 """A class for managing caching in a Datastore using local files. 

440 

441 Parameters 

442 ---------- 

443 config : `str` or `DatastoreCacheManagerConfig` 

444 Configuration to control caching. 

445 universe : `DimensionUniverse` 

446 Set of all known dimensions, used to expand and validate any used 

447 in lookup keys. 

448 

449 Notes 

450 ----- 

451 Two environment variables can be used to override the cache directory 

452 and expiration configuration: 

453 

454 * ``$DAF_BUTLER_CACHE_DIRECTORY`` 

455 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` 

456 

457 The expiration mode should take the form ``mode=threshold``; for 

458 example, to limit the cache directory to 5 datasets the value 

459 would be ``datasets=5``. 

460 

461 Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment 

462 variable can be used to indicate that this directory should be used 

463 if no explicit directory has been specified from configuration or from 

464 the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable. 

465 """ 

466 

467 _temp_exemption_prefix = "exempt/" 

468 _tmpdir_prefix = "butler-cache-dir-" 

469 

470 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

471 super().__init__(config, universe) 

472 

473 # Set cache directory if it pre-exists, else defer creation until 

474 # requested. Allow external override from environment. 

475 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root") 

476 

477 # Allow the execution environment to override the default values 

478 # so long as no default value has been set from the line above. 

479 if root is None: 

480 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET") 

481 

482 self._cache_directory = ( 

483 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None 

484 ) 

485 

486 if self._cache_directory: 

487 if not self._cache_directory.isLocal: 

488 raise ValueError( 

489 f"Cache directory must be on a local file system. Got: {self._cache_directory}" 

490 ) 

491 # Ensure that the cache directory is created. We assume that 

492 # someone specifying a permanent cache directory will be expecting 

493 # it to always be there. This will also trigger an error 

494 # early rather than waiting until the cache is needed. 

495 self._cache_directory.mkdir() 

496 

497 # Calculate the caching lookup table. 

498 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe) 

499 

500 # Default decision for whether a dataset should be cached. 

501 self._caching_default = self.config.get("default", False) 

502 

503 # Expiration mode. Read from config but allow override from 

504 # the environment. 

505 expiration_mode = self.config.get(("expiry", "mode")) 

506 threshold = self.config.get(("expiry", "threshold")) 

507 

508 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE") 

509 if external_mode and "=" in external_mode: 

510 expiration_mode, expiration_threshold = external_mode.split("=", 1) 

511 threshold = int(expiration_threshold) 

512 if expiration_mode is None: 

513 # Force to None to avoid confusion. 

514 threshold = None 

515 

516 self._expiration_mode: Optional[str] = expiration_mode 

517 self._expiration_threshold: Optional[int] = threshold 

518 if self._expiration_threshold is None and self._expiration_mode is not None: 

519 raise ValueError( 

520 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}" 

521 ) 

522 

523 log.debug( 

524 "Cache configuration:\n- root: %s\n- expiration mode: %s", 

525 self._cache_directory if self._cache_directory else "tmpdir", 

526 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled", 

527 ) 

528 

529 # Files in cache, indexed by path within the cache directory. 

530 self._cache_entries = CacheRegistry() 
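    # Hedged example of the environment override handled just above, using an
    # invented shell setting: ``DAF_BUTLER_CACHE_EXPIRATION_MODE="datasets=5"``
    # is split on the first "=" into mode "datasets" and integer threshold 5,
    # taking precedence over the ``expiry`` section of the configuration.
    #
    #     external_mode = "datasets=5"
    #     mode, threshold = external_mode.split("=", 1)   # -> ("datasets", "5")
    #     int(threshold)                                  # -> 5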

531 

532 @property 

533 def cache_directory(self) -> ResourcePath: 

534 if self._cache_directory is None: 

535 # Create on demand. Allow the override environment variable 

536 # to be used in case it got set after this object was created 

537 # but before a cache was used. 

538 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

539 # Someone else will clean this up. 

540 isTemporary = False 

541 msg = "deferred fallback" 

542 else: 

543 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix) 

544 isTemporary = True 

545 msg = "temporary" 

546 

547 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary) 

548 log.debug("Using %s cache directory at %s", msg, self._cache_directory) 

549 

550 # Remove when we no longer need it. 

551 if isTemporary: 

552 atexit.register(remove_cache_directory, self._cache_directory.ospath) 

553 return self._cache_directory 

554 

555 @property 

556 def _temp_exempt_directory(self) -> ResourcePath: 

557 """Return the directory in which to store temporary cache files that 

558 should not be expired. 

559 """ 

560 return self.cache_directory.join(self._temp_exemption_prefix) 

561 

562 @property 

563 def cache_size(self) -> int: 

564 return self._cache_entries.cache_size 

565 

566 @property 

567 def file_count(self) -> int: 

568 return len(self._cache_entries) 

569 

570 @classmethod 

571 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]: 

572 """Defines a fallback cache directory if a fallback not set already. 

573 

574 Returns 

575 ------- 

576 defined : `bool` 

577 `True` if the fallback directory was newly-defined in this method. 

578 `False` if it had already been set. 

579 cache_dir : `str` 

580 Returns the path to the cache directory that will be used if it's 

581 needed. This can allow the caller to run a directory cleanup 

582 when it's no longer needed (something that the cache manager 

583 can not do because forks should not clean up directories defined 

584 by the parent process). 

585 

586 Notes 

587 ----- 

588 The fallback directory will not be defined if one has already been 

589 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` 

590 environment variable only if a value has not previously been stored 

591 in that environment variable. Setting the environment variable allows 

592 this value to survive into spawned subprocesses. Calling this method 

593 will lead to all subsequently created cache managers sharing the same 

594 cache. 

595 """ 

596 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

597 # A value has already been set. 

598 return (False, cache_dir) 

599 

600 # As a class method, we do not know at this point whether a cache 

601 # directory will be needed so it would be impolite to create a 

602 # directory that will never be used. 

603 

604 # Construct our own temp name -- 16 characters should have a fairly 

605 # low chance of clashing when combined with the process ID. 

606 characters = "abcdefghijklmnopqrstuvwxyz0123456789_" 

607 rng = Random() 

608 tempchars = "".join(rng.choice(characters) for _ in range(16)) 

609 

610 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}" 

611 

612 cache_dir = os.path.join(tempfile.gettempdir(), tempname) 

613 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir 

614 return (True, cache_dir) 
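    # Hedged usage sketch for the classmethod above; the worker fan-out shown
    # is hypothetical. A parent process pins the fallback directory before
    # spawning subprocesses so every child butler shares one cache, and the
    # parent removes the directory itself afterwards, since (per the Notes)
    # the cache managers in the children will not.
    #
    #     defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
    #     try:
    #         run_worker_pool()   # hypothetical subprocess fan-out
    #     finally:
    #         if defined:
    #             shutil.rmtree(cache_dir, ignore_errors=True)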

615 

616 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

617 # Docstring inherited 

618 matchName: Union[LookupKey, str] = "{} (via default)".format(entity) 

619 should_cache = self._caching_default 

620 

621 for key in entity._lookupNames(): 

622 if key in self._lut: 

623 should_cache = bool(self._lut[key]) 

624 matchName = key 

625 break 

626 

627 if not isinstance(should_cache, bool): 

628 raise TypeError( 

629 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool." 

630 ) 

631 

632 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not") 

633 return should_cache 

634 

635 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath: 

636 """Construct the name to use for this dataset in the cache. 

637 

638 Parameters 

639 ---------- 

640 ref : `DatasetRef` 

641 The dataset to look up in or write to the cache. 

642 extension : `str` 

643 File extension to use for this file. Should include the 

644 leading "``.``". 

645 

646 Returns 

647 ------- 

648 uri : `lsst.resources.ResourcePath` 

649 URI to use for this dataset in the cache. 

650 """ 

651 return _construct_cache_path(self.cache_directory, ref, extension) 

652 

653 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

654 # Docstring inherited 

655 if ref.id is None: 

656 raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})") 

657 

658 if not self.should_be_cached(ref): 

659 return None 

660 

661 # Write the file using the id of the dataset ref and the file 

662 # extension. 

663 cached_location = self._construct_cache_name(ref, uri.getExtension()) 

664 

665 # Run cache expiry to ensure that we have room for this 

666 # item. 

667 self._expire_cache() 

668 

669 # The above reset the in-memory cache status. It's entirely possible 

670 # that another process has just cached this file (if multiple 

671 # processes are caching on read), so check our in-memory cache 

672 # before attempting to cache the dataset. 

673 path_in_cache = cached_location.relative_to(self.cache_directory) 

674 if path_in_cache and path_in_cache in self._cache_entries: 

675 return cached_location 

676 

677 # Move into the cache. Given that multiple processes might be 

678 # sharing a single cache directory, and the file we need might have 

679 # been copied in whilst we were checking, allow overwrite without 

680 # complaint. Even for a private cache directory it is possible that 

681 # a second butler in a subprocess could be writing to it. 

682 cached_location.transfer_from(uri, transfer="move", overwrite=True) 

683 log.debug("Cached dataset %s to %s", ref, cached_location) 

684 

685 self._register_cache_entry(cached_location) 

686 

687 return cached_location 

688 

689 @contextlib.contextmanager 

690 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

691 # Docstring inherited 

692 # Short circuit this if the cache directory has not been created yet. 

693 if self._cache_directory is None: 

694 yield None 

695 return 

696 

697 cached_location = self._construct_cache_name(ref, extension) 

698 if cached_location.exists(): 

699 log.debug("Found cached file %s for dataset %s.", cached_location, ref) 

700 

701 # The cached file could be removed by another process doing 

702 # cache expiration so we need to protect against that by making 

703 # a copy in a different tree. Use hardlinks to ensure that 

704 # we either have the cached file or we don't. This is robust 

705 # against race conditions that can be caused by using soft links 

706 # and the other end of the link being deleted just after it 

707 # is created. 

708 path_in_cache = cached_location.relative_to(self.cache_directory) 

709 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory" 

710 

711 # Need to use a unique file name for the temporary location to 

712 # ensure that two different processes can read the file 

713 # simultaneously without one of them deleting it when it's in 

714 # use elsewhere. Retain the original filename for easier debugging. 

715 random = str(uuid.uuid4())[:8] 

716 basename = cached_location.basename() 

717 filename = f"{random}-{basename}" 

718 

719 temp_location: Optional[ResourcePath] = self._temp_exempt_directory.join(filename) 

720 try: 

721 if temp_location is not None: 

722 temp_location.transfer_from(cached_location, transfer="hardlink") 

723 except Exception as e: 

724 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e) 

725 # Any failure will be treated as if the file was not 

726 # in the cache. Yielding the original cache location 

727 # is too dangerous. 

728 temp_location = None 

729 

730 try: 

731 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref) 

732 yield temp_location 

733 finally: 

734 try: 

735 if temp_location: 

736 temp_location.remove() 

737 except FileNotFoundError: 

738 pass 

739 return 

740 

741 log.debug("Dataset %s not found in cache.", ref) 

742 yield None 

743 return 
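    # Hedged usage sketch for the context manager above (``ref`` and the
    # reader function are placeholders): the yielded URI points at a
    # hard-linked copy that is protected from cache expiry for the duration
    # of the block and removed again on exit.
    #
    #     with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
    #         if cached_uri is not None:
    #             data = read_dataset(cached_uri)    # hypothetical reader
    #         else:
    #             data = read_dataset(original_uri)  # fall back to the datastore copy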

744 

745 def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

746 # Docstring inherited. 

747 

748 # Stop early if there are no cache entries anyhow. 

749 if len(self._cache_entries) == 0: 

750 return 

751 

752 if isinstance(refs, DatasetRef): 

753 refs = [refs] 

754 

755 # Create a set of all the IDs 

756 all_ids = {ref.getCheckedId() for ref in refs} 

757 

758 keys_to_remove = [] 

759 for key, entry in self._cache_entries.items(): 

760 if entry.ref in all_ids: 

761 keys_to_remove.append(key) 

762 self._remove_from_cache(keys_to_remove) 

763 

764 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> Optional[str]: 

765 """Record the file in the cache registry. 

766 

767 Parameters 

768 ---------- 

769 cached_location : `lsst.resources.ResourcePath` 

770 Location of the file to be registered. 

771 can_exist : `bool`, optional 

772 If `True` the item being registered can already be listed. 

773 This can allow a cache refresh to run without checking the 

774 file again. If `False` it is an error for the registry to 

775 already know about this file. 

776 

777 Returns 

778 ------- 

779 cache_key : `str` or `None` 

780 The key used in the registry for this file. `None` if the file 

781 no longer exists (it could have been expired by another process). 

782 """ 

783 path_in_cache = cached_location.relative_to(self.cache_directory) 

784 if path_in_cache is None: 

785 raise ValueError( 

786 f"Can not register cached file {cached_location} that is not within" 

787 f" the cache directory at {self.cache_directory}." 

788 ) 

789 if path_in_cache in self._cache_entries: 

790 if can_exist: 

791 return path_in_cache 

792 else: 

793 raise ValueError( 

794 f"Cached file {cached_location} is already known to the registry" 

795 " but this was expected to be a new file." 

796 ) 

797 try: 

798 details = CacheEntry.from_file(cached_location, root=self.cache_directory) 

799 except FileNotFoundError: 

800 return None 

801 self._cache_entries[path_in_cache] = details 

802 return path_in_cache 

803 

804 def scan_cache(self) -> None: 

805 """Scan the cache directory and record information about files.""" 

806 found = set() 

807 for file in ResourcePath.findFileResources([self.cache_directory]): 

808 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator" 

809 

810 # Skip any that are found in an exempt part of the hierarchy 

811 # since they should not be part of the registry. 

812 if file.relative_to(self._temp_exempt_directory) is not None: 

813 continue 

814 

815 path_in_cache = self._register_cache_entry(file, can_exist=True) 

816 if path_in_cache: 

817 found.add(path_in_cache) 

818 

819 # Find any files that were recorded in the cache but are no longer 

820 # on disk. (something else cleared them out?) 

821 known_to_cache = set(self._cache_entries) 

822 missing = known_to_cache - found 

823 

824 if missing: 

825 log.debug( 

826 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing) 

827 ) 

828 for path_in_cache in missing: 

829 self._cache_entries.pop(path_in_cache, None) 

830 

831 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

832 """Report if the dataset is known to the cache. 

833 

834 Parameters 

835 ---------- 

836 ref : `DatasetRef` 

837 Dataset to check for in the cache. 

838 extension : `str`, optional 

839 File extension expected. Should include the leading "``.``". 

840 If `None` the extension is ignored and the dataset ID alone is 

841 used to check in the cache. The extension must be defined if 

842 a specific component is being checked. 

843 

844 Returns 

845 ------- 

846 known : `bool` 

847 Returns `True` if the dataset is currently known to the cache 

848 and `False` otherwise. If the dataset refers to a component and 

849 an extension is given then only that component is checked. 

850 

851 Notes 

852 ----- 

853 This method can only report if the dataset is known to the cache 

854 in this specific instant and does not indicate whether the file 

855 can be read from the cache later. `find_in_cache()` should be called 

856 if the cached file is to be used. 

857 

858 This method does not force the cache to be re-scanned and so can miss 

859 cached datasets that have recently been written by other processes. 

860 """ 

861 if self._cache_directory is None: 

862 return False 

863 if self.file_count == 0: 

864 return False 

865 

866 if extension is None: 

867 # Look solely for matching dataset ref ID and not specific 

868 # components. 

869 cached_paths = self._cache_entries.get_dataset_keys(ref.id) 

870 return True if cached_paths else False 

871 

872 else: 

873 # Extension is known so we can do an explicit look up for the 

874 # cache entry. 

875 cached_location = self._construct_cache_name(ref, extension) 

876 path_in_cache = cached_location.relative_to(self.cache_directory) 

877 assert path_in_cache is not None # For mypy 

878 return path_in_cache in self._cache_entries 

879 

880 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None: 

881 """Remove the specified cache entries from cache. 

882 

883 Parameters 

884 ---------- 

885 cache_entries : iterable of `str` 

886 The entries to remove from the cache. The values are the path 

887 within the cache. 

888 """ 

889 for entry in cache_entries: 

890 path = self.cache_directory.join(entry) 

891 

892 self._cache_entries.pop(entry, None) 

893 log.debug("Removing file from cache: %s", path) 

894 try: 

895 path.remove() 

896 except FileNotFoundError: 

897 pass 

898 

899 def _expire_cache(self) -> None: 

900 """Expire the files in the cache. 

901 

902 Notes 

903 ----- 

904 The expiration modes are defined by the config or can be overridden. 

905 Available options: 

906 

907 * ``files``: Number of files. 

908 * ``datasets``: Number of datasets. 

909 * ``size``: Total size of files. 

910 * ``age``: Age of files. 

911 

912 The first three would remove in reverse time order. 

913 Number of files is complicated by the possibility of disassembled 

914 composites where 10 small files can be created for each dataset. 

915 

916 Additionally there is a use case for an external user to explicitly 

917 state the dataset refs that should be cached and when to remove 

918 them, overriding any global configuration. 

919 """ 

920 if self._expiration_mode is None: 

921 # Expiration has been disabled. 

922 return 

923 

924 # mypy can't be sure we have set a threshold properly 

925 if self._expiration_threshold is None: 

926 log.warning( 

927 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode 

928 ) 

929 return 

930 

931 # Sync up cache. There is no file locking involved so for a shared 

932 # cache multiple processes may be racing to delete files. Deleting 

933 # a file that no longer exists is not an error. 

934 self.scan_cache() 

935 

936 if self._expiration_mode == "files": 

937 n_files = len(self._cache_entries) 

938 n_over = n_files - self._expiration_threshold 

939 if n_over > 0: 

940 sorted_keys = self._sort_cache() 

941 keys_to_remove = sorted_keys[:n_over] 

942 self._remove_from_cache(keys_to_remove) 

943 return 

944 

945 if self._expiration_mode == "datasets": 

946 # Count the datasets, in ascending timestamp order, 

947 # so that oldest turn up first. 

948 datasets = defaultdict(list) 

949 for key in self._sort_cache(): 

950 entry = self._cache_entries[key] 

951 datasets[entry.ref].append(key) 

952 

953 n_datasets = len(datasets) 

954 n_over = n_datasets - self._expiration_threshold 

955 if n_over > 0: 

956 # Keys will be read out in insert order which 

957 # will be date order so oldest ones are removed. 

958 ref_ids = list(datasets.keys())[:n_over] 

959 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids)) 

960 self._remove_from_cache(keys_to_remove) 

961 return 

962 

963 if self._expiration_mode == "size": 

964 if self.cache_size > self._expiration_threshold: 

965 for key in self._sort_cache(): 

966 self._remove_from_cache([key]) 

967 if self.cache_size <= self._expiration_threshold: 

968 break 

969 return 

970 

971 if self._expiration_mode == "age": 

972 now = datetime.datetime.utcnow() 

973 for key in self._sort_cache(): 

974 delta = now - self._cache_entries[key].ctime 

975 if delta.total_seconds() > self._expiration_threshold: 

976 self._remove_from_cache([key]) 

977 else: 

978 # We're already in date order. 

979 break 

980 return 

981 

982 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}") 
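    # Hedged summary of the expiration modes handled above, written as example
    # values for ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` (the numbers are
    # invented; units are inferred from the comparisons in the code):
    #
    #     files=50          # keep at most 50 files in the cache
    #     datasets=5        # keep at most 5 datasets (all components counted together)
    #     size=1000000000   # keep the total cached size at or below this many bytes
    #     age=3600          # expire entries older than this many seconds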

983 

984 def _sort_cache(self) -> List[str]: 

985 """Sort the cache entries by time and return the sorted keys. 

986 

987 Returns 

988 ------- 

989 sorted : `list` of `str` 

990 Keys into the cache, sorted by time with oldest first. 

991 """ 

992 

993 def sort_by_time(key: str) -> datetime.datetime: 

994 """Sorter key function using cache entry details.""" 

995 return self._cache_entries[key].ctime 

996 

997 return sorted(self._cache_entries, key=sort_by_time) 

998 

999 def __str__(self) -> str: 

1000 cachedir = self._cache_directory if self._cache_directory else "<tempdir>" 

1001 return ( 

1002 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," 

1003 f"default={self._caching_default}) " 

1004 f"n_files={self.file_count}, n_bytes={self.cache_size}" 

1005 ) 

1006 

1007 

1008class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): 

1009 """A variant of the datastore cache where no cache is enabled. 

1010 

1011 Parameters 

1012 ---------- 

1013 config : `str` or `DatastoreCacheManagerConfig` 

1014 Configuration to control caching. 

1015 universe : `DimensionUniverse` 

1016 Set of all known dimensions, used to expand and validate any used 

1017 in lookup keys. 

1018 """ 

1019 

1020 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

1021 return 

1022 

1023 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

1024 """Indicate whether the entity should be added to the cache. 

1025 

1026 Always returns `False`. 

1027 """ 

1028 return False 

1029 

1030 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

1031 """Move dataset to cache but always refuse and returns `None`.""" 

1032 return None 

1033 

1034 @contextlib.contextmanager 

1035 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

1036 """Look for a dataset in the cache and return its location. 

1037 

1038 Never finds a file. 

1039 """ 

1040 yield None 

1041 

1042 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

1043 """Remove datasets from cache. 

1044 

1045 Always does nothing. 

1046 """ 

1047 return 

1048 

1049 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

1050 """Report if a dataset is known to the cache. 

1051 

1052 Always returns `False`. 

1053 """ 

1054 return False 

1055 

1056 def __str__(self) -> str: 

1057 return f"{type(self).__name__}()"