Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 23%

403 statements  

coverage.py v6.5.0, created at 2023-10-26 15:15 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Cache management for a datastore.""" 

25 

26__all__ = ( 

27 "AbstractDatastoreCacheManager", 

28 "DatastoreDisabledCacheManager", 

29 "DatastoreCacheManager", 

30 "DatastoreCacheManagerConfig", 

31) 

32 

33import atexit 

34import contextlib 

35import datetime 

36import itertools 

37import logging 

38import os 

39import shutil 

40import tempfile 

41from abc import ABC, abstractmethod 

42from collections import defaultdict 

43from random import Random 

44from typing import ( 

45 TYPE_CHECKING, 

46 Dict, 

47 ItemsView, 

48 Iterable, 

49 Iterator, 

50 KeysView, 

51 List, 

52 Optional, 

53 Union, 

54 ValuesView, 

55) 

56 

57from lsst.resources import ResourcePath 

58from pydantic import BaseModel, PrivateAttr 

59 

60from .config import ConfigSubset 

61from .configSupport import processLookupConfigs 

62from .datasets import DatasetId, DatasetRef 

63 

64if TYPE_CHECKING: 

65 from .configSupport import LookupKey 

66 from .datasets import DatasetType 

67 from .dimensions import DimensionUniverse 

68 from .storageClass import StorageClass 

69 

70log = logging.getLogger(__name__) 

71 

72 

73def remove_cache_directory(directory: str) -> None: 

74 """Remove the specified directory and all its contents.""" 

75 log.debug("Removing temporary cache directory %s", directory) 

76 shutil.rmtree(directory, ignore_errors=True) 

77 

78 

79def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath: 

80 """Construct the full path to use for this dataset in the cache. 

81 

82 Parameters 

83 ---------- 

root : `lsst.resources.ResourcePath`

 Root of the cache directory.

84 ref : `DatasetRef`

85 The dataset to look up in or write to the cache. 

86 extension : `str` 

87 File extension to use for this file. Should include the 

88 leading "``.``". 

89 

90 Returns 

91 ------- 

92 uri : `lsst.resources.ResourcePath` 

93 URI to use for this dataset in the cache. 

94 """ 

95 # Dataset type component is needed in the name if composite 

96 # disassembly is happening since the ID is shared for all components. 

97 component = ref.datasetType.component() 

98 component = f"_{component}" if component else "" 

99 return root.join(f"{ref.id}{component}{extension}") 

100 

101 

102def _parse_cache_name(cached_location: str) -> Dict[str, Optional[str]]: 

103 """For a given cache name, return its component parts. 

104 

105 Changes to ``_construct_cache_path()`` should be reflected here. 

106 

107 Parameters 

108 ---------- 

109 cached_location : `str` 

110 The name of the file within the cache. 

111 

112 Returns 

113 ------- 

114 parsed : `dict` of `str`, `str` 

115 Parsed components of the file. These include: 

116 - "id": The dataset ID, 

117 - "component": The name of the component (can be `None`), 

118 - "extension": File extension (can be `None`). 

119 """ 

120 # Treat everything from the first dot onwards as the extension, so that compound extensions such as .fits.gz survive. 

121 root_ext = cached_location.split(".", maxsplit=1) 

122 root = root_ext.pop(0) 

123 ext = "." + root_ext.pop(0) if root_ext else None 

124 

125 parts = root.split("_") 

126 id_ = parts.pop(0) 

127 component = parts.pop(0) if parts else None 

128 return {"id": id_, "component": component, "extension": ext} 

129 
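# Illustrative sketch, not part of the original module: how a file name produced
# by _construct_cache_path() round-trips through _parse_cache_name(). The dataset
# ID and component below are made-up example values.
def _example_parse_cache_name() -> None:
    parts = _parse_cache_name("91b3cbcf-4cd7-4b47-a2b5-7a4a1ed68e02_wcs.fits.gz")
    assert parts["id"] == "91b3cbcf-4cd7-4b47-a2b5-7a4a1ed68e02"
    assert parts["component"] == "wcs"
    # Everything after the first dot is kept, so compound extensions survive.
    assert parts["extension"] == ".fits.gz"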

130 

131class CacheEntry(BaseModel): 

132 """Represent an entry in the cache.""" 

133 

134 name: str 

135 """Name of the file.""" 

136 

137 size: int 

138 """Size of the file in bytes.""" 

139 

140 ctime: datetime.datetime 

141 """Creation time of the file.""" 

142 

143 ref: DatasetId 

144 """ID of this dataset.""" 

145 

146 component: Optional[str] 

147 """Component for this disassembled composite (optional).""" 

148 

149 @classmethod 

150 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry: 

151 """Construct an object from a file name. 

152 

153 Parameters 

154 ---------- 

155 file : `lsst.resources.ResourcePath` 

156 Path to the file. 

157 root : `lsst.resources.ResourcePath` 

158 Cache root directory. 

159 """ 

160 file_in_cache = file.relative_to(root) 

161 if file_in_cache is None: 

162 raise ValueError(f"Supplied file {file} is not inside root {root}") 

163 parts = _parse_cache_name(file_in_cache) 

164 

165 stat = os.stat(file.ospath) 

166 return cls( 

167 name=file_in_cache, 

168 size=stat.st_size, 

169 ref=parts["id"], 

170 component=parts["component"], 

171 ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime), 

172 ) 

173 

174 

175class _MarkerEntry(CacheEntry): 

176 pass 

177 

178 

179class CacheRegistry(BaseModel): 

180 """Collection of cache entries.""" 

181 

182 _size: int = PrivateAttr(0) 

183 """Size of the cache.""" 

184 

185 _entries: Dict[str, CacheEntry] = PrivateAttr({}) 

186 """Internal collection of cache entries.""" 

187 

188 _ref_map: Dict[DatasetId, List[str]] = PrivateAttr({}) 

189 """Mapping of DatasetID to corresponding keys in cache registry.""" 

190 

191 @property 

192 def cache_size(self) -> int: 

193 return self._size 

194 

195 def __getitem__(self, key: str) -> CacheEntry: 

196 return self._entries[key] 

197 

198 def __setitem__(self, key: str, entry: CacheEntry) -> None: 

199 self._size += entry.size 

200 self._entries[key] = entry 

201 

202 # Update the mapping from ref to path. 

203 if entry.ref not in self._ref_map: 

204 self._ref_map[entry.ref] = [] 

205 self._ref_map[entry.ref].append(key) 

206 

207 def __delitem__(self, key: str) -> None: 

208 entry = self._entries.pop(key) 

209 self._decrement(entry) 

210 self._ref_map[entry.ref].remove(key) 

211 

212 def _decrement(self, entry: Optional[CacheEntry]) -> None: 

213 if entry: 

214 self._size -= entry.size 

215 if self._size < 0: 

216 log.warning("Cache size has gone negative. Inconsistent cache records...") 

217 self._size = 0 

218 

219 def __contains__(self, key: str) -> bool: 

220 return key in self._entries 

221 

222 def __len__(self) -> int: 

223 return len(self._entries) 

224 

225 def __iter__(self) -> Iterator[str]: # type: ignore 

226 return iter(self._entries) 

227 

228 def keys(self) -> KeysView[str]: 

229 return self._entries.keys() 

230 

231 def values(self) -> ValuesView[CacheEntry]: 

232 return self._entries.values() 

233 

234 def items(self) -> ItemsView[str, CacheEntry]: 

235 return self._entries.items() 

236 

237 # A private marker to indicate that pop() should raise if no default 

238 # is given. 

239 __marker = _MarkerEntry(name="marker", size=0, ref=0, ctime=datetime.datetime.utcfromtimestamp(0)) 

240 

241 def pop(self, key: str, default: Optional[CacheEntry] = __marker) -> Optional[CacheEntry]: 

242 # The marker for dict.pop is not the same as our marker. 

243 if default is self.__marker: 

244 entry = self._entries.pop(key) 

245 else: 

246 entry = self._entries.pop(key, self.__marker) 

247 # Should not attempt to correct for this entry being removed 

248 # if we got the default value. 

249 if entry is self.__marker: 

250 return default 

251 

252 self._decrement(entry) 

253 # The default entry given to this method may not even be in the cache. 

254 if entry and entry.ref in self._ref_map: 

255 keys = self._ref_map[entry.ref] 

256 if key in keys: 

257 keys.remove(key) 

258 return entry 

259 

260 def get_dataset_keys(self, dataset_id: Optional[DatasetId]) -> Optional[List[str]]: 

261 """Retrieve all keys associated with the given dataset ID. 

262 

263 Parameters 

264 ---------- 

265 dataset_id : `DatasetId` or `None` 

266 The dataset ID to look up. Returns `None` if the ID is `None`. 

267 

268 Returns 

269 ------- 

270 keys : `list` [`str`] 

271 Keys associated with this dataset. These keys can be used to look up 

272 the cache entry information in the `CacheRegistry`. Returns 

273 `None` if the dataset is not known to the cache. 

274 """ 

275 if dataset_id not in self._ref_map: 

276 return None 

277 keys = self._ref_map[dataset_id] 

278 if not keys: 

279 return None 

280 return keys 

281 
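# Illustrative sketch, not part of the original module: CacheRegistry keeps the
# running byte total and the DatasetId -> key mapping in step with the entries.
# The integer dataset ID and the timestamp are made-up example values.
def _example_cache_registry() -> None:
    registry = CacheRegistry()
    entry = CacheEntry(
        name="1234_wcs.fits",
        size=100,
        ref=1234,
        component="wcs",
        ctime=datetime.datetime.utcfromtimestamp(0),
    )
    registry["1234_wcs.fits"] = entry
    assert registry.cache_size == 100
    assert registry.get_dataset_keys(1234) == ["1234_wcs.fits"]
    # Deleting the entry decrements the size and clears the reverse mapping.
    del registry["1234_wcs.fits"]
    assert registry.cache_size == 0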

282 

283class DatastoreCacheManagerConfig(ConfigSubset): 

284 """Configuration information for `DatastoreCacheManager`.""" 

285 

286 component = "cached" 

287 requiredKeys = ("cacheable",) 

288 
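# Illustrative configuration sketch, pieced together from the keys read in
# DatastoreCacheManager.__init__ below rather than copied from daf_butler
# documentation; the storage class name "ExposureF" is a placeholder:
#
#   cached:
#     root: null          # or a local directory path
#     default: false      # cache nothing unless matched below
#     cacheable:
#       ExposureF: true
#     expiry:
#       mode: datasets    # one of: files, datasets, size, age
#       threshold: 5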

289 

290class AbstractDatastoreCacheManager(ABC): 

291 """An abstract base class for managing caching in a Datastore. 

292 

293 Parameters 

294 ---------- 

295 config : `str` or `DatastoreCacheManagerConfig` 

296 Configuration to control caching. 

297 universe : `DimensionUniverse` 

298 Set of all known dimensions, used to expand and validate any used 

299 in lookup keys. 

300 """ 

301 

302 @property 

303 def cache_size(self) -> int: 

304 """Size of the cache in bytes.""" 

305 return 0 

306 

307 @property 

308 def file_count(self) -> int: 

309 """Return number of cached files tracked by registry.""" 

310 return 0 

311 

312 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

313 if not isinstance(config, DatastoreCacheManagerConfig): 

314 config = DatastoreCacheManagerConfig(config) 

315 assert isinstance(config, DatastoreCacheManagerConfig) 

316 self.config = config 

317 

318 @abstractmethod 

319 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

320 """Indicate whether the entity should be added to the cache. 

321 

322 This is relevant when reading or writing. 

323 

324 Parameters 

325 ---------- 

326 entity : `StorageClass` or `DatasetType` or `DatasetRef` 

327 Thing to test against the configuration. The ``name`` property 

328 is used to determine a match. A `DatasetType` will first check 

329 its name, before checking its `StorageClass`. If there are no 

330 matches the default will be returned. 

331 

332 Returns 

333 ------- 

334 should_cache : `bool` 

335 Returns `True` if the dataset should be cached; `False` otherwise. 

336 """ 

337 raise NotImplementedError() 

338 

339 @abstractmethod 

340 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

341 """Report if the dataset is known to the cache. 

342 

343 Parameters 

344 ---------- 

345 ref : `DatasetRef` 

346 Dataset to check for in the cache. 

347 extension : `str`, optional 

348 File extension expected. Should include the leading "``.``". 

349 If `None` the extension is ignored and the dataset ID alone is 

350 used to check in the cache. The extension must be defined if 

351 a specific component is being checked. 

352 

353 Returns 

354 ------- 

355 known : `bool` 

356 Returns `True` if the dataset is currently known to the cache 

357 and `False` otherwise. 

358 

359 Notes 

360 ----- 

361 This method can only report if the dataset is known to the cache 

362 in this specific instant and does not indicate whether the file 

363 can be read from the cache later. `find_in_cache()` should be called 

364 if the cached file is to be used. 

365 """ 

366 raise NotImplementedError() 

367 

368 @abstractmethod 

369 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

370 """Move a file to the cache. 

371 

372 Move the given file into the cache, using the supplied DatasetRef 

373 for naming. A call is made to `should_be_cached()` and if the 

374 DatasetRef should not be accepted `None` will be returned. 

375 

376 Cache expiry can occur as part of this call. 

377 

378 Parameters 

379 ---------- 

380 uri : `lsst.resources.ResourcePath` 

381 Location of the file to be relocated to the cache. Will be moved. 

382 ref : `DatasetRef` 

383 Ref associated with this file. Will be used to determine the name 

384 of the file within the cache. 

385 

386 Returns 

387 ------- 

388 new : `lsst.resources.ResourcePath` or `None` 

389 URI to the file within the cache, or `None` if the dataset 

390 was not accepted by the cache. 

391 """ 

392 raise NotImplementedError() 

393 

394 @abstractmethod 

395 @contextlib.contextmanager 

396 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

397 """Look for a dataset in the cache and return its location. 

398 

399 Parameters 

400 ---------- 

401 ref : `DatasetRef` 

402 Dataset to locate in the cache. 

403 extension : `str` 

404 File extension expected. Should include the leading "``.``". 

405 

406 Yields 

407 ------ 

408 uri : `lsst.resources.ResourcePath` or `None` 

409 The URI to the cached file, or `None` if the file has not been 

410 cached. 

411 

412 Notes 

413 ----- 

414 Should be used as a context manager in order to prevent this 

415 file from being removed from the cache for that context. 

416 """ 

417 raise NotImplementedError() 

418 

419 @abstractmethod 

420 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

421 """Remove the specified datasets from the cache. 

422 

423 It is not an error for these datasets to be missing from the cache. 

424 

425 Parameters 

426 ---------- 

427 ref : `DatasetRef` or iterable of `DatasetRef` 

428 The datasets to remove from the cache. 

429 """ 

430 raise NotImplementedError() 

431 

432 @abstractmethod 

433 def __str__(self) -> str: 

434 raise NotImplementedError() 

435 

436 

437class DatastoreCacheManager(AbstractDatastoreCacheManager): 

438 """A class for managing caching in a Datastore using local files. 

439 

440 Parameters 

441 ---------- 

442 config : `str` or `DatastoreCacheManagerConfig` 

443 Configuration to control caching. 

444 universe : `DimensionUniverse` 

445 Set of all known dimensions, used to expand and validate any used 

446 in lookup keys. 

447 

448 Notes 

449 ----- 

450 Two environment variables can be used to override the cache directory 

451 and expiration configuration: 

452 

453 * ``$DAF_BUTLER_CACHE_DIRECTORY`` 

454 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE`` 

455 

456 The expiration mode should take the form ``mode=threshold``; for 

457 example, to limit the cache directory to 5 datasets the value would 

458 be ``datasets=5``. 

459 

460 Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment 

461 variable can be used to indicate that this directory should be used 

462 if no explicit directory has been specified from configuration or from 

463 the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable. 

464 """ 
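    # Illustrative sketch of the overrides described above (the directory path is
    # an arbitrary example): setting these before a butler is created affects every
    # cache manager constructed in that process.
    #
    #   export DAF_BUTLER_CACHE_DIRECTORY=/scratch/butler-cache
    #   export DAF_BUTLER_CACHE_EXPIRATION_MODE="datasets=5"
    #
    # Other modes follow the same mode=threshold form: "files=N", "size=N" with N
    # in bytes, and "age=N" with N in seconds (see _expire_cache()).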

465 

466 _temp_exemption_prefix = "exempt/" 

467 _tmpdir_prefix = "butler-cache-dir-" 

468 

469 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

470 super().__init__(config, universe) 

471 

472 # Set cache directory if it pre-exists, else defer creation until 

473 # requested. Allow external override from environment. 

474 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root") 

475 

476 # Allow the execution environment to supply a fallback directory 

477 # so long as no value was set by the line above. 

478 if root is None: 

479 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET") 

480 

481 self._cache_directory = ( 

482 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None 

483 ) 

484 

485 if self._cache_directory: 

486 if not self._cache_directory.isLocal: 

487 raise ValueError( 

488 f"Cache directory must be on a local file system. Got: {self._cache_directory}" 

489 ) 

490 # Ensure that the cache directory is created. We assume that 

491 # someone specifying a permanent cache directory will be expecting 

492 # it to always be there. This will also trigger an error 

493 # early rather than waiting until the cache is needed. 

494 self._cache_directory.mkdir() 

495 

496 # Calculate the caching lookup table. 

497 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe) 

498 

499 # Default decision for whether a dataset should be cached. 

500 self._caching_default = self.config.get("default", False) 

501 

502 # Expiration mode. Read from config but allow override from 

503 # the environment. 

504 expiration_mode = self.config.get(("expiry", "mode")) 

505 threshold = self.config.get(("expiry", "threshold")) 

506 

507 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE") 

508 if external_mode and "=" in external_mode: 

509 expiration_mode, expiration_threshold = external_mode.split("=", 1) 

510 threshold = int(expiration_threshold) 

511 if expiration_mode is None: 

512 # Force to None to avoid confusion. 

513 threshold = None 

514 

515 self._expiration_mode: Optional[str] = expiration_mode 

516 self._expiration_threshold: Optional[int] = threshold 

517 if self._expiration_threshold is None and self._expiration_mode is not None: 

518 raise ValueError( 

519 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}" 

520 ) 

521 

522 log.debug( 

523 "Cache configuration:\n- root: %s\n- expiration mode: %s", 

524 self._cache_directory if self._cache_directory else "tmpdir", 

525 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled", 

526 ) 

527 

528 # Files in cache, indexed by path within the cache directory. 

529 self._cache_entries = CacheRegistry() 

530 

531 @property 

532 def cache_directory(self) -> ResourcePath: 

533 if self._cache_directory is None: 

534 # Create on demand. Allow the override environment variable 

535 # to be used in case it got set after this object was created 

536 # but before a cache was used. 

537 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

538 # Someone else will clean this up. 

539 isTemporary = False 

540 msg = "deferred fallback" 

541 else: 

542 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix) 

543 isTemporary = True 

544 msg = "temporary" 

545 

546 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary) 

547 log.debug("Using %s cache directory at %s", msg, self._cache_directory) 

548 

549 # Remove when we no longer need it. 

550 if isTemporary: 

551 atexit.register(remove_cache_directory, self._cache_directory.ospath) 

552 return self._cache_directory 

553 

554 @property 

555 def _temp_exempt_directory(self) -> ResourcePath: 

556 """Return the directory in which to store temporary cache files that 

557 should not be expired. 

558 """ 

559 return self.cache_directory.join(self._temp_exemption_prefix) 

560 

561 @property 

562 def cache_size(self) -> int: 

563 return self._cache_entries.cache_size 

564 

565 @property 

566 def file_count(self) -> int: 

567 return len(self._cache_entries) 

568 

569 @classmethod 

570 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]: 

571 """Define a fallback cache directory if one is not already set. 

572 

573 Returns 

574 ------- 

575 defined : `bool` 

576 `True` if the fallback directory was newly-defined in this method. 

577 `False` if it had already been set. 

578 cache_dir : `str` 

579 Returns the path to the cache directory that will be used if it's 

580 needed. This can allow the caller to run a directory cleanup 

581 when it's no longer needed (something that the cache manager 

582 can not do because forks should not clean up directories defined 

583 by the parent process). 

584 

585 Notes 

586 ----- 

587 The fallback directory will not be defined if one has already been 

588 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` 

589 environment variable only if a value has not previously been stored 

590 in that environment variable. Setting the environment variable allows 

591 this value to survive into spawned subprocesses. Calling this method 

592 will lead to all subsequently created cache managers sharing the same 

593 cache. 

594 """ 

595 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"): 

596 # A value has already been set. 

597 return (False, cache_dir) 

598 

599 # As a class method, we do not know at this point whether a cache 

600 # directory will be needed so it would be impolite to create a 

601 # directory that will never be used. 

602 

603 # Construct our own temp name -- 16 characters should have a fairly 

604 # low chance of clashing when combined with the process ID. 

605 characters = "abcdefghijklmnopqrstuvwxyz0123456789_" 

606 rng = Random() 

607 tempchars = "".join(rng.choice(characters) for _ in range(16)) 

608 

609 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}" 

610 

611 cache_dir = os.path.join(tempfile.gettempdir(), tempname) 

612 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir 

613 return (True, cache_dir) 
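        # Illustrative sketch, not part of the module: a parent process that wants
        # all subprocess butlers to share one cache could call this before forking
        # and clean up afterwards, since (as noted above) the cache manager will
        # not remove a fallback directory on behalf of its parent.
        #
        #   defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
        #   ...  # spawn subprocesses that construct their own cache managers
        #   if defined:
        #       shutil.rmtree(cache_dir, ignore_errors=True)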

614 

615 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

616 # Docstring inherited 

617 matchName: Union[LookupKey, str] = "{} (via default)".format(entity) 

618 should_cache = self._caching_default 

619 

620 for key in entity._lookupNames(): 

621 if key in self._lut: 

622 should_cache = bool(self._lut[key]) 

623 matchName = key 

624 break 

625 

626 if not isinstance(should_cache, bool): 

627 raise TypeError( 

628 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool." 

629 ) 

630 

631 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not") 

632 return should_cache 

633 

634 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath: 

635 """Construct the name to use for this dataset in the cache. 

636 

637 Parameters 

638 ---------- 

639 ref : `DatasetRef` 

640 The dataset to look up in or write to the cache. 

641 extension : `str` 

642 File extension to use for this file. Should include the 

643 leading "``.``". 

644 

645 Returns 

646 ------- 

647 uri : `lsst.resources.ResourcePath` 

648 URI to use for this dataset in the cache. 

649 """ 

650 return _construct_cache_path(self.cache_directory, ref, extension) 

651 

652 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

653 # Docstring inherited 

654 if ref.id is None: 

655 raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})") 

656 

657 if not self.should_be_cached(ref): 

658 return None 

659 

660 # Write the file using the id of the dataset ref and the file 

661 # extension. 

662 cached_location = self._construct_cache_name(ref, uri.getExtension()) 

663 

664 # Run cache expiry to ensure that we have room for this 

665 # item. 

666 self._expire_cache() 

667 

668 # The above reset the in-memory cache status. It's entirely possible 

669 # that another process has just cached this file (if multiple 

670 # processes are caching on read), so check our in-memory cache 

671 # before attempting to cache the dataset. 

672 path_in_cache = cached_location.relative_to(self.cache_directory) 

673 if path_in_cache and path_in_cache in self._cache_entries: 

674 return cached_location 

675 

676 # Move into the cache. Given that multiple processes might be 

677 # sharing a single cache directory, and the file we need might have 

678 # been copied in whilst we were checking, allow overwrite without 

679 # complaint. Even for a private cache directory it is possible that 

680 # a second butler in a subprocess could be writing to it. 

681 cached_location.transfer_from(uri, transfer="move", overwrite=True) 

682 log.debug("Cached dataset %s to %s", ref, cached_location) 

683 

684 self._register_cache_entry(cached_location) 

685 

686 return cached_location 

687 

688 @contextlib.contextmanager 

689 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

690 # Docstring inherited 

691 # Short circuit this if the cache directory has not been created yet. 

692 if self._cache_directory is None: 

693 yield None 

694 return 

695 

696 cached_location = self._construct_cache_name(ref, extension) 

697 if cached_location.exists(): 

698 log.debug("Found cached file %s for dataset %s.", cached_location, ref) 

699 

700 # The cached file could be removed by another process doing 

701 # cache expiration so we need to protect against that by making 

702 # a copy in a different tree. Use hardlinks to ensure that 

703 # we either have the cached file or we don't. This is robust 

704 # against race conditions that can be caused by using soft links 

705 # and the other end of the link being deleted just after it 

706 # is created. 

707 path_in_cache = cached_location.relative_to(self.cache_directory) 

708 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory" 

709 temp_location: Optional[ResourcePath] = self._temp_exempt_directory.join(path_in_cache) 

710 try: 

711 if temp_location is not None: 

712 temp_location.transfer_from(cached_location, transfer="hardlink") 

713 except Exception as e: 

714 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e) 

715 # Any failure will be treated as if the file was not 

716 # in the cache. Yielding the original cache location 

717 # is too dangerous. 

718 temp_location = None 

719 

720 try: 

721 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref) 

722 yield temp_location 

723 finally: 

724 try: 

725 if temp_location: 

726 temp_location.remove() 

727 except FileNotFoundError: 

728 pass 

729 return 

730 

731 log.debug("Dataset %s not found in cache.", ref) 

732 yield None 

733 return 

734 

735 def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

736 # Docstring inherited. 

737 

738 # Stop early if there are no cache entries anyhow. 

739 if len(self._cache_entries) == 0: 

740 return 

741 

742 if isinstance(refs, DatasetRef): 

743 refs = [refs] 

744 

745 # Create a set of all the IDs 

746 all_ids = {ref.getCheckedId() for ref in refs} 

747 

748 keys_to_remove = [] 

749 for key, entry in self._cache_entries.items(): 

750 if entry.ref in all_ids: 

751 keys_to_remove.append(key) 

752 self._remove_from_cache(keys_to_remove) 

753 

754 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> Optional[str]: 

755 """Record the file in the cache registry. 

756 

757 Parameters 

758 ---------- 

759 cached_location : `lsst.resources.ResourcePath` 

760 Location of the file to be registered. 

761 can_exist : `bool`, optional 

762 If `True` the item being registered can already be listed. 

763 This can allow a cache refresh to run without checking the 

764 file again. If `False` it is an error for the registry to 

765 already know about this file. 

766 

767 Returns 

768 ------- 

769 cache_key : `str` or `None` 

770 The key used in the registry for this file. `None` if the file 

771 no longer exists (it could have been expired by another process). 

772 """ 

773 path_in_cache = cached_location.relative_to(self.cache_directory) 

774 if path_in_cache is None: 

775 raise ValueError( 

776 f"Can not register cached file {cached_location} that is not within" 

777 f" the cache directory at {self.cache_directory}." 

778 ) 

779 if path_in_cache in self._cache_entries: 

780 if can_exist: 

781 return path_in_cache 

782 else: 

783 raise ValueError( 

784 f"Cached file {cached_location} is already known to the registry" 

785 " but this was expected to be a new file." 

786 ) 

787 try: 

788 details = CacheEntry.from_file(cached_location, root=self.cache_directory) 

789 except FileNotFoundError: 

790 return None 

791 self._cache_entries[path_in_cache] = details 

792 return path_in_cache 

793 

794 def scan_cache(self) -> None: 

795 """Scan the cache directory and record information about files.""" 

796 found = set() 

797 for file in ResourcePath.findFileResources([self.cache_directory]): 

798 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator" 

799 

800 # Skip any that are found in an exempt part of the hierarchy 

801 # since they should not be part of the registry. 

802 if file.relative_to(self._temp_exempt_directory) is not None: 

803 continue 

804 

805 path_in_cache = self._register_cache_entry(file, can_exist=True) 

806 if path_in_cache: 

807 found.add(path_in_cache) 

808 

809 # Find any files that were recorded in the cache but are no longer 

810 # on disk. (something else cleared them out?) 

811 known_to_cache = set(self._cache_entries) 

812 missing = known_to_cache - found 

813 

814 if missing: 

815 log.debug( 

816 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing) 

817 ) 

818 for path_in_cache in missing: 

819 self._cache_entries.pop(path_in_cache, None) 

820 

821 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

822 """Report if the dataset is known to the cache. 

823 

824 Parameters 

825 ---------- 

826 ref : `DatasetRef` 

827 Dataset to check for in the cache. 

828 extension : `str`, optional 

829 File extension expected. Should include the leading "``.``". 

830 If `None` the extension is ignored and the dataset ID alone is 

831 used to check in the cache. The extension must be defined if 

832 a specific component is being checked. 

833 

834 Returns 

835 ------- 

836 known : `bool` 

837 Returns `True` if the dataset is currently known to the cache 

838 and `False` otherwise. If the dataset refers to a component and 

839 an extension is given then only that component is checked. 

840 

841 Notes 

842 ----- 

843 This method can only report if the dataset is known to the cache 

844 in this specific instant and does not indicate whether the file 

845 can be read from the cache later. `find_in_cache()` should be called 

846 if the cached file is to be used. 

847 

848 This method does not force the cache to be re-scanned and so can miss 

849 cached datasets that have recently been written by other processes. 

850 """ 

851 if self._cache_directory is None: 

852 return False 

853 if self.file_count == 0: 

854 return False 

855 

856 if extension is None: 

857 # Look solely for matching dataset ref ID and not specific 

858 # components. 

859 cached_paths = self._cache_entries.get_dataset_keys(ref.id) 

860 return bool(cached_paths) 

861 

862 else: 

863 # Extension is known so we can do an explicit look up for the 

864 # cache entry. 

865 cached_location = self._construct_cache_name(ref, extension) 

866 path_in_cache = cached_location.relative_to(self.cache_directory) 

867 assert path_in_cache is not None # For mypy 

868 return path_in_cache in self._cache_entries 

869 

870 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None: 

871 """Remove the specified cache entries from cache. 

872 

873 Parameters 

874 ---------- 

875 cache_entries : iterable of `str` 

876 The entries to remove from the cache. The values are the path 

877 within the cache. 

878 """ 

879 for entry in cache_entries: 

880 path = self.cache_directory.join(entry) 

881 

882 self._cache_entries.pop(entry, None) 

883 log.debug("Removing file from cache: %s", path) 

884 try: 

885 path.remove() 

886 except FileNotFoundError: 

887 pass 

888 

889 def _expire_cache(self) -> None: 

890 """Expire the files in the cache. 

891 

892 Notes 

893 ----- 

894 The expiration modes are defined by the config or can be overridden. 

895 Available options: 

896 

897 * ``files``: Number of files. 

898 * ``datasets``: Number of datasets. 

899 * ``size``: Total size of files. 

900 * ``age``: Age of files. 

901 

902 The first three remove the oldest entries first. 

903 Number of files is complicated by the possibility of disassembled 

904 composites where 10 small files can be created for each dataset. 

905 

906 Additionally there is a use case for an external user to explicitly 

907 state the dataset refs that should be cached and when to remove 

908 them, overriding any global configuration. 

909 """ 

910 if self._expiration_mode is None: 

911 # Expiration has been disabled. 

912 return 

913 

914 # mypy can't be sure we have set a threshold properly 

915 if self._expiration_threshold is None: 

916 log.warning( 

917 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode 

918 ) 

919 return 

920 

921 # Sync up cache. There is no file locking involved so for a shared 

922 # cache multiple processes may be racing to delete files. Deleting 

923 # a file that no longer exists is not an error. 

924 self.scan_cache() 

925 

926 if self._expiration_mode == "files": 

927 n_files = len(self._cache_entries) 

928 n_over = n_files - self._expiration_threshold 

929 if n_over > 0: 

930 sorted_keys = self._sort_cache() 

931 keys_to_remove = sorted_keys[:n_over] 

932 self._remove_from_cache(keys_to_remove) 

933 return 

934 

935 if self._expiration_mode == "datasets": 

936 # Count the datasets, in ascending timestamp order, 

937 # so that oldest turn up first. 

938 datasets = defaultdict(list) 

939 for key in self._sort_cache(): 

940 entry = self._cache_entries[key] 

941 datasets[entry.ref].append(key) 

942 

943 n_datasets = len(datasets) 

944 n_over = n_datasets - self._expiration_threshold 

945 if n_over > 0: 

946 # Keys will be read out in insert order which 

947 # will be date order so oldest ones are removed. 

948 ref_ids = list(datasets.keys())[:n_over] 

949 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids)) 

950 self._remove_from_cache(keys_to_remove) 

951 return 

952 

953 if self._expiration_mode == "size": 

954 if self.cache_size > self._expiration_threshold: 

955 for key in self._sort_cache(): 

956 self._remove_from_cache([key]) 

957 if self.cache_size <= self._expiration_threshold: 

958 break 

959 return 

960 

961 if self._expiration_mode == "age": 

962 now = datetime.datetime.utcnow() 

963 for key in self._sort_cache(): 

964 delta = now - self._cache_entries[key].ctime 

# Use total_seconds() so that ages of more than a day are handled correctly.

965 if delta.total_seconds() > self._expiration_threshold: 

966 self._remove_from_cache([key]) 

967 else: 

968 # We're already in date order. 

969 break 

970 return 

971 

972 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}") 

973 

974 def _sort_cache(self) -> List[str]: 

975 """Sort the cache entries by time and return the sorted keys. 

976 

977 Returns 

978 ------- 

979 sorted : `list` of `str` 

980 Keys into the cache, sorted by time with oldest first. 

981 """ 

982 

983 def sort_by_time(key: str) -> datetime.datetime: 

984 """Sorter key function using cache entry details.""" 

985 return self._cache_entries[key].ctime 

986 

987 return sorted(self._cache_entries, key=sort_by_time) 

988 

989 def __str__(self) -> str: 

990 cachedir = self._cache_directory if self._cache_directory else "<tempdir>" 

991 return ( 

992 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold}," 

993 f"default={self._caching_default}) " 

994 f"n_files={self.file_count}, n_bytes={self.cache_size}" 

995 ) 

996 
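# Illustrative usage sketch, not part of the original module: the flow a datastore
# would follow when reading a dataset. The ".fits" extension and the idea that the
# local file holds freshly retrieved bytes are assumptions for the example.
def _example_cache_round_trip(
    cache_manager: DatastoreCacheManager, ref: DatasetRef, local_file: ResourcePath
) -> Optional[bytes]:
    # Offer the local file to the cache; it is moved in, or refused with None if
    # the configuration says this dataset should not be cached.
    cache_manager.move_to_cache(local_file, ref)
    # Later reads hold the file via the context manager, so concurrent expiry by
    # another process cannot delete it while it is in use.
    with cache_manager.find_in_cache(ref, ".fits") as cached:
        if cached is not None:
            return cached.read()
    return None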

997 

998class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): 

999 """A variant of the datastore cache where no cache is enabled. 

1000 

1001 Parameters 

1002 ---------- 

1003 config : `str` or `DatastoreCacheManagerConfig` 

1004 Configuration to control caching. 

1005 universe : `DimensionUniverse` 

1006 Set of all known dimensions, used to expand and validate any used 

1007 in lookup keys. 

1008 """ 

1009 

1010 def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse): 

1011 return 

1012 

1013 def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool: 

1014 """Indicate whether the entity should be added to the cache. 

1015 

1016 Always returns `False`. 

1017 """ 

1018 return False 

1019 

1020 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]: 

1021 """Refuse to move a dataset to the cache, always returning `None`.""" 

1022 return None 

1023 

1024 @contextlib.contextmanager 

1025 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]: 

1026 """Look for a dataset in the cache and return its location. 

1027 

1028 Never finds a file. 

1029 """ 

1030 yield None 

1031 

1032 def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None: 

1033 """Remove datasets from cache. 

1034 

1035 Always does nothing. 

1036 """ 

1037 return 

1038 

1039 def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool: 

1040 """Report if a dataset is known to the cache. 

1041 

1042 Always returns `False`. 

1043 """ 

1044 return False 

1045 

1046 def __str__(self) -> str: 

1047 return f"{type(self).__name__}()"