Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 29%
394 statements
coverage.py v7.2.7, created at 2023-08-05 01:26 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Cache management for a datastore."""
24from __future__ import annotations
26__all__ = (
27 "AbstractDatastoreCacheManager",
28 "DatastoreDisabledCacheManager",
29 "DatastoreCacheManager",
30 "DatastoreCacheManagerConfig",
31)
33import atexit
34import contextlib
35import datetime
36import itertools
37import logging
38import os
39import shutil
40import tempfile
41import uuid
42from abc import ABC, abstractmethod
43from collections import defaultdict
44from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView
45from random import Random
46from typing import TYPE_CHECKING
48from lsst.daf.butler._compat import _BaseModelCompat
49from lsst.resources import ResourcePath
50from pydantic import PrivateAttr
52from .config import ConfigSubset
53from .configSupport import processLookupConfigs
54from .datasets import DatasetId, DatasetRef
56if TYPE_CHECKING:
57 from .configSupport import LookupKey
58 from .datasets import DatasetType
59 from .dimensions import DimensionUniverse
60 from .storageClass import StorageClass
62log = logging.getLogger(__name__)
65def remove_cache_directory(directory: str) -> None:
66 """Remove the specified directory and all its contents."""
67 log.debug("Removing temporary cache directory %s", directory)
68 shutil.rmtree(directory, ignore_errors=True)
71def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath:
72 """Construct the full path to use for this dataset in the cache.
74 Parameters
75 ----------
76 ref : `DatasetRef`
77 The dataset to look up in or write to the cache.
78 extension : `str`
79 File extension to use for this file. Should include the
80 leading "``.``".
82 Returns
83 -------
84 uri : `lsst.resources.ResourcePath`
85 URI to use for this dataset in the cache.
86 """
87 # Dataset type component is needed in the name if composite
88 # disassembly is happening since the ID is shared for all components.
89 component = ref.datasetType.component()
90 component = f"_{component}" if component else ""
91 return root.join(f"{ref.id}{component}{extension}")
94def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]:
95 """For a given cache name, return its component parts.
97 Changes to ``_construct_cache_path()`` should be reflected here.
99 Parameters
100 ----------
101 cached_location : `str`
102 The name of the file within the cache.
104 Returns
105 -------
106 id : `uuid.UUID`
107 The dataset ID.
108 component : `str` or `None`
109 The name of the component, if present.
110 extension : `str` or `None`
111 The file extension, if present.
112 """
113 # Assume first dot is the extension and so allow .fits.gz
114 root_ext = cached_location.split(".", maxsplit=1)
115 root = root_ext.pop(0)
116 ext = "." + root_ext.pop(0) if root_ext else None
118 parts = root.split("_")
119 id_ = uuid.UUID(parts.pop(0))
120 component = parts.pop(0) if parts else None
121 return id_, component, ext
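# Editorial example (not part of the module): a minimal sketch of the cache
# file-name convention handled by _construct_cache_path() and
# _parse_cache_name(). The UUID below is arbitrary; the first "." marks the
# start of the extension, so multi-part extensions such as ".fits.gz" survive.
#
#     >>> name = "8d5e9e0a-3a4a-4b5e-9f39-5a2c2f0d6a11_wcs.fits.gz"
#     >>> _parse_cache_name(name)
#     (UUID('8d5e9e0a-3a4a-4b5e-9f39-5a2c2f0d6a11'), 'wcs', '.fits.gz')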
124class CacheEntry(_BaseModelCompat):
125 """Represent an entry in the cache."""
127 name: str
128 """Name of the file."""
130 size: int
131 """Size of the file in bytes."""
133 ctime: datetime.datetime
134 """Creation time of the file."""
136 ref: DatasetId
137 """ID of this dataset."""
139 component: str | None = None
140 """Component for this disassembled composite (optional)."""
142 @classmethod
143 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry:
144 """Construct an object from a file name.
146 Parameters
147 ----------
148 file : `lsst.resources.ResourcePath`
149 Path to the file.
150 root : `lsst.resources.ResourcePath`
151 Cache root directory.
152 """
153 file_in_cache = file.relative_to(root)
154 if file_in_cache is None:
155 raise ValueError(f"Supplied file {file} is not inside root {root}")
156 id_, component, _ = _parse_cache_name(file_in_cache)
158 stat = os.stat(file.ospath)
159 return cls(
160 name=file_in_cache,
161 size=stat.st_size,
162 ref=id_,
163 component=component,
164 ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime),
165 )
168class _MarkerEntry(CacheEntry):
169 pass
172class CacheRegistry(_BaseModelCompat):
173 """Collection of cache entries."""
175 _size: int = PrivateAttr(0)
176 """Size of the cache."""
178 _entries: dict[str, CacheEntry] = PrivateAttr({})
179 """Internal collection of cache entries."""
181 _ref_map: dict[DatasetId, list[str]] = PrivateAttr({})
182 """Mapping of DatasetID to corresponding keys in cache registry."""
184 @property
185 def cache_size(self) -> int:
186 return self._size
188 def __getitem__(self, key: str) -> CacheEntry:
189 return self._entries[key]
191 def __setitem__(self, key: str, entry: CacheEntry) -> None:
192 self._size += entry.size
193 self._entries[key] = entry
195 # Update the mapping from ref to path.
196 if entry.ref not in self._ref_map:
197 self._ref_map[entry.ref] = []
198 self._ref_map[entry.ref].append(key)
200 def __delitem__(self, key: str) -> None:
201 entry = self._entries.pop(key)
202 self._decrement(entry)
203 self._ref_map[entry.ref].remove(key)
205 def _decrement(self, entry: CacheEntry | None) -> None:
206 if entry:
207 self._size -= entry.size
208 if self._size < 0:
209 log.warning("Cache size has gone negative. Inconsistent cache records...")
210 self._size = 0
212 def __contains__(self, key: str) -> bool:
213 return key in self._entries
215 def __len__(self) -> int:
216 return len(self._entries)
218 def __iter__(self) -> Iterator[str]: # type: ignore
219 return iter(self._entries)
221 def keys(self) -> KeysView[str]:
222 return self._entries.keys()
224 def values(self) -> ValuesView[CacheEntry]:
225 return self._entries.values()
227 def items(self) -> ItemsView[str, CacheEntry]:
228 return self._entries.items()
230 # A private marker to indicate that pop() should raise if no default
231 # is given.
232 __marker = _MarkerEntry(
233 name="marker",
234 size=0,
235 ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"),
236 ctime=datetime.datetime.utcfromtimestamp(0),
237 )
239 def pop(self, key: str, default: CacheEntry | None = __marker) -> CacheEntry | None:
240 # The marker for dict.pop is not the same as our marker.
241 if default is self.__marker:
242 entry = self._entries.pop(key)
243 else:
244 entry = self._entries.pop(key, self.__marker)
245 # Should not attempt to correct for this entry being removed
246 # if we got the default value.
247 if entry is self.__marker:
248 return default
250 self._decrement(entry)
251 # The default entry given to this method may not even be in the cache.
252 if entry and entry.ref in self._ref_map:
253 keys = self._ref_map[entry.ref]
254 if key in keys:
255 keys.remove(key)
256 return entry
258 def get_dataset_keys(self, dataset_id: DatasetId | None) -> list[str] | None:
259 """Retrieve all keys associated with the given dataset ID.
261 Parameters
262 ----------
263 dataset_id : `DatasetId` or `None`
264 The dataset ID to look up. Returns `None` if the ID is `None`.
266 Returns
267 -------
268 keys : `list` [`str`]
269 Keys associated with this dataset. These keys can be used to look up
270 the cache entry information in the `CacheRegistry`. Returns
271 `None` if the dataset is not known to the cache.
272 """
273 if dataset_id not in self._ref_map:
274 return None
275 keys = self._ref_map[dataset_id]
276 if not keys:
277 return None
278 return keys
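# Editorial example (not part of the module): a minimal sketch of how the
# registry keeps its running byte total and per-dataset index in step with
# the entries. The dataset ID and size are arbitrary.
#
#     >>> dataset_id = uuid.uuid4()
#     >>> registry = CacheRegistry()
#     >>> registry["entry.fits"] = CacheEntry(
#     ...     name="entry.fits", size=1024, ref=dataset_id,
#     ...     ctime=datetime.datetime.utcnow())
#     >>> registry.cache_size
#     1024
#     >>> registry.get_dataset_keys(dataset_id)
#     ['entry.fits']
#     >>> registry.pop("entry.fits", None).size
#     1024
#     >>> registry.cache_size
#     0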
281class DatastoreCacheManagerConfig(ConfigSubset):
282 """Configuration information for `DatastoreCacheManager`."""
284 component = "cached"
285 requiredKeys = ("cacheable",)
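# Editorial example (not part of the module): a hedged sketch of the shape of
# the "cached" configuration section this class validates, written as a plain
# mapping. The storage class names and thresholds are arbitrary, and passing
# a dict like this assumes ConfigSubset accepts mappings directly.
#
#     cache_config = DatastoreCacheManagerConfig({
#         "cacheable": {"ExposureF": True, "Catalog": False},
#         "default": False,
#         "expiry": {"mode": "datasets", "threshold": 20},
#     })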
288class AbstractDatastoreCacheManager(ABC):
289 """An abstract base class for managing caching in a Datastore.
291 Parameters
292 ----------
293 config : `str` or `DatastoreCacheManagerConfig`
294 Configuration to control caching.
295 universe : `DimensionUniverse`
296 Set of all known dimensions, used to expand and validate any used
297 in lookup keys.
298 """
300 @property
301 def cache_size(self) -> int:
302 """Size of the cache in bytes."""
303 return 0
305 @property
306 def file_count(self) -> int:
307 """Return number of cached files tracked by registry."""
308 return 0
310 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
311 if not isinstance(config, DatastoreCacheManagerConfig):
312 config = DatastoreCacheManagerConfig(config)
313 assert isinstance(config, DatastoreCacheManagerConfig)
314 self.config = config
316 @abstractmethod
317 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
318 """Indicate whether the entity should be added to the cache.
320 This is relevant when reading or writing.
322 Parameters
323 ----------
324 entity : `StorageClass` or `DatasetType` or `DatasetRef`
325 Thing to test against the configuration. The ``name`` property
326 is used to determine a match. A `DatasetType` will first check
327 its name, before checking its `StorageClass`. If there are no
328 matches the default will be returned.
330 Returns
331 -------
332 should_cache : `bool`
333 Returns `True` if the dataset should be cached; `False` otherwise.
334 """
335 raise NotImplementedError()
337 @abstractmethod
338 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
339 """Report if the dataset is known to the cache.
341 Parameters
342 ----------
343 ref : `DatasetRef`
344 Dataset to check for in the cache.
345 extension : `str`, optional
346 File extension expected. Should include the leading "``.``".
347 If `None` the extension is ignored and the dataset ID alone is
348 used to check in the cache. The extension must be defined if
349 a specific component is being checked.
351 Returns
352 -------
353 known : `bool`
354 Returns `True` if the dataset is currently known to the cache
355 and `False` otherwise.
357 Notes
358 -----
359 This method can only report if the dataset is known to the cache
360 in this specific instant and does not indicate whether the file
361 can be read from the cache later. `find_in_cache()` should be called
362 if the cached file is to be used.
363 """
364 raise NotImplementedError()
366 @abstractmethod
367 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
368 """Move a file to the cache.
370 Move the given file into the cache, using the supplied DatasetRef
371 for naming. A call is made to `should_be_cached()` and if the
372 DatasetRef should not be accepted `None` will be returned.
374 Cache expiry can occur during this.
376 Parameters
377 ----------
378 uri : `lsst.resources.ResourcePath`
379 Location of the file to be relocated to the cache. Will be moved.
380 ref : `DatasetRef`
381 Ref associated with this file. Will be used to determine the name
382 of the file within the cache.
384 Returns
385 -------
386 new : `lsst.resources.ResourcePath` or `None`
387 URI to the file within the cache, or `None` if the dataset
388 was not accepted by the cache.
389 """
390 raise NotImplementedError()
392 @abstractmethod
393 @contextlib.contextmanager
394 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
395 """Look for a dataset in the cache and return its location.
397 Parameters
398 ----------
399 ref : `DatasetRef`
400 Dataset to locate in the cache.
401 extension : `str`
402 File extension expected. Should include the leading "``.``".
404 Yields
405 ------
406 uri : `lsst.resources.ResourcePath` or `None`
407 The URI to the cached file, or `None` if the file has not been
408 cached.
410 Notes
411 -----
412 Should be used as a context manager in order to prevent this
413 file from being removed from the cache for that context.
414 """
415 raise NotImplementedError()
417 @abstractmethod
418 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None:
419 """Remove the specified datasets from the cache.
421 It is not an error for these datasets to be missing from the cache.
423 Parameters
424 ----------
425 ref : `DatasetRef` or iterable of `DatasetRef`
426 The datasets to remove from the cache.
427 """
428 raise NotImplementedError()
430 @abstractmethod
431 def __str__(self) -> str:
432 raise NotImplementedError()
435class DatastoreCacheManager(AbstractDatastoreCacheManager):
436 """A class for managing caching in a Datastore using local files.
438 Parameters
439 ----------
440 config : `str` or `DatastoreCacheManagerConfig`
441 Configuration to control caching.
442 universe : `DimensionUniverse`
443 Set of all known dimensions, used to expand and validate any used
444 in lookup keys.
446 Notes
447 -----
448 Two environment variables can be used to override the cache directory
449 and expiration configuration:
451 * ``$DAF_BUTLER_CACHE_DIRECTORY``
452 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``
454 The expiration mode should take the form ``mode=threshold``; for
455 example, to limit the cache directory to 5 datasets the value
456 would be ``datasets=5``.
458 Additionally, the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment
459 variable can be used to name a directory to fall back to if no explicit
460 directory has been specified from configuration or from the
461 ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable.
462 """
464 _temp_exemption_prefix = "exempt/"
465 _tmpdir_prefix = "butler-cache-dir-"
467 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
468 super().__init__(config, universe)
470 # Set cache directory if it pre-exists, else defer creation until
471 # requested. Allow external override from environment.
472 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")
474 # Allow the execution environment to override the default values
475 # so long as no default value has been set from the line above.
476 if root is None:
477 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET")
479 self._cache_directory = (
480 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None
481 )
483 if self._cache_directory:
484 if not self._cache_directory.isLocal:
485 raise ValueError(
486 f"Cache directory must be on a local file system. Got: {self._cache_directory}"
487 )
488 # Ensure that the cache directory is created. We assume that
489 # someone specifying a permanent cache directory will be expecting
490 # it to always be there. This will also trigger an error
491 # early rather than waiting until the cache is needed.
492 self._cache_directory.mkdir()
494 # Calculate the caching lookup table.
495 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)
497 # Default decision for whether a dataset should be cached.
498 self._caching_default = self.config.get("default", False)
500 # Expiration mode. Read from config but allow override from
501 # the environment.
502 expiration_mode = self.config.get(("expiry", "mode"))
503 threshold = self.config.get(("expiry", "threshold"))
505 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
506 if external_mode and "=" in external_mode:
507 expiration_mode, expiration_threshold = external_mode.split("=", 1)
508 threshold = int(expiration_threshold)
509 if expiration_mode is None:
510 # Force to None to avoid confusion.
511 threshold = None
513 self._expiration_mode: str | None = expiration_mode
514 self._expiration_threshold: int | None = threshold
515 if self._expiration_threshold is None and self._expiration_mode is not None:
516 raise ValueError(
517 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
518 )
520 log.debug(
521 "Cache configuration:\n- root: %s\n- expiration mode: %s",
522 self._cache_directory if self._cache_directory else "tmpdir",
523 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
524 )
526 # Files in cache, indexed by path within the cache directory.
527 self._cache_entries = CacheRegistry()
529 @property
530 def cache_directory(self) -> ResourcePath:
531 if self._cache_directory is None:
532 # Create on demand. Allow the override environment variable
533 # to be used in case it got set after this object was created
534 # but before a cache was used.
535 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
536 # Someone else will clean this up.
537 isTemporary = False
538 msg = "deferred fallback"
539 else:
540 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix)
541 isTemporary = True
542 msg = "temporary"
544 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary)
545 log.debug("Using %s cache directory at %s", msg, self._cache_directory)
547 # Remove when we no longer need it.
548 if isTemporary:
549 atexit.register(remove_cache_directory, self._cache_directory.ospath)
550 return self._cache_directory
552 @property
553 def _temp_exempt_directory(self) -> ResourcePath:
554 """Return the directory in which to store temporary cache files that
555 should not be expired.
556 """
557 return self.cache_directory.join(self._temp_exemption_prefix)
559 @property
560 def cache_size(self) -> int:
561 return self._cache_entries.cache_size
563 @property
564 def file_count(self) -> int:
565 return len(self._cache_entries)
567 @classmethod
568 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]:
569 """Define a fallback cache directory if a fallback not set already.
571 Returns
572 -------
573 defined : `bool`
574 `True` if the fallback directory was newly-defined in this method.
575 `False` if it had already been set.
576 cache_dir : `str`
577 Returns the path to the cache directory that will be used if it's
578 needed. This can allow the caller to run a directory cleanup
579 when it's no longer needed (something that the cache manager
580 can not do because forks should not clean up directories defined
581 by the parent process).
583 Notes
584 -----
585 The fallback directory will not be defined if one has already been
586 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET``
587 environment variable only if a value has not previously been stored
588 in that environment variable. Setting the environment variable allows
589 this value to survive into spawned subprocesses. Calling this method
590 will lead to all subsequently created cache managers sharing the same
591 cache.
592 """
593 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
594 # A value has already been set.
595 return (False, cache_dir)
597 # As a class method, we do not know at this point whether a cache
598 # directory will be needed so it would be impolite to create a
599 # directory that will never be used.
601 # Construct our own temp name -- 16 characters should have a fairly
602 # low chance of clashing when combined with the process ID.
603 characters = "abcdefghijklmnopqrstuvwxyz0123456789_"
604 rng = Random()
605 tempchars = "".join(rng.choice(characters) for _ in range(16))
607 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}"
609 cache_dir = os.path.join(tempfile.gettempdir(), tempname)
610 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir
611 return (True, cache_dir)
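# Editorial example (not part of the module): a minimal sketch of a parent
# process defining a shared fallback cache before spawning workers and then
# cleaning it up itself, as the Notes above suggest. The worker launch is
# elided.
#
#     defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
#     try:
#         ...  # spawn subprocesses; cache managers they create may share cache_dir
#     finally:
#         if defined:
#             shutil.rmtree(cache_dir, ignore_errors=True)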
613 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
614 # Docstring inherited
615 matchName: LookupKey | str = f"{entity} (via default)"
616 should_cache = self._caching_default
618 for key in entity._lookupNames():
619 if key in self._lut:
620 should_cache = bool(self._lut[key])
621 matchName = key
622 break
624 if not isinstance(should_cache, bool):
625 raise TypeError(
626 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
627 )
629 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
630 return should_cache
632 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath:
633 """Construct the name to use for this dataset in the cache.
635 Parameters
636 ----------
637 ref : `DatasetRef`
638 The dataset to look up in or write to the cache.
639 extension : `str`
640 File extension to use for this file. Should include the
641 leading "``.``".
643 Returns
644 -------
645 uri : `lsst.resources.ResourcePath`
646 URI to use for this dataset in the cache.
647 """
648 return _construct_cache_path(self.cache_directory, ref, extension)
650 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
651 # Docstring inherited
652 if not self.should_be_cached(ref):
653 return None
655 # Write the file using the id of the dataset ref and the file
656 # extension.
657 cached_location = self._construct_cache_name(ref, uri.getExtension())
659 # Run cache expiry to ensure that we have room for this
660 # item.
661 self._expire_cache()
663 # The above reset the in-memory cache status. It's entirely possible
664 # that another process has just cached this file (if multiple
665 # processes are caching on read), so check our in-memory cache
666 # before attempting to cache the dataset.
667 path_in_cache = cached_location.relative_to(self.cache_directory)
668 if path_in_cache and path_in_cache in self._cache_entries:
669 return cached_location
671 # Move into the cache. Given that multiple processes might be
672 # sharing a single cache directory, and the file we need might have
673 # been copied in whilst we were checking, allow overwrite without
674 # complaint. Even for a private cache directory it is possible that
675 # a second butler in a subprocess could be writing to it.
676 cached_location.transfer_from(uri, transfer="move", overwrite=True)
677 log.debug("Cached dataset %s to %s", ref, cached_location)
679 self._register_cache_entry(cached_location)
681 return cached_location
683 @contextlib.contextmanager
684 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
685 # Docstring inherited
686 # Short circuit this if the cache directory has not been created yet.
687 if self._cache_directory is None:
688 yield None
689 return
691 cached_location = self._construct_cache_name(ref, extension)
692 if cached_location.exists():
693 log.debug("Found cached file %s for dataset %s.", cached_location, ref)
695 # The cached file could be removed by another process doing
696 # cache expiration so we need to protect against that by making
697 # a copy in a different tree. Use hardlinks to ensure that
698 # we either have the cached file or we don't. This is robust
699 # against race conditions that can be caused by using soft links
700 # and the other end of the link being deleted just after it
701 # is created.
702 path_in_cache = cached_location.relative_to(self.cache_directory)
703 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"
705 # Need to use a unique file name for the temporary location to
706 # ensure that two different processes can read the file
707 # simultaneously without one of them deleting it when it's in
708 # use elsewhere. Retain the original filename for easier debugging.
709 random = str(uuid.uuid4())[:8]
710 basename = cached_location.basename()
711 filename = f"{random}-{basename}"
713 temp_location: ResourcePath | None = self._temp_exempt_directory.join(filename)
714 try:
715 if temp_location is not None:
716 temp_location.transfer_from(cached_location, transfer="hardlink")
717 except Exception as e:
718 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e)
719 # Any failure will be treated as if the file was not
720 # in the cache. Yielding the original cache location
721 # is too dangerous.
722 temp_location = None
724 try:
725 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
726 yield temp_location
727 finally:
728 try:
729 if temp_location:
730 temp_location.remove()
731 except FileNotFoundError:
732 pass
733 return
735 log.debug("Dataset %s not found in cache.", ref)
736 yield None
737 return
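# Editorial example (not part of the module): a minimal sketch of reading via
# the cache with the context manager, falling back to a remote URI on a miss.
# "manager", "ref" and "remote_uri" are assumed to exist in the calling code.
#
#     with manager.find_in_cache(ref, ".fits") as cached_uri:
#         source = cached_uri if cached_uri is not None else remote_uri
#         payload = source.read()  # read inside the block: the temporary
#                                  # hardlink is removed when the context exits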
739 def remove_from_cache(self, refs: DatasetRef | Iterable[DatasetRef]) -> None:
740 # Docstring inherited.
742 # Stop early if there are no cache entries anyhow.
743 if len(self._cache_entries) == 0:
744 return
746 if isinstance(refs, DatasetRef):
747 refs = [refs]
749 # Create a set of all the IDs
750 all_ids = {ref.id for ref in refs}
752 keys_to_remove = []
753 for key, entry in self._cache_entries.items():
754 if entry.ref in all_ids:
755 keys_to_remove.append(key)
756 self._remove_from_cache(keys_to_remove)
758 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str | None:
759 """Record the file in the cache registry.
761 Parameters
762 ----------
763 cached_location : `lsst.resources.ResourcePath`
764 Location of the file to be registered.
765 can_exist : `bool`, optional
766 If `True` the item being registered can already be listed.
767 This can allow a cache refresh to run without checking the
768 file again. If `False` it is an error for the registry to
769 already know about this file.
771 Returns
772 -------
773 cache_key : `str` or `None`
774 The key used in the registry for this file. `None` if the file
775 no longer exists (it could have been expired by another process).
776 """
777 path_in_cache = cached_location.relative_to(self.cache_directory)
778 if path_in_cache is None:
779 raise ValueError(
780 f"Can not register cached file {cached_location} that is not within"
781 f" the cache directory at {self.cache_directory}."
782 )
783 if path_in_cache in self._cache_entries:
784 if can_exist:
785 return path_in_cache
786 else:
787 raise ValueError(
788 f"Cached file {cached_location} is already known to the registry"
789 " but this was expected to be a new file."
790 )
791 try:
792 details = CacheEntry.from_file(cached_location, root=self.cache_directory)
793 except FileNotFoundError:
794 return None
795 self._cache_entries[path_in_cache] = details
796 return path_in_cache
798 def scan_cache(self) -> None:
799 """Scan the cache directory and record information about files."""
800 found = set()
801 for file in ResourcePath.findFileResources([self.cache_directory]):
802 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"
804 # Skip any that are found in an exempt part of the hierarchy
805 # since they should not be part of the registry.
806 if file.relative_to(self._temp_exempt_directory) is not None:
807 continue
809 path_in_cache = self._register_cache_entry(file, can_exist=True)
810 if path_in_cache:
811 found.add(path_in_cache)
813 # Find any files that were recorded in the cache but are no longer
814 # on disk. (something else cleared them out?)
815 known_to_cache = set(self._cache_entries)
816 missing = known_to_cache - found
818 if missing:
819 log.debug(
820 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing)
821 )
822 for path_in_cache in missing:
823 self._cache_entries.pop(path_in_cache, None)
825 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
826 """Report if the dataset is known to the cache.
828 Parameters
829 ----------
830 ref : `DatasetRef`
831 Dataset to check for in the cache.
832 extension : `str`, optional
833 File extension expected. Should include the leading "``.``".
834 If `None` the extension is ignored and the dataset ID alone is
835 used to check in the cache. The extension must be defined if
836 a specific component is being checked.
838 Returns
839 -------
840 known : `bool`
841 Returns `True` if the dataset is currently known to the cache
842 and `False` otherwise. If the dataset refers to a component and
843 an extension is given then only that component is checked.
845 Notes
846 -----
847 This method can only report if the dataset is known to the cache
848 in this specific instant and does not indicate whether the file
849 can be read from the cache later. `find_in_cache()` should be called
850 if the cached file is to be used.
852 This method does not force the cache to be re-scanned and so can miss
853 cached datasets that have recently been written by other processes.
854 """
855 if self._cache_directory is None:
856 return False
857 if self.file_count == 0:
858 return False
860 if extension is None:
861 # Look solely for matching dataset ref ID and not specific
862 # components.
863 cached_paths = self._cache_entries.get_dataset_keys(ref.id)
864 return bool(cached_paths)
866 else:
867 # Extension is known so we can do an explicit look up for the
868 # cache entry.
869 cached_location = self._construct_cache_name(ref, extension)
870 path_in_cache = cached_location.relative_to(self.cache_directory)
871 assert path_in_cache is not None # For mypy
872 return path_in_cache in self._cache_entries
874 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
875 """Remove the specified cache entries from cache.
877 Parameters
878 ----------
879 cache_entries : iterable of `str`
880 The entries to remove from the cache. The values are the path
881 within the cache.
882 """
883 for entry in cache_entries:
884 path = self.cache_directory.join(entry)
886 self._cache_entries.pop(entry, None)
887 log.debug("Removing file from cache: %s", path)
888 with contextlib.suppress(FileNotFoundError):
889 path.remove()
891 def _expire_cache(self) -> None:
892 """Expire the files in the cache.
894 Notes
895 -----
896 The expiration modes are defined by the config or can be overridden.
897 Available options:
899 * ``files``: Number of files.
900 * ``datasets``: Number of datasets.
901 * ``size``: Total size of files.
902 * ``age``: Age of files.
904 The first three remove the oldest entries first.
905 Number of files is complicated by the possibility of disassembled
906 composites where 10 small files can be created for each dataset.
908 Additionally, there is a use case for an external user to explicitly
909 state the dataset refs that should be cached and then decide when to
910 remove them, overriding any global configuration.
911 """
912 if self._expiration_mode is None:
913 # Expiration has been disabled.
914 return
916 # mypy can't be sure we have set a threshold properly
917 if self._expiration_threshold is None:
918 log.warning(
919 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode
920 )
921 return
923 # Sync up cache. There is no file locking involved so for a shared
924 # cache multiple processes may be racing to delete files. Deleting
925 # a file that no longer exists is not an error.
926 self.scan_cache()
928 if self._expiration_mode == "files":
929 n_files = len(self._cache_entries)
930 n_over = n_files - self._expiration_threshold
931 if n_over > 0:
932 sorted_keys = self._sort_cache()
933 keys_to_remove = sorted_keys[:n_over]
934 self._remove_from_cache(keys_to_remove)
935 return
937 if self._expiration_mode == "datasets":
938 # Count the datasets, in ascending timestamp order,
939 # so that oldest turn up first.
940 datasets = defaultdict(list)
941 for key in self._sort_cache():
942 entry = self._cache_entries[key]
943 datasets[entry.ref].append(key)
945 n_datasets = len(datasets)
946 n_over = n_datasets - self._expiration_threshold
947 if n_over > 0:
948 # Keys will be read out in insert order which
949 # will be date order so oldest ones are removed.
950 ref_ids = list(datasets.keys())[:n_over]
951 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
952 self._remove_from_cache(keys_to_remove)
953 return
955 if self._expiration_mode == "size":
956 if self.cache_size > self._expiration_threshold:
957 for key in self._sort_cache():
958 self._remove_from_cache([key])
959 if self.cache_size <= self._expiration_threshold:
960 break
961 return
963 if self._expiration_mode == "age":
964 now = datetime.datetime.utcnow()
965 for key in self._sort_cache():
966 delta = now - self._cache_entries[key].ctime
967 if delta.seconds > self._expiration_threshold:
968 self._remove_from_cache([key])
969 else:
970 # We're already in date order.
971 break
972 return
974 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")
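# Editorial note (not part of the module): the threshold's units follow the
# mode -- "files" and "datasets" count entries, "size" is bytes, and "age" is
# seconds since the cached file was created. The same applies to the
# environment override, e.g.:
#
#     >>> os.environ["DAF_BUTLER_CACHE_EXPIRATION_MODE"] = "age=3600"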
976 def _sort_cache(self) -> list[str]:
977 """Sort the cache entries by time and return the sorted keys.
979 Returns
980 -------
981 sorted : `list` of `str`
982 Keys into the cache, sorted by time with oldest first.
983 """
985 def sort_by_time(key: str) -> datetime.datetime:
986 """Sorter key function using cache entry details."""
987 return self._cache_entries[key].ctime
989 return sorted(self._cache_entries, key=sort_by_time)
991 def __str__(self) -> str:
992 cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
993 return (
994 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
995 f"default={self._caching_default}) "
996 f"n_files={self.file_count}, n_bytes={self.cache_size}"
997 )
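# Editorial example (not part of the module): a minimal end-to-end sketch of
# caching a freshly written file and checking for it later. "config",
# "universe" and "ref" are assumed to come from the datastore configuration;
# the local path is arbitrary, and move_to_cache() moves (not copies) it.
#
#     manager = DatastoreCacheManager(config, universe=universe)
#     cached = manager.move_to_cache(ResourcePath("/tmp/output.fits"), ref)
#     if cached is None:
#         ...  # the ref's storage class is not configured as cacheable
#     manager.known_to_cache(ref, extension=".fits")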
1000class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
1001 """A variant of the datastore cache where no cache is enabled.
1003 Parameters
1004 ----------
1005 config : `str` or `DatastoreCacheManagerConfig`
1006 Configuration to control caching.
1007 universe : `DimensionUniverse`
1008 Set of all known dimensions, used to expand and validate any used
1009 in lookup keys.
1010 """
1012 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
1013 return
1015 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
1016 """Indicate whether the entity should be added to the cache.
1018 Always returns `False`.
1019 """
1020 return False
1022 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
1023 """Move dataset to cache but always refuse and returns `None`."""
1024 return None
1026 @contextlib.contextmanager
1027 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
1028 """Look for a dataset in the cache and return its location.
1030 Never finds a file.
1031 """
1032 yield None
1034 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None:
1035 """Remove datasets from cache.
1037 Always does nothing.
1038 """
1039 return
1041 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
1042 """Report if a dataset is known to the cache.
1044 Always returns `False`.
1045 """
1046 return False
1048 def __str__(self) -> str:
1049 return f"{type(self).__name__}()"