Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 23%
394 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Cache management for a datastore."""

__all__ = (
    "AbstractDatastoreCacheManager",
    "DatastoreDisabledCacheManager",
    "DatastoreCacheManager",
    "DatastoreCacheManagerConfig",
)

import atexit
import contextlib
import datetime
import itertools
import logging
import os
import shutil
import tempfile
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict
from random import Random
from typing import (
    TYPE_CHECKING,
    Dict,
    ItemsView,
    Iterable,
    Iterator,
    KeysView,
    List,
    Optional,
    Union,
    ValuesView,
)

from lsst.resources import ResourcePath
from pydantic import BaseModel, PrivateAttr

from .config import ConfigSubset
from .configSupport import processLookupConfigs
from .datasets import DatasetId, DatasetRef

if TYPE_CHECKING:
    from .configSupport import LookupKey
    from .datasets import DatasetType
    from .dimensions import DimensionUniverse
    from .storageClass import StorageClass

log = logging.getLogger(__name__)


def remove_cache_directory(directory: str) -> None:
    """Remove the specified directory and all its contents."""
    log.debug("Removing temporary cache directory %s", directory)
    shutil.rmtree(directory, ignore_errors=True)


def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath:
    """Construct the full path to use for this dataset in the cache.

    Parameters
    ----------
    root : `lsst.resources.ResourcePath`
        Root of the cache directory.
    ref : `DatasetRef`
        The dataset to look up in or write to the cache.
    extension : `str`
        File extension to use for this file. Should include the
        leading "``.``".

    Returns
    -------
    uri : `lsst.resources.ResourcePath`
        URI to use for this dataset in the cache.
    """
    # Dataset type component is needed in the name if composite
    # disassembly is happening since the ID is shared for all components.
    component = ref.datasetType.component()
    component = f"_{component}" if component else ""
    return root.join(f"{ref.id}{component}{extension}")


def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]:
    """For a given cache name, return its component parts.

    Changes to ``_construct_cache_path()`` should be reflected here.

    Parameters
    ----------
    cached_location : `str`
        The name of the file within the cache.

    Returns
    -------
    id : `uuid.UUID`
        The dataset ID.
    component : `str` or `None`
        The name of the component, if present.
    extension : `str` or `None`
        The file extension, if present.
    """
    # Assume first dot is the extension and so allow .fits.gz
    root_ext = cached_location.split(".", maxsplit=1)
    root = root_ext.pop(0)
    ext = "." + root_ext.pop(0) if root_ext else None

    parts = root.split("_")
    id_ = uuid.UUID(parts.pop(0))
    component = parts.pop(0) if parts else None
    return id_, component, ext
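
# A minimal sketch of the naming round trip (the UUID is arbitrary and the
# "wcs" component and ".fits.gz" extension are illustrative): a name built
# by _construct_cache_path() parses back into its parts.
#
#   >>> _parse_cache_name("0cd99adc-7454-4017-a96b-bc975a5fdaa9_wcs.fits.gz")
#   (UUID('0cd99adc-7454-4017-a96b-bc975a5fdaa9'), 'wcs', '.fits.gz')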


class CacheEntry(BaseModel):
    """Represent an entry in the cache."""

    name: str
    """Name of the file."""

    size: int
    """Size of the file in bytes."""

    ctime: datetime.datetime
    """Creation time of the file."""

    ref: DatasetId
    """ID of this dataset."""

    component: Optional[str]
    """Component for this disassembled composite (optional)."""

    @classmethod
    def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry:
        """Construct an object from a file name.

        Parameters
        ----------
        file : `lsst.resources.ResourcePath`
            Path to the file.
        root : `lsst.resources.ResourcePath`
            Cache root directory.
        """
        file_in_cache = file.relative_to(root)
        if file_in_cache is None:
            raise ValueError(f"Supplied file {file} is not inside root {root}")
        id_, component, _ = _parse_cache_name(file_in_cache)

        stat = os.stat(file.ospath)
        return cls(
            name=file_in_cache,
            size=stat.st_size,
            ref=id_,
            component=component,
            ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime),
        )


class _MarkerEntry(CacheEntry):
    pass


class CacheRegistry(BaseModel):
    """Collection of cache entries."""

    _size: int = PrivateAttr(0)
    """Size of the cache."""

    _entries: Dict[str, CacheEntry] = PrivateAttr({})
    """Internal collection of cache entries."""

    _ref_map: Dict[DatasetId, List[str]] = PrivateAttr({})
    """Mapping of DatasetID to corresponding keys in cache registry."""

    @property
    def cache_size(self) -> int:
        return self._size

    def __getitem__(self, key: str) -> CacheEntry:
        return self._entries[key]

    def __setitem__(self, key: str, entry: CacheEntry) -> None:
        self._size += entry.size
        self._entries[key] = entry

        # Update the mapping from ref to path.
        if entry.ref not in self._ref_map:
            self._ref_map[entry.ref] = []
        self._ref_map[entry.ref].append(key)

    def __delitem__(self, key: str) -> None:
        entry = self._entries.pop(key)
        self._decrement(entry)
        self._ref_map[entry.ref].remove(key)

    def _decrement(self, entry: Optional[CacheEntry]) -> None:
        if entry:
            self._size -= entry.size
            if self._size < 0:
                log.warning("Cache size has gone negative. Inconsistent cache records...")
                self._size = 0

    def __contains__(self, key: str) -> bool:
        return key in self._entries

    def __len__(self) -> int:
        return len(self._entries)

    def __iter__(self) -> Iterator[str]:  # type: ignore
        return iter(self._entries)

    def keys(self) -> KeysView[str]:
        return self._entries.keys()

    def values(self) -> ValuesView[CacheEntry]:
        return self._entries.values()

    def items(self) -> ItemsView[str, CacheEntry]:
        return self._entries.items()

    # A private marker to indicate that pop() should raise if no default
    # is given.
    __marker = _MarkerEntry(
        name="marker",
        size=0,
        ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"),
        ctime=datetime.datetime.utcfromtimestamp(0),
    )

    def pop(self, key: str, default: Optional[CacheEntry] = __marker) -> Optional[CacheEntry]:
        # The marker for dict.pop is not the same as our marker.
        if default is self.__marker:
            entry = self._entries.pop(key)
        else:
            entry = self._entries.pop(key, self.__marker)
            # Should not attempt to correct for this entry being removed
            # if we got the default value.
            if entry is self.__marker:
                return default

        self._decrement(entry)
        # The default entry given to this method may not even be in the cache.
        if entry and entry.ref in self._ref_map:
            keys = self._ref_map[entry.ref]
            if key in keys:
                keys.remove(key)
        return entry

    def get_dataset_keys(self, dataset_id: Optional[DatasetId]) -> Optional[List[str]]:
        """Retrieve all keys associated with the given dataset ID.

        Parameters
        ----------
        dataset_id : `DatasetId` or `None`
            The dataset ID to look up. Returns `None` if the ID is `None`.

        Returns
        -------
        keys : `list` [`str`]
            Keys associated with this dataset. These keys can be used to
            look up the cache entry information in the `CacheRegistry`.
            Returns `None` if the dataset is not known to the cache.
        """
        if dataset_id not in self._ref_map:
            return None
        keys = self._ref_map[dataset_id]
        if not keys:
            return None
        return keys
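
# A minimal sketch of how CacheRegistry tracks the total cache size and the
# dataset-ID-to-key mapping (the UUID, file name and size are arbitrary
# illustrative values):
#
#   >>> registry = CacheRegistry()
#   >>> dataset_id = uuid.UUID("11111111-2222-3333-4444-555555555555")
#   >>> key = f"{dataset_id}.fits"
#   >>> registry[key] = CacheEntry(
#   ...     name=key, size=1024, ctime=datetime.datetime.utcnow(),
#   ...     ref=dataset_id, component=None,
#   ... )
#   >>> registry.cache_size
#   1024
#   >>> registry.get_dataset_keys(dataset_id)
#   ['11111111-2222-3333-4444-555555555555.fits']
#   >>> _ = registry.pop(key)
#   >>> registry.cache_size
#   0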


class DatastoreCacheManagerConfig(ConfigSubset):
    """Configuration information for `DatastoreCacheManager`."""

    component = "cached"
    requiredKeys = ("cacheable",)


class AbstractDatastoreCacheManager(ABC):
    """An abstract base class for managing caching in a Datastore.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    @property
    def cache_size(self) -> int:
        """Size of the cache in bytes."""
        return 0

    @property
    def file_count(self) -> int:
        """Return number of cached files tracked by registry."""
        return 0

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        if not isinstance(config, DatastoreCacheManagerConfig):
            config = DatastoreCacheManagerConfig(config)
        assert isinstance(config, DatastoreCacheManagerConfig)
        self.config = config

    @abstractmethod
    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        This is relevant when reading or writing.

        Parameters
        ----------
        entity : `StorageClass` or `DatasetType` or `DatasetRef`
            Thing to test against the configuration. The ``name`` property
            is used to determine a match. A `DatasetType` will first check
            its name, before checking its `StorageClass`. If there are no
            matches the default will be returned.

        Returns
        -------
        should_cache : `bool`
            Returns `True` if the dataset should be cached; `False` otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.
        """
        raise NotImplementedError()

    @abstractmethod
    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move a file to the cache.

        Move the given file into the cache, using the supplied DatasetRef
        for naming. A call is made to `should_be_cached()`; if the
        DatasetRef is not accepted by the cache, `None` is returned.

        Cache expiry can occur during this call.

        Parameters
        ----------
        uri : `lsst.resources.ResourcePath`
            Location of the file to be relocated to the cache. Will be moved.
        ref : `DatasetRef`
            Ref associated with this file. Will be used to determine the name
            of the file within the cache.

        Returns
        -------
        new : `lsst.resources.ResourcePath` or `None`
            URI to the file within the cache, or `None` if the dataset
            was not accepted by the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to locate in the cache.
        extension : `str`
            File extension expected. Should include the leading "``.``".

        Yields
        ------
        uri : `lsst.resources.ResourcePath` or `None`
            The URI to the cached file, or `None` if the file has not been
            cached.

        Notes
        -----
        Should be used as a context manager in order to prevent this
        file from being removed from the cache for that context.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove the specified datasets from the cache.

        It is not an error for these datasets to be missing from the cache.

        Parameters
        ----------
        ref : `DatasetRef` or iterable of `DatasetRef`
            The datasets to remove from the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    def __str__(self) -> str:
        raise NotImplementedError()


class DatastoreCacheManager(AbstractDatastoreCacheManager):
    """A class for managing caching in a Datastore using local files.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.

    Notes
    -----
    Two environment variables can be used to override the cache directory
    and expiration configuration:

    * ``$DAF_BUTLER_CACHE_DIRECTORY``
    * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``

    The expiration mode should take the form ``mode=threshold``; for example,
    to limit the cache directory to 5 datasets the value would be
    ``datasets=5``.

    Additionally, the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment
    variable can be used to indicate that this directory should be used
    if no explicit directory has been specified from configuration or from
    the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable.
    """

    _temp_exemption_prefix = "exempt/"
    _tmpdir_prefix = "butler-cache-dir-"

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        super().__init__(config, universe)

        # Set cache directory if it pre-exists, else defer creation until
        # requested. Allow external override from environment.
        root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")

        # Allow the execution environment to override the default values
        # so long as no default value has been set from the line above.
        if root is None:
            root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET")

        self._cache_directory = (
            ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None
        )

        if self._cache_directory:
            if not self._cache_directory.isLocal:
                raise ValueError(
                    f"Cache directory must be on a local file system. Got: {self._cache_directory}"
                )
            # Ensure that the cache directory is created. We assume that
            # someone specifying a permanent cache directory will be expecting
            # it to always be there. This will also trigger an error
            # early rather than waiting until the cache is needed.
            self._cache_directory.mkdir()

        # Calculate the caching lookup table.
        self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)

        # Default decision for whether a dataset should be cached.
        self._caching_default = self.config.get("default", False)

        # Expiration mode. Read from config but allow override from
        # the environment.
        expiration_mode = self.config.get(("expiry", "mode"))
        threshold = self.config.get(("expiry", "threshold"))

        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
        if external_mode and "=" in external_mode:
            expiration_mode, expiration_threshold = external_mode.split("=", 1)
            threshold = int(expiration_threshold)
        if expiration_mode is None:
            # Force to None to avoid confusion.
            threshold = None

        self._expiration_mode: Optional[str] = expiration_mode
        self._expiration_threshold: Optional[int] = threshold
        if self._expiration_threshold is None and self._expiration_mode is not None:
            raise ValueError(
                f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
            )

        log.debug(
            "Cache configuration:\n- root: %s\n- expiration mode: %s",
            self._cache_directory if self._cache_directory else "tmpdir",
            f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
        )

        # Files in cache, indexed by path within the cache directory.
        self._cache_entries = CacheRegistry()

    @property
    def cache_directory(self) -> ResourcePath:
        if self._cache_directory is None:
            # Create on demand. Allow the override environment variable
            # to be used in case it got set after this object was created
            # but before a cache was used.
            if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
                # Someone else will clean this up.
                isTemporary = False
                msg = "deferred fallback"
            else:
                cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix)
                isTemporary = True
                msg = "temporary"

            self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary)
            log.debug("Using %s cache directory at %s", msg, self._cache_directory)

            # Remove when we no longer need it.
            if isTemporary:
                atexit.register(remove_cache_directory, self._cache_directory.ospath)
        return self._cache_directory

    @property
    def _temp_exempt_directory(self) -> ResourcePath:
        """Return the directory in which to store temporary cache files that
        should not be expired.
        """
        return self.cache_directory.join(self._temp_exemption_prefix)

    @property
    def cache_size(self) -> int:
        return self._cache_entries.cache_size

    @property
    def file_count(self) -> int:
        return len(self._cache_entries)

    @classmethod
    def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]:
        """Define a fallback cache directory if one has not been set already.

        Returns
        -------
        defined : `bool`
            `True` if the fallback directory was newly defined in this method.
            `False` if it had already been set.
        cache_dir : `str`
            Returns the path to the cache directory that will be used if it's
            needed. This can allow the caller to run a directory cleanup
            when it's no longer needed (something that the cache manager
            can not do because forks should not clean up directories defined
            by the parent process).

        Notes
        -----
        The fallback directory will not be defined if one has already been
        defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET``
        environment variable only if a value has not previously been stored
        in that environment variable. Setting the environment variable allows
        this value to survive into spawned subprocesses. Calling this method
        will lead to all subsequently created cache managers sharing the same
        cache.
        """
        if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
            # A value has already been set.
            return (False, cache_dir)

        # As a class method, we do not know at this point whether a cache
        # directory will be needed so it would be impolite to create a
        # directory that will never be used.

        # Construct our own temp name -- 16 characters should have a fairly
        # low chance of clashing when combined with the process ID.
        characters = "abcdefghijklmnopqrstuvwxyz0123456789_"
        rng = Random()
        tempchars = "".join(rng.choice(characters) for _ in range(16))

        tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}"

        cache_dir = os.path.join(tempfile.gettempdir(), tempname)
        os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir
        return (True, cache_dir)
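
    # A usage sketch (the calling code and cleanup policy are assumptions):
    # a parent process defines the fallback before starting subprocesses and
    # removes the directory itself once the work is done, since the cache
    # manager will not clean up a directory defined by the parent process.
    #
    #   defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
    #   try:
    #       ...  # launch subprocesses that may construct cache managers
    #   finally:
    #       if defined:
    #           shutil.rmtree(cache_dir, ignore_errors=True)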

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        # Docstring inherited
        matchName: Union[LookupKey, str] = "{} (via default)".format(entity)
        should_cache = self._caching_default

        for key in entity._lookupNames():
            if key in self._lut:
                should_cache = bool(self._lut[key])
                matchName = key
                break

        if not isinstance(should_cache, bool):
            raise TypeError(
                f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
            )

        log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
        return should_cache

    def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath:
        """Construct the name to use for this dataset in the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to look up in or write to the cache.
        extension : `str`
            File extension to use for this file. Should include the
            leading "``.``".

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI to use for this dataset in the cache.
        """
        return _construct_cache_path(self.cache_directory, ref, extension)

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        # Docstring inherited
        if not self.should_be_cached(ref):
            return None

        # Write the file using the id of the dataset ref and the file
        # extension.
        cached_location = self._construct_cache_name(ref, uri.getExtension())

        # Run cache expiry to ensure that we have room for this
        # item.
        self._expire_cache()

        # The above reset the in-memory cache status. It's entirely possible
        # that another process has just cached this file (if multiple
        # processes are caching on read), so check our in-memory cache
        # before attempting to cache the dataset.
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache and path_in_cache in self._cache_entries:
            return cached_location

        # Move into the cache. Given that multiple processes might be
        # sharing a single cache directory, and the file we need might have
        # been copied in whilst we were checking, allow overwrite without
        # complaint. Even for a private cache directory it is possible that
        # a second butler in a subprocess could be writing to it.
        cached_location.transfer_from(uri, transfer="move", overwrite=True)
        log.debug("Cached dataset %s to %s", ref, cached_location)

        self._register_cache_entry(cached_location)

        return cached_location

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        # Docstring inherited
        # Short circuit this if the cache directory has not been created yet.
        if self._cache_directory is None:
            yield None
            return

        cached_location = self._construct_cache_name(ref, extension)
        if cached_location.exists():
            log.debug("Found cached file %s for dataset %s.", cached_location, ref)

            # The cached file could be removed by another process doing
            # cache expiration so we need to protect against that by making
            # a copy in a different tree. Use hardlinks to ensure that
            # we either have the cached file or we don't. This is robust
            # against race conditions that can be caused by using soft links
            # and the other end of the link being deleted just after it
            # is created.
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"

            # Need to use a unique file name for the temporary location to
            # ensure that two different processes can read the file
            # simultaneously without one of them deleting it when it's in
            # use elsewhere. Retain the original filename for easier debugging.
            random = str(uuid.uuid4())[:8]
            basename = cached_location.basename()
            filename = f"{random}-{basename}"

            temp_location: Optional[ResourcePath] = self._temp_exempt_directory.join(filename)
            try:
                if temp_location is not None:
                    temp_location.transfer_from(cached_location, transfer="hardlink")
            except Exception as e:
                log.debug("Detected error creating hardlink for dataset %s: %s", ref, e)
                # Any failure will be treated as if the file was not
                # in the cache. Yielding the original cache location
                # is too dangerous.
                temp_location = None

            try:
                log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
                yield temp_location
            finally:
                try:
                    if temp_location:
                        temp_location.remove()
                except FileNotFoundError:
                    pass
            return

        log.debug("Dataset %s not found in cache.", ref)
        yield None
        return
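
    # A usage sketch (``cache_manager`` and ``ref`` are assumed to exist and
    # the extension is illustrative): keeping the ``with`` block open protects
    # the hardlinked copy from concurrent cache expiry while it is being read.
    #
    #   with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
    #       if cached_uri is not None:
    #           data = cached_uri.read()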

    def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        # Docstring inherited.

        # Stop early if there are no cache entries anyhow.
        if len(self._cache_entries) == 0:
            return

        if isinstance(refs, DatasetRef):
            refs = [refs]

        # Create a set of all the IDs
        all_ids = {ref.id for ref in refs}

        keys_to_remove = []
        for key, entry in self._cache_entries.items():
            if entry.ref in all_ids:
                keys_to_remove.append(key)
        self._remove_from_cache(keys_to_remove)

    def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> Optional[str]:
        """Record the file in the cache registry.

        Parameters
        ----------
        cached_location : `lsst.resources.ResourcePath`
            Location of the file to be registered.
        can_exist : `bool`, optional
            If `True` the item being registered can already be listed.
            This can allow a cache refresh to run without checking the
            file again. If `False` it is an error for the registry to
            already know about this file.

        Returns
        -------
        cache_key : `str` or `None`
            The key used in the registry for this file. `None` if the file
            no longer exists (it could have been expired by another process).
        """
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache is None:
            raise ValueError(
                f"Can not register cached file {cached_location} that is not within"
                f" the cache directory at {self.cache_directory}."
            )
        if path_in_cache in self._cache_entries:
            if can_exist:
                return path_in_cache
            else:
                raise ValueError(
                    f"Cached file {cached_location} is already known to the registry"
                    " but this was expected to be a new file."
                )
        try:
            details = CacheEntry.from_file(cached_location, root=self.cache_directory)
        except FileNotFoundError:
            return None
        self._cache_entries[path_in_cache] = details
        return path_in_cache

    def scan_cache(self) -> None:
        """Scan the cache directory and record information about files."""
        found = set()
        for file in ResourcePath.findFileResources([self.cache_directory]):
            assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"

            # Skip any that are found in an exempt part of the hierarchy
            # since they should not be part of the registry.
            if file.relative_to(self._temp_exempt_directory) is not None:
                continue

            path_in_cache = self._register_cache_entry(file, can_exist=True)
            if path_in_cache:
                found.add(path_in_cache)

        # Find any files that were recorded in the cache but are no longer
        # on disk. (something else cleared them out?)
        known_to_cache = set(self._cache_entries)
        missing = known_to_cache - found

        if missing:
            log.debug(
                "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing)
            )
            for path_in_cache in missing:
                self._cache_entries.pop(path_in_cache, None)

    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise. If the dataset refers to a component and
            an extension is given then only that component is checked.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.

        This method does not force the cache to be re-scanned and so can miss
        cached datasets that have recently been written by other processes.
        """
        if self._cache_directory is None:
            return False
        if self.file_count == 0:
            return False

        if extension is None:
            # Look solely for matching dataset ref ID and not specific
            # components.
            cached_paths = self._cache_entries.get_dataset_keys(ref.id)
            return True if cached_paths else False

        else:
            # Extension is known so we can do an explicit look up for the
            # cache entry.
            cached_location = self._construct_cache_name(ref, extension)
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None  # For mypy
            return path_in_cache in self._cache_entries

    def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
        """Remove the specified cache entries from cache.

        Parameters
        ----------
        cache_entries : iterable of `str`
            The entries to remove from the cache. The values are the path
            within the cache.
        """
        for entry in cache_entries:
            path = self.cache_directory.join(entry)

            self._cache_entries.pop(entry, None)
            log.debug("Removing file from cache: %s", path)
            try:
                path.remove()
            except FileNotFoundError:
                pass

    def _expire_cache(self) -> None:
        """Expire the files in the cache.

        Notes
        -----
        The expiration modes are defined by the config or can be overridden.
        Available options:

        * ``files``: Number of files.
        * ``datasets``: Number of datasets.
        * ``size``: Total size of files.
        * ``age``: Age of files.

        The first three would remove in reverse time order.
        Number of files is complicated by the possibility of disassembled
        composites where 10 small files can be created for each dataset.

        Additionally there is a use case for an external user to explicitly
        state the dataset refs that should be cached and then when to
        remove them, overriding any global configuration.
        """
        if self._expiration_mode is None:
            # Expiration has been disabled.
            return

        # mypy can't be sure we have set a threshold properly
        if self._expiration_threshold is None:
            log.warning(
                "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode
            )
            return

        # Sync up cache. There is no file locking involved so for a shared
        # cache multiple processes may be racing to delete files. Deleting
        # a file that no longer exists is not an error.
        self.scan_cache()

        if self._expiration_mode == "files":
            n_files = len(self._cache_entries)
            n_over = n_files - self._expiration_threshold
            if n_over > 0:
                sorted_keys = self._sort_cache()
                keys_to_remove = sorted_keys[:n_over]
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "datasets":
            # Count the datasets, in ascending timestamp order,
            # so that oldest turn up first.
            datasets = defaultdict(list)
            for key in self._sort_cache():
                entry = self._cache_entries[key]
                datasets[entry.ref].append(key)

            n_datasets = len(datasets)
            n_over = n_datasets - self._expiration_threshold
            if n_over > 0:
                # Keys will be read out in insert order which
                # will be date order so oldest ones are removed.
                ref_ids = list(datasets.keys())[:n_over]
                keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "size":
            if self.cache_size > self._expiration_threshold:
                for key in self._sort_cache():
                    self._remove_from_cache([key])
                    if self.cache_size <= self._expiration_threshold:
                        break
            return

        if self._expiration_mode == "age":
            now = datetime.datetime.utcnow()
            for key in self._sort_cache():
                delta = now - self._cache_entries[key].ctime
                if delta.seconds > self._expiration_threshold:
                    self._remove_from_cache([key])
                else:
                    # We're already in date order.
                    break
            return

        raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")

    def _sort_cache(self) -> List[str]:
        """Sort the cache entries by time and return the sorted keys.

        Returns
        -------
        sorted : `list` of `str`
            Keys into the cache, sorted by time with oldest first.
        """

        def sort_by_time(key: str) -> datetime.datetime:
            """Sorter key function using cache entry details."""
            return self._cache_entries[key].ctime

        return sorted(self._cache_entries, key=sort_by_time)

    def __str__(self) -> str:
        cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
        return (
            f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
            f"default={self._caching_default}) "
            f"n_files={self.file_count}, n_bytes={self.cache_size}"
        )
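
# A minimal write-side sketch (``cache_manager``, ``uri`` and ``ref`` are
# assumed to already exist): ``move_to_cache`` consults the configuration,
# runs expiry and then relocates the file, returning `None` when the dataset
# is not accepted.
#
#   cached = cache_manager.move_to_cache(uri, ref)
#   if cached is not None:
#       # The dataset is now tracked by the in-memory registry.
#       known = cache_manager.known_to_cache(ref)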


class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
    """A variant of the datastore cache where no cache is enabled.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        return

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        Always returns `False`.
        """
        return False

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move dataset to cache but always refuse, returning `None`."""
        return None

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Never finds a file.
        """
        yield None

    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove datasets from cache.

        Always does nothing.
        """
        return

    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if a dataset is known to the cache.

        Always returns `False`.
        """
        return False

    def __str__(self) -> str:
        return f"{type(self).__name__}()"