Coverage for python/lsst/daf/butler/core/datastoreCacheManager.py: 29%
398 statements
coverage.py v7.2.7, created at 2023-07-14 19:21 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Cache management for a datastore."""
24from __future__ import annotations
26__all__ = (
27 "AbstractDatastoreCacheManager",
28 "DatastoreDisabledCacheManager",
29 "DatastoreCacheManager",
30 "DatastoreCacheManagerConfig",
31)
33import atexit
34import contextlib
35import datetime
36import itertools
37import logging
38import os
39import shutil
40import tempfile
41import uuid
42from abc import ABC, abstractmethod
43from collections import defaultdict
44from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView
45from random import Random
46from typing import TYPE_CHECKING
48from lsst.resources import ResourcePath
50try:
51 from pydantic.v1 import BaseModel, PrivateAttr
52except ModuleNotFoundError:
53 from pydantic import BaseModel, PrivateAttr # type: ignore
55from .config import ConfigSubset
56from .configSupport import processLookupConfigs
57from .datasets import DatasetId, DatasetRef
59if TYPE_CHECKING:
60 from .configSupport import LookupKey
61 from .datasets import DatasetType
62 from .dimensions import DimensionUniverse
63 from .storageClass import StorageClass
65log = logging.getLogger(__name__)
68def remove_cache_directory(directory: str) -> None:
69 """Remove the specified directory and all its contents."""
70 log.debug("Removing temporary cache directory %s", directory)
71 shutil.rmtree(directory, ignore_errors=True)
74def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath:
75 """Construct the full path to use for this dataset in the cache.
77 Parameters
78 ----------
root : `lsst.resources.ResourcePath`
Root of the cache directory in which the file will live.
79 ref : `DatasetRef`
80 The dataset to look up in or write to the cache.
81 extension : `str`
82 File extension to use for this file. Should include the
83 leading "``.``".
85 Returns
86 -------
87 uri : `lsst.resources.ResourcePath`
88 URI to use for this dataset in the cache.
89 """
90 # Dataset type component is needed in the name if composite
91 # disassembly is happening since the ID is shared for all components.
92 component = ref.datasetType.component()
93 component = f"_{component}" if component else ""
94 return root.join(f"{ref.id}{component}{extension}")
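# A minimal sketch with hypothetical values (the UUID, component name and
# extension below are assumptions, not taken from this module): for an "image"
# component the returned ResourcePath points at
#
#     <root>/2f2c0a72-3f8d-4b8a-9f4e-1d2c3b4a5e6f_image.fits
#
# and a ref with no component drops the "_image" suffix.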
97def _parse_cache_name(cached_location: str) -> tuple[uuid.UUID, str | None, str | None]:
98 """For a given cache name, return its component parts.
100 Changes to ``_construct_cache_path()`` should be reflected here.
102 Parameters
103 ----------
104 cached_location : `str`
105 The name of the file within the cache.
107 Returns
108 -------
109 id : `uuid.UUID`
110 The dataset ID.
111 component : `str` or `None`
112 The name of the component, if present.
113 extension : `str` or `None`
114 The file extension, if present.
115 """
116 # Treat everything from the first dot as the extension so that .fits.gz is preserved.
117 root_ext = cached_location.split(".", maxsplit=1)
118 root = root_ext.pop(0)
119 ext = "." + root_ext.pop(0) if root_ext else None
121 parts = root.split("_")
122 id_ = uuid.UUID(parts.pop(0))
123 component = parts.pop(0) if parts else None
124 return id_, component, ext
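# The corresponding round trip, shown as a doctest-style sketch (the UUID is
# illustrative):
#
#     >>> _parse_cache_name("2f2c0a72-3f8d-4b8a-9f4e-1d2c3b4a5e6f_image.fits.gz")
#     (UUID('2f2c0a72-3f8d-4b8a-9f4e-1d2c3b4a5e6f'), 'image', '.fits.gz')
#
# Everything after the first dot is treated as the extension, so multi-part
# extensions such as ".fits.gz" survive intact.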
127class CacheEntry(BaseModel):
128 """Represent an entry in the cache."""
130 name: str
131 """Name of the file."""
133 size: int
134 """Size of the file in bytes."""
136 ctime: datetime.datetime
137 """Creation time of the file."""
139 ref: DatasetId
140 """ID of this dataset."""
142 component: str | None = None
143 """Component for this disassembled composite (optional)."""
145 @classmethod
146 def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry:
147 """Construct an object from a file name.
149 Parameters
150 ----------
151 file : `lsst.resources.ResourcePath`
152 Path to the file.
153 root : `lsst.resources.ResourcePath`
154 Cache root directory.
155 """
156 file_in_cache = file.relative_to(root)
157 if file_in_cache is None:
158 raise ValueError(f"Supplied file {file} is not inside root {root}")
159 id_, component, _ = _parse_cache_name(file_in_cache)
161 stat = os.stat(file.ospath)
162 return cls(
163 name=file_in_cache,
164 size=stat.st_size,
165 ref=id_,
166 component=component,
167 ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime),
168 )
171class _MarkerEntry(CacheEntry):
172 pass
175class CacheRegistry(BaseModel):
176 """Collection of cache entries."""
178 _size: int = PrivateAttr(0)
179 """Size of the cache."""
181 _entries: dict[str, CacheEntry] = PrivateAttr({})
182 """Internal collection of cache entries."""
184 _ref_map: dict[DatasetId, list[str]] = PrivateAttr({})
185 """Mapping of DatasetID to corresponding keys in cache registry."""
187 @property
188 def cache_size(self) -> int:
189 return self._size
191 def __getitem__(self, key: str) -> CacheEntry:
192 return self._entries[key]
194 def __setitem__(self, key: str, entry: CacheEntry) -> None:
195 self._size += entry.size
196 self._entries[key] = entry
198 # Update the mapping from ref to path.
199 if entry.ref not in self._ref_map:
200 self._ref_map[entry.ref] = []
201 self._ref_map[entry.ref].append(key)
203 def __delitem__(self, key: str) -> None:
204 entry = self._entries.pop(key)
205 self._decrement(entry)
206 self._ref_map[entry.ref].remove(key)
208 def _decrement(self, entry: CacheEntry | None) -> None:
209 if entry:
210 self._size -= entry.size
211 if self._size < 0:
212 log.warning("Cache size has gone negative. Inconsistent cache records...")
213 self._size = 0
215 def __contains__(self, key: str) -> bool:
216 return key in self._entries
218 def __len__(self) -> int:
219 return len(self._entries)
221 def __iter__(self) -> Iterator[str]: # type: ignore
222 return iter(self._entries)
224 def keys(self) -> KeysView[str]:
225 return self._entries.keys()
227 def values(self) -> ValuesView[CacheEntry]:
228 return self._entries.values()
230 def items(self) -> ItemsView[str, CacheEntry]:
231 return self._entries.items()
233 # A private marker to indicate that pop() should raise if no default
234 # is given.
235 __marker = _MarkerEntry(
236 name="marker",
237 size=0,
238 ref=uuid.UUID("{00000000-0000-0000-0000-000000000000}"),
239 ctime=datetime.datetime.utcfromtimestamp(0),
240 )
242 def pop(self, key: str, default: CacheEntry | None = __marker) -> CacheEntry | None:
243 # The marker for dict.pop is not the same as our marker.
244 if default is self.__marker:
245 entry = self._entries.pop(key)
246 else:
247 entry = self._entries.pop(key, self.__marker)
248 # Should not attempt to correct for this entry being removed
249 # if we got the default value.
250 if entry is self.__marker:
251 return default
253 self._decrement(entry)
254 # The default entry given to this method may not even be in the cache.
255 if entry and entry.ref in self._ref_map:
256 keys = self._ref_map[entry.ref]
257 if key in keys:
258 keys.remove(key)
259 return entry
261 def get_dataset_keys(self, dataset_id: DatasetId | None) -> list[str] | None:
262 """Retrieve all keys associated with the given dataset ID.
264 Parameters
265 ----------
266 dataset_id : `DatasetId` or `None`
267 The dataset ID to look up. Returns `None` if the ID is `None`.
269 Returns
270 -------
271 keys : `list` [`str`]
272 Keys associated with this dataset. These keys can be used to look up
273 the cache entry information in the `CacheRegistry`. Returns
274 `None` if the dataset is not known to the cache.
275 """
276 if dataset_id not in self._ref_map:
277 return None
278 keys = self._ref_map[dataset_id]
279 if not keys:
280 return None
281 return keys
284class DatastoreCacheManagerConfig(ConfigSubset):
285 """Configuration information for `DatastoreCacheManager`."""
287 component = "cached"
288 requiredKeys = ("cacheable",)
291class AbstractDatastoreCacheManager(ABC):
292 """An abstract base class for managing caching in a Datastore.
294 Parameters
295 ----------
296 config : `str` or `DatastoreCacheManagerConfig`
297 Configuration to control caching.
298 universe : `DimensionUniverse`
299 Set of all known dimensions, used to expand and validate any used
300 in lookup keys.
301 """
303 @property
304 def cache_size(self) -> int:
305 """Size of the cache in bytes."""
306 return 0
308 @property
309 def file_count(self) -> int:
310 """Return number of cached files tracked by registry."""
311 return 0
313 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
314 if not isinstance(config, DatastoreCacheManagerConfig):
315 config = DatastoreCacheManagerConfig(config)
316 assert isinstance(config, DatastoreCacheManagerConfig)
317 self.config = config
319 @abstractmethod
320 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
321 """Indicate whether the entity should be added to the cache.
323 This is relevant when reading or writing.
325 Parameters
326 ----------
327 entity : `StorageClass` or `DatasetType` or `DatasetRef`
328 Thing to test against the configuration. The ``name`` property
329 is used to determine a match. A `DatasetType` will first check
330 its name, before checking its `StorageClass`. If there are no
331 matches the default will be returned.
333 Returns
334 -------
335 should_cache : `bool`
336 Returns `True` if the dataset should be cached; `False` otherwise.
337 """
338 raise NotImplementedError()
340 @abstractmethod
341 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
342 """Report if the dataset is known to the cache.
344 Parameters
345 ----------
346 ref : `DatasetRef`
347 Dataset to check for in the cache.
348 extension : `str`, optional
349 File extension expected. Should include the leading "``.``".
350 If `None` the extension is ignored and the dataset ID alone is
351 used to check in the cache. The extension must be defined if
352 a specific component is being checked.
354 Returns
355 -------
356 known : `bool`
357 Returns `True` if the dataset is currently known to the cache
358 and `False` otherwise.
360 Notes
361 -----
362 This method can only report if the dataset is known to the cache
363 in this specific instant and does not indicate whether the file
364 can be read from the cache later. `find_in_cache()` should be called
365 if the cached file is to be used.
366 """
367 raise NotImplementedError()
369 @abstractmethod
370 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
371 """Move a file to the cache.
373 Move the given file into the cache, using the supplied DatasetRef
374 for naming. A call is made to `should_be_cached()` and if the
375 DatasetRef is not accepted, `None` will be returned.
377 Cache expiry may be triggered during this call.
379 Parameters
380 ----------
381 uri : `lsst.resources.ResourcePath`
382 Location of the file to be relocated to the cache. Will be moved.
383 ref : `DatasetRef`
384 Ref associated with this file. Will be used to determine the name
385 of the file within the cache.
387 Returns
388 -------
389 new : `lsst.resources.ResourcePath` or `None`
390 URI to the file within the cache, or `None` if the dataset
391 was not accepted by the cache.
392 """
393 raise NotImplementedError()
395 @abstractmethod
396 @contextlib.contextmanager
397 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
398 """Look for a dataset in the cache and return its location.
400 Parameters
401 ----------
402 ref : `DatasetRef`
403 Dataset to locate in the cache.
404 extension : `str`
405 File extension expected. Should include the leading "``.``".
407 Yields
408 ------
409 uri : `lsst.resources.ResourcePath` or `None`
410 The URI to the cached file, or `None` if the file has not been
411 cached.
413 Notes
414 -----
415 Should be used as a context manager in order to prevent this
416 file from being removed from the cache for that context.
417 """
418 raise NotImplementedError()
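# A usage sketch (``cache_manager`` and ``ref`` are assumed to exist; handling
# of a cache miss is up to the caller):
#
#     with cache_manager.find_in_cache(ref, ".fits") as cached_uri:
#         if cached_uri is not None:
#             data = cached_uri.read()   # the file is protected while in this block
#         else:
#             ...                        # fall back to reading from the datastore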
420 @abstractmethod
421 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None:
422 """Remove the specified datasets from the cache.
424 It is not an error for these datasets to be missing from the cache.
426 Parameters
427 ----------
428 ref : `DatasetRef` or iterable of `DatasetRef`
429 The datasets to remove from the cache.
430 """
431 raise NotImplementedError()
433 @abstractmethod
434 def __str__(self) -> str:
435 raise NotImplementedError()
438class DatastoreCacheManager(AbstractDatastoreCacheManager):
439 """A class for managing caching in a Datastore using local files.
441 Parameters
442 ----------
443 config : `str` or `DatastoreCacheManagerConfig`
444 Configuration to control caching.
445 universe : `DimensionUniverse`
446 Set of all known dimensions, used to expand and validate any used
447 in lookup keys.
449 Notes
450 -----
451 Two environment variables can be used to override the cache directory
452 and expiration configuration:
454 * ``$DAF_BUTLER_CACHE_DIRECTORY``
455 * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``
457 The expiration mode should take the form ``mode=threshold``; for
458 example, to limit the cache directory to 5 datasets the value
459 would be ``datasets=5``.
461 Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment
462 variable can be used to indicate that this directory should be used
463 if no explicit directory has been specified from configuration or from
464 the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable.
465 """
467 _temp_exemption_prefix = "exempt/"
468 _tmpdir_prefix = "butler-cache-dir-"
470 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
471 super().__init__(config, universe)
473 # Set cache directory if it pre-exists, else defer creation until
474 # requested. Allow external override from environment.
475 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")
477 # Allow the execution environment to override the default values
478 # so long as no default value has been set from the line above.
479 if root is None:
480 root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET")
482 self._cache_directory = (
483 ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None
484 )
486 if self._cache_directory:
487 if not self._cache_directory.isLocal:
488 raise ValueError(
489 f"Cache directory must be on a local file system. Got: {self._cache_directory}"
490 )
491 # Ensure that the cache directory is created. We assume that
492 # someone specifying a permanent cache directory will be expecting
493 # it to always be there. This will also trigger an error
494 # early rather than waiting until the cache is needed.
495 self._cache_directory.mkdir()
497 # Calculate the caching lookup table.
498 self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)
500 # Default decision for whether a dataset should be cached.
501 self._caching_default = self.config.get("default", False)
503 # Expiration mode. Read from config but allow override from
504 # the environment.
505 expiration_mode = self.config.get(("expiry", "mode"))
506 threshold = self.config.get(("expiry", "threshold"))
508 external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
509 if external_mode and "=" in external_mode:
510 expiration_mode, expiration_threshold = external_mode.split("=", 1)
511 threshold = int(expiration_threshold)
512 if expiration_mode is None:
513 # Force to None to avoid confusion.
514 threshold = None
516 self._expiration_mode: str | None = expiration_mode
517 self._expiration_threshold: int | None = threshold
518 if self._expiration_threshold is None and self._expiration_mode is not None:
519 raise ValueError(
520 f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
521 )
523 log.debug(
524 "Cache configuration:\n- root: %s\n- expiration mode: %s",
525 self._cache_directory if self._cache_directory else "tmpdir",
526 f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
527 )
529 # Files in cache, indexed by path within the cache directory.
530 self._cache_entries = CacheRegistry()
532 @property
533 def cache_directory(self) -> ResourcePath:
534 if self._cache_directory is None:
535 # Create on demand. Allow the override environment variable
536 # to be used in case it got set after this object was created
537 # but before a cache was used.
538 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
539 # Someone else will clean this up.
540 isTemporary = False
541 msg = "deferred fallback"
542 else:
543 cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix)
544 isTemporary = True
545 msg = "temporary"
547 self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary)
548 log.debug("Using %s cache directory at %s", msg, self._cache_directory)
550 # Remove when we no longer need it.
551 if isTemporary:
552 atexit.register(remove_cache_directory, self._cache_directory.ospath)
553 return self._cache_directory
555 @property
556 def _temp_exempt_directory(self) -> ResourcePath:
557 """Return the directory in which to store temporary cache files that
558 should not be expired.
559 """
560 return self.cache_directory.join(self._temp_exemption_prefix)
562 @property
563 def cache_size(self) -> int:
564 return self._cache_entries.cache_size
566 @property
567 def file_count(self) -> int:
568 return len(self._cache_entries)
570 @classmethod
571 def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]:
572 """Define a fallback cache directory if a fallback not set already.
574 Returns
575 -------
576 defined : `bool`
577 `True` if the fallback directory was newly-defined in this method.
578 `False` if it had already been set.
579 cache_dir : `str`
580 Returns the path to the cache directory that will be used if it's
581 needed. This can allow the caller to run a directory cleanup
582 when it's no longer needed (something that the cache manager
583 can not do because forks should not clean up directories defined
584 by the parent process).
586 Notes
587 -----
588 The fallback directory will not be defined if one has already been
589 defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET``
590 environment variable only if a value has not previously been stored
591 in that environment variable. Setting the environment variable allows
592 this value to survive into spawned subprocesses. Calling this method
593 will lead to all subsequently created cache managers sharing the same
594 cache.
595 """
596 if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
597 # A value has already been set.
598 return (False, cache_dir)
600 # As a class method, we do not know at this point whether a cache
601 # directory will be needed so it would be impolite to create a
602 # directory that will never be used.
604 # Construct our own temp name -- 16 characters should have a fairly
605 # low chance of clashing when combined with the process ID.
606 characters = "abcdefghijklmnopqrstuvwxyz0123456789_"
607 rng = Random()
608 tempchars = "".join(rng.choice(characters) for _ in range(16))
610 tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}"
612 cache_dir = os.path.join(tempfile.gettempdir(), tempname)
613 os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir
614 return (True, cache_dir)
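# A usage sketch for a parent process that wants its children to share one
# fallback cache and then clean it up (the cleanup with shutil is an
# assumption, not part of this module):
#
#     defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
#     try:
#         ...  # spawn subprocesses; each cache manager they create shares cache_dir
#     finally:
#         if defined and os.path.isdir(cache_dir):
#             shutil.rmtree(cache_dir, ignore_errors=True)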
616 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
617 # Docstring inherited
618 matchName: LookupKey | str = f"{entity} (via default)"
619 should_cache = self._caching_default
621 for key in entity._lookupNames():
622 if key in self._lut:
623 should_cache = bool(self._lut[key])
624 matchName = key
625 break
627 if not isinstance(should_cache, bool):
628 raise TypeError(
629 f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
630 )
632 log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
633 return should_cache
635 def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath:
636 """Construct the name to use for this dataset in the cache.
638 Parameters
639 ----------
640 ref : `DatasetRef`
641 The dataset to look up in or write to the cache.
642 extension : `str`
643 File extension to use for this file. Should include the
644 leading "``.``".
646 Returns
647 -------
648 uri : `lsst.resources.ResourcePath`
649 URI to use for this dataset in the cache.
650 """
651 return _construct_cache_path(self.cache_directory, ref, extension)
653 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
654 # Docstring inherited
655 if not self.should_be_cached(ref):
656 return None
658 # Write the file using the id of the dataset ref and the file
659 # extension.
660 cached_location = self._construct_cache_name(ref, uri.getExtension())
662 # Run cache expiry to ensure that we have room for this
663 # item.
664 self._expire_cache()
666 # The above reset the in-memory cache status. It's entirely possible
667 # that another process has just cached this file (if multiple
668 # processes are caching on read), so check our in-memory cache
669 # before attempting to cache the dataset.
670 path_in_cache = cached_location.relative_to(self.cache_directory)
671 if path_in_cache and path_in_cache in self._cache_entries:
672 return cached_location
674 # Move into the cache. Given that multiple processes might be
675 # sharing a single cache directory, and the file we need might have
676 # been copied in whilst we were checking, allow overwrite without
677 # complaint. Even for a private cache directory it is possible that
678 # a second butler in a subprocess could be writing to it.
679 cached_location.transfer_from(uri, transfer="move", overwrite=True)
680 log.debug("Cached dataset %s to %s", ref, cached_location)
682 self._register_cache_entry(cached_location)
684 return cached_location
686 @contextlib.contextmanager
687 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
688 # Docstring inherited
689 # Short circuit this if the cache directory has not been created yet.
690 if self._cache_directory is None:
691 yield None
692 return
694 cached_location = self._construct_cache_name(ref, extension)
695 if cached_location.exists():
696 log.debug("Found cached file %s for dataset %s.", cached_location, ref)
698 # The cached file could be removed by another process doing
699 # cache expiration so we need to protect against that by making
700 # a copy in a different tree. Use hardlinks to ensure that
701 # we either have the cached file or we don't. This is robust
702 # against race conditions that can be caused by using soft links
703 # and the other end of the link being deleted just after it
704 # is created.
705 path_in_cache = cached_location.relative_to(self.cache_directory)
706 assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"
708 # Need to use a unique file name for the temporary location to
709 # ensure that two different processes can read the file
710 # simultaneously without one of them deleting it when it's in
711 # use elsewhere. Retain the original filename for easier debugging.
712 random = str(uuid.uuid4())[:8]
713 basename = cached_location.basename()
714 filename = f"{random}-{basename}"
716 temp_location: ResourcePath | None = self._temp_exempt_directory.join(filename)
717 try:
718 if temp_location is not None:
719 temp_location.transfer_from(cached_location, transfer="hardlink")
720 except Exception as e:
721 log.debug("Detected error creating hardlink for dataset %s: %s", ref, e)
722 # Any failure will be treated as if the file was not
723 # in the cache. Yielding the original cache location
724 # is too dangerous.
725 temp_location = None
727 try:
728 log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
729 yield temp_location
730 finally:
731 try:
732 if temp_location:
733 temp_location.remove()
734 except FileNotFoundError:
735 pass
736 return
738 log.debug("Dataset %s not found in cache.", ref)
739 yield None
740 return
742 def remove_from_cache(self, refs: DatasetRef | Iterable[DatasetRef]) -> None:
743 # Docstring inherited.
745 # Stop early if there are no cache entries anyhow.
746 if len(self._cache_entries) == 0:
747 return
749 if isinstance(refs, DatasetRef):
750 refs = [refs]
752 # Create a set of all the IDs
753 all_ids = {ref.id for ref in refs}
755 keys_to_remove = []
756 for key, entry in self._cache_entries.items():
757 if entry.ref in all_ids:
758 keys_to_remove.append(key)
759 self._remove_from_cache(keys_to_remove)
761 def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str | None:
762 """Record the file in the cache registry.
764 Parameters
765 ----------
766 cached_location : `lsst.resources.ResourcePath`
767 Location of the file to be registered.
768 can_exist : `bool`, optional
769 If `True` the item being registered may already be known to the registry.
770 This can allow a cache refresh to run without checking the
771 file again. If `False` it is an error for the registry to
772 already know about this file.
774 Returns
775 -------
776 cache_key : `str` or `None`
777 The key used in the registry for this file. `None` if the file
778 no longer exists (it could have been expired by another process).
779 """
780 path_in_cache = cached_location.relative_to(self.cache_directory)
781 if path_in_cache is None:
782 raise ValueError(
783 f"Can not register cached file {cached_location} that is not within"
784 f" the cache directory at {self.cache_directory}."
785 )
786 if path_in_cache in self._cache_entries:
787 if can_exist:
788 return path_in_cache
789 else:
790 raise ValueError(
791 f"Cached file {cached_location} is already known to the registry"
792 " but this was expected to be a new file."
793 )
794 try:
795 details = CacheEntry.from_file(cached_location, root=self.cache_directory)
796 except FileNotFoundError:
797 return None
798 self._cache_entries[path_in_cache] = details
799 return path_in_cache
801 def scan_cache(self) -> None:
802 """Scan the cache directory and record information about files."""
803 found = set()
804 for file in ResourcePath.findFileResources([self.cache_directory]):
805 assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"
807 # Skip any that are found in an exempt part of the hierarchy
808 # since they should not be part of the registry.
809 if file.relative_to(self._temp_exempt_directory) is not None:
810 continue
812 path_in_cache = self._register_cache_entry(file, can_exist=True)
813 if path_in_cache:
814 found.add(path_in_cache)
816 # Find any files that were recorded in the cache but are no longer
817 # on disk. (something else cleared them out?)
818 known_to_cache = set(self._cache_entries)
819 missing = known_to_cache - found
821 if missing:
822 log.debug(
823 "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing)
824 )
825 for path_in_cache in missing:
826 self._cache_entries.pop(path_in_cache, None)
828 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
829 """Report if the dataset is known to the cache.
831 Parameters
832 ----------
833 ref : `DatasetRef`
834 Dataset to check for in the cache.
835 extension : `str`, optional
836 File extension expected. Should include the leading "``.``".
837 If `None` the extension is ignored and the dataset ID alone is
838 used to check in the cache. The extension must be defined if
839 a specific component is being checked.
841 Returns
842 -------
843 known : `bool`
844 Returns `True` if the dataset is currently known to the cache
845 and `False` otherwise. If the dataset refers to a component and
846 an extension is given then only that component is checked.
848 Notes
849 -----
850 This method can only report if the dataset is known to the cache
851 in this specific instant and does not indicate whether the file
852 can be read from the cache later. `find_in_cache()` should be called
853 if the cached file is to be used.
855 This method does not force the cache to be re-scanned and so can miss
856 cached datasets that have recently been written by other processes.
857 """
858 if self._cache_directory is None:
859 return False
860 if self.file_count == 0:
861 return False
863 if extension is None:
864 # Look solely for matching dataset ref ID and not specific
865 # components.
866 cached_paths = self._cache_entries.get_dataset_keys(ref.id)
867 return bool(cached_paths)
869 else:
870 # Extension is known so we can do an explicit look up for the
871 # cache entry.
872 cached_location = self._construct_cache_name(ref, extension)
873 path_in_cache = cached_location.relative_to(self.cache_directory)
874 assert path_in_cache is not None # For mypy
875 return path_in_cache in self._cache_entries
877 def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
878 """Remove the specified cache entries from cache.
880 Parameters
881 ----------
882 cache_entries : iterable of `str`
883 The entries to remove from the cache. The values are the path
884 within the cache.
885 """
886 for entry in cache_entries:
887 path = self.cache_directory.join(entry)
889 self._cache_entries.pop(entry, None)
890 log.debug("Removing file from cache: %s", path)
891 try:
892 path.remove()
893 except FileNotFoundError:
894 pass
896 def _expire_cache(self) -> None:
897 """Expire the files in the cache.
899 Notes
900 -----
901 The expiration modes are defined by the config or can be overridden.
902 Available options:
904 * ``files``: Number of files.
905 * ``datasets``: Number of datasets.
906 * ``size``: Total size of files.
907 * ``age``: Age of files.
909 The first three remove entries in time order, oldest first.
910 Counting files is complicated by the possibility of disassembled
911 composites, where 10 small files can be created for each dataset.
913 Additionally there is a use case for an external user to explicitly
914 state the dataset refs that should be cached and when to remove
915 them, overriding any global configuration.
916 """
917 if self._expiration_mode is None:
918 # Expiration has been disabled.
919 return
921 # mypy can't be sure we have set a threshold properly
922 if self._expiration_threshold is None:
923 log.warning(
924 "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode
925 )
926 return
928 # Sync up cache. There is no file locking involved so for a shared
929 # cache multiple processes may be racing to delete files. Deleting
930 # a file that no longer exists is not an error.
931 self.scan_cache()
933 if self._expiration_mode == "files":
934 n_files = len(self._cache_entries)
935 n_over = n_files - self._expiration_threshold
936 if n_over > 0:
937 sorted_keys = self._sort_cache()
938 keys_to_remove = sorted_keys[:n_over]
939 self._remove_from_cache(keys_to_remove)
940 return
942 if self._expiration_mode == "datasets":
943 # Count the datasets, in ascending timestamp order,
944 # so that oldest turn up first.
945 datasets = defaultdict(list)
946 for key in self._sort_cache():
947 entry = self._cache_entries[key]
948 datasets[entry.ref].append(key)
950 n_datasets = len(datasets)
951 n_over = n_datasets - self._expiration_threshold
952 if n_over > 0:
953 # Keys will be read out in insert order which
954 # will be date order so oldest ones are removed.
955 ref_ids = list(datasets.keys())[:n_over]
956 keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
957 self._remove_from_cache(keys_to_remove)
958 return
960 if self._expiration_mode == "size":
961 if self.cache_size > self._expiration_threshold:
962 for key in self._sort_cache():
963 self._remove_from_cache([key])
964 if self.cache_size <= self._expiration_threshold:
965 break
966 return
968 if self._expiration_mode == "age":
969 now = datetime.datetime.utcnow()
970 for key in self._sort_cache():
971 delta = now - self._cache_entries[key].ctime
972 if delta.total_seconds() > self._expiration_threshold:
973 self._remove_from_cache([key])
974 else:
975 # We're already in date order.
976 break
977 return
979 raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")
981 def _sort_cache(self) -> list[str]:
982 """Sort the cache entries by time and return the sorted keys.
984 Returns
985 -------
986 sorted : `list` of `str`
987 Keys into the cache, sorted by time with oldest first.
988 """
990 def sort_by_time(key: str) -> datetime.datetime:
991 """Sorter key function using cache entry details."""
992 return self._cache_entries[key].ctime
994 return sorted(self._cache_entries, key=sort_by_time)
996 def __str__(self) -> str:
997 cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
998 return (
999 f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
1000 f"default={self._caching_default}) "
1001 f"n_files={self.file_count}, n_bytes={self.cache_size}"
1002 )
1005class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
1006 """A variant of the datastore cache where no cache is enabled.
1008 Parameters
1009 ----------
1010 config : `str` or `DatastoreCacheManagerConfig`
1011 Configuration to control caching.
1012 universe : `DimensionUniverse`
1013 Set of all known dimensions, used to expand and validate any used
1014 in lookup keys.
1015 """
1017 def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
1018 return
1020 def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
1021 """Indicate whether the entity should be added to the cache.
1023 Always returns `False`.
1024 """
1025 return False
1027 def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> ResourcePath | None:
1028 """Move dataset to cache but always refuse and returns `None`."""
1029 return None
1031 @contextlib.contextmanager
1032 def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[ResourcePath | None]:
1033 """Look for a dataset in the cache and return its location.
1035 Never finds a file.
1036 """
1037 yield None
1039 def remove_from_cache(self, ref: DatasetRef | Iterable[DatasetRef]) -> None:
1040 """Remove datasets from cache.
1042 Always does nothing.
1043 """
1044 return
1046 def known_to_cache(self, ref: DatasetRef, extension: str | None = None) -> bool:
1047 """Report if a dataset is known to the cache.
1049 Always returns `False`.
1050 """
1051 return False
1053 def __str__(self) -> str:
1054 return f"{type(self).__name__}()"