# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Cache management for a datastore."""

__all__ = (
    "AbstractDatastoreCacheManager",
    "DatastoreDisabledCacheManager",
    "DatastoreCacheManager",
    "DatastoreCacheManagerConfig",
)

import atexit
import contextlib
import datetime
import itertools
import logging
import os
import shutil
import tempfile
from abc import ABC, abstractmethod
from collections import defaultdict
from random import Random
from typing import (
    TYPE_CHECKING,
    Dict,
    ItemsView,
    Iterable,
    Iterator,
    KeysView,
    List,
    Optional,
    Union,
    ValuesView,
)

from lsst.resources import ResourcePath
from pydantic import BaseModel, PrivateAttr

from .config import ConfigSubset
from .configSupport import processLookupConfigs
from .datasets import DatasetId, DatasetRef

if TYPE_CHECKING:
    from .configSupport import LookupKey
    from .datasets import DatasetType
    from .dimensions import DimensionUniverse
    from .storageClass import StorageClass

log = logging.getLogger(__name__)


def remove_cache_directory(directory: str) -> None:
    """Remove the specified directory and all its contents."""
    log.debug("Removing temporary cache directory %s", directory)
    shutil.rmtree(directory, ignore_errors=True)


def _construct_cache_path(root: ResourcePath, ref: DatasetRef, extension: str) -> ResourcePath:
    """Construct the full path to use for this dataset in the cache.

    Parameters
    ----------
    root : `lsst.resources.ResourcePath`
        Root of the cache directory.
    ref : `DatasetRef`
        The dataset to look up in or write to the cache.
    extension : `str`
        File extension to use for this file. Should include the
        leading "``.``".

    Returns
    -------
    uri : `lsst.resources.ResourcePath`
        URI to use for this dataset in the cache.
    """
    # Dataset type component is needed in the name if composite
    # disassembly is happening since the ID is shared for all components.
    component = ref.datasetType.component()
    component = f"_{component}" if component else ""
    return root.join(f"{ref.id}{component}{extension}")


def _parse_cache_name(cached_location: str) -> Dict[str, Optional[str]]:
    """For a given cache name, return its component parts.

    Changes to ``_construct_cache_path()`` should be reflected here.

    Parameters
    ----------
    cached_location : `str`
        The name of the file within the cache.

    Returns
    -------
    parsed : `dict` of `str`, `str`
        Parsed components of the file. These include:
        - "id": The dataset ID,
        - "component": The name of the component (can be `None`),
        - "extension": File extension (can be `None`).
    """
    # Assume first dot is the extension and so allow .fits.gz
    root_ext = cached_location.split(".", maxsplit=1)
    root = root_ext.pop(0)
    ext = "." + root_ext.pop(0) if root_ext else None

    parts = root.split("_")
    id_ = parts.pop(0)
    component = parts.pop(0) if parts else None
    return {"id": id_, "component": component, "extension": ext}
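
# Illustrative example (hypothetical file name): a cached file named
# "2bd3b0fe-aa35-4a68-9c35-3a0c1d2c3e4a_wcs.fits.gz" parses to
# {"id": "2bd3b0fe-aa35-4a68-9c35-3a0c1d2c3e4a", "component": "wcs",
#  "extension": ".fits.gz"}.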


class CacheEntry(BaseModel):
    """Represent an entry in the cache."""

    name: str
    """Name of the file."""

    size: int
    """Size of the file in bytes."""

    ctime: datetime.datetime
    """Creation time of the file."""

    ref: DatasetId
    """ID of this dataset."""

    component: Optional[str]
    """Component for this disassembled composite (optional)."""

    @classmethod
    def from_file(cls, file: ResourcePath, root: ResourcePath) -> CacheEntry:
        """Construct an object from a file name.

        Parameters
        ----------
        file : `lsst.resources.ResourcePath`
            Path to the file.
        root : `lsst.resources.ResourcePath`
            Cache root directory.
        """
        file_in_cache = file.relative_to(root)
        if file_in_cache is None:
            raise ValueError(f"Supplied file {file} is not inside root {root}")
        parts = _parse_cache_name(file_in_cache)

        stat = os.stat(file.ospath)
        return cls(
            name=file_in_cache,
            size=stat.st_size,
            ref=parts["id"],
            component=parts["component"],
            ctime=datetime.datetime.utcfromtimestamp(stat.st_ctime),
        )


class _MarkerEntry(CacheEntry):
    pass


class CacheRegistry(BaseModel):
    """Collection of cache entries."""

    _size: int = PrivateAttr(0)
    """Size of the cache."""

    _entries: Dict[str, CacheEntry] = PrivateAttr({})
    """Internal collection of cache entries."""

    _ref_map: Dict[DatasetId, List[str]] = PrivateAttr({})
    """Mapping of DatasetID to corresponding keys in cache registry."""

    @property
    def cache_size(self) -> int:
        return self._size

    def __getitem__(self, key: str) -> CacheEntry:
        return self._entries[key]

    def __setitem__(self, key: str, entry: CacheEntry) -> None:
        self._size += entry.size
        self._entries[key] = entry

        # Update the mapping from ref to path.
        if entry.ref not in self._ref_map:
            self._ref_map[entry.ref] = []
        self._ref_map[entry.ref].append(key)

    def __delitem__(self, key: str) -> None:
        entry = self._entries.pop(key)
        self._decrement(entry)
        self._ref_map[entry.ref].remove(key)

    def _decrement(self, entry: Optional[CacheEntry]) -> None:
        if entry:
            self._size -= entry.size
            if self._size < 0:
                log.warning("Cache size has gone negative. Inconsistent cache records...")
                self._size = 0

    def __contains__(self, key: str) -> bool:
        return key in self._entries

    def __len__(self) -> int:
        return len(self._entries)

    def __iter__(self) -> Iterator[str]:  # type: ignore
        return iter(self._entries)

    def keys(self) -> KeysView[str]:
        return self._entries.keys()

    def values(self) -> ValuesView[CacheEntry]:
        return self._entries.values()

    def items(self) -> ItemsView[str, CacheEntry]:
        return self._entries.items()

    # A private marker to indicate that pop() should raise if no default
    # is given.
    __marker = _MarkerEntry(name="marker", size=0, ref=0, ctime=datetime.datetime.utcfromtimestamp(0))

    def pop(self, key: str, default: Optional[CacheEntry] = __marker) -> Optional[CacheEntry]:
        # The marker for dict.pop is not the same as our marker.
        if default is self.__marker:
            entry = self._entries.pop(key)
        else:
            entry = self._entries.pop(key, self.__marker)
            # Should not attempt to correct for this entry being removed
            # if we got the default value.
            if entry is self.__marker:
                return default

        self._decrement(entry)
        # The default entry given to this method may not even be in the cache.
        if entry and entry.ref in self._ref_map:
            keys = self._ref_map[entry.ref]
            if key in keys:
                keys.remove(key)
        return entry

    def get_dataset_keys(self, dataset_id: Optional[DatasetId]) -> Optional[List[str]]:
        """Retrieve all keys associated with the given dataset ID.

        Parameters
        ----------
        dataset_id : `DatasetId` or `None`
            The dataset ID to look up. Returns `None` if the ID is `None`.

        Returns
        -------
        keys : `list` [`str`]
            Keys associated with this dataset. These keys can be used to
            look up the cache entry information in the `CacheRegistry`.
            Returns `None` if the dataset is not known to the cache.
        """
        if dataset_id not in self._ref_map:
            return None
        keys = self._ref_map[dataset_id]
        if not keys:
            return None
        return keys


class DatastoreCacheManagerConfig(ConfigSubset):
    """Configuration information for `DatastoreCacheManager`."""

    component = "cached"
    requiredKeys = ("cacheable",)


class AbstractDatastoreCacheManager(ABC):
    """An abstract base class for managing caching in a Datastore.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    @property
    def cache_size(self) -> int:
        """Size of the cache in bytes."""
        return 0

    @property
    def file_count(self) -> int:
        """Return number of cached files tracked by registry."""
        return 0

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        if not isinstance(config, DatastoreCacheManagerConfig):
            config = DatastoreCacheManagerConfig(config)
        assert isinstance(config, DatastoreCacheManagerConfig)
        self.config = config

    @abstractmethod
    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        This is relevant when reading or writing.

        Parameters
        ----------
        entity : `StorageClass` or `DatasetType` or `DatasetRef`
            Thing to test against the configuration. The ``name`` property
            is used to determine a match. A `DatasetType` will first check
            its name, before checking its `StorageClass`. If there are no
            matches the default will be returned.

        Returns
        -------
        should_cache : `bool`
            Returns `True` if the dataset should be cached; `False` otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.
        """
        raise NotImplementedError()

    @abstractmethod
    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move a file to the cache.

        Move the given file into the cache, using the supplied DatasetRef
        for naming. A call is made to `should_be_cached()` and if the
        DatasetRef should not be accepted `None` will be returned.

        Cache expiry can occur during this.

        Parameters
        ----------
        uri : `lsst.resources.ResourcePath`
            Location of the file to be relocated to the cache. Will be moved.
        ref : `DatasetRef`
            Ref associated with this file. Will be used to determine the name
            of the file within the cache.

        Returns
        -------
        new : `lsst.resources.ResourcePath` or `None`
            URI to the file within the cache, or `None` if the dataset
            was not accepted by the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to locate in the cache.
        extension : `str`
            File extension expected. Should include the leading "``.``".

        Yields
        ------
        uri : `lsst.resources.ResourcePath` or `None`
            The URI to the cached file, or `None` if the file has not been
            cached.

        Notes
        -----
        Should be used as a context manager in order to prevent this
        file from being removed from the cache for that context.
        """
        raise NotImplementedError()
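
    # A minimal usage sketch (``manager`` and ``ref`` are hypothetical objects,
    # not defined in this module); the context manager protects the file from
    # cache expiration while it is being read:
    #
    #     with manager.find_in_cache(ref, ".fits") as cached_uri:
    #         if cached_uri is not None:
    #             data = cached_uri.read()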

    @abstractmethod
    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove the specified datasets from the cache.

        It is not an error for these datasets to be missing from the cache.

        Parameters
        ----------
        ref : `DatasetRef` or iterable of `DatasetRef`
            The datasets to remove from the cache.
        """
        raise NotImplementedError()

    @abstractmethod
    def __str__(self) -> str:
        raise NotImplementedError()


class DatastoreCacheManager(AbstractDatastoreCacheManager):
    """A class for managing caching in a Datastore using local files.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.

    Notes
    -----
    Two environment variables can be used to override the cache directory
    and expiration configuration:

    * ``$DAF_BUTLER_CACHE_DIRECTORY``
    * ``$DAF_BUTLER_CACHE_EXPIRATION_MODE``

    The expiration mode should take the form ``mode=threshold``; for example,
    to limit the cache directory to 5 datasets the value would be
    ``datasets=5``.

    Additionally the ``$DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET`` environment
    variable can be used to indicate that this directory should be used
    if no explicit directory has been specified from configuration or from
    the ``$DAF_BUTLER_CACHE_DIRECTORY`` environment variable.
    """

    _temp_exemption_prefix = "exempt/"
    _tmpdir_prefix = "butler-cache-dir-"

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        super().__init__(config, universe)

        # Set cache directory if it pre-exists, else defer creation until
        # requested. Allow external override from environment.
        root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY") or self.config.get("root")

        # Allow the execution environment to override the default values
        # so long as no default value has been set from the line above.
        if root is None:
            root = os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET")

        self._cache_directory = (
            ResourcePath(root, forceAbsolute=True, forceDirectory=True) if root is not None else None
        )

        if self._cache_directory:
            if not self._cache_directory.isLocal:
                raise ValueError(
                    f"Cache directory must be on a local file system. Got: {self._cache_directory}"
                )
            # Ensure that the cache directory is created. We assume that
            # someone specifying a permanent cache directory will be expecting
            # it to always be there. This will also trigger an error
            # early rather than waiting until the cache is needed.
            self._cache_directory.mkdir()

        # Calculate the caching lookup table.
        self._lut = processLookupConfigs(self.config["cacheable"], universe=universe)

        # Default decision for whether a dataset should be cached.
        self._caching_default = self.config.get("default", False)

        # Expiration mode. Read from config but allow override from
        # the environment.
        expiration_mode = self.config.get(("expiry", "mode"))
        threshold = self.config.get(("expiry", "threshold"))

        external_mode = os.environ.get("DAF_BUTLER_CACHE_EXPIRATION_MODE")
        if external_mode and "=" in external_mode:
            expiration_mode, expiration_threshold = external_mode.split("=", 1)
            threshold = int(expiration_threshold)
        if expiration_mode is None:
            # Force to None to avoid confusion.
            threshold = None

        self._expiration_mode: Optional[str] = expiration_mode
        self._expiration_threshold: Optional[int] = threshold
        if self._expiration_threshold is None and self._expiration_mode is not None:
            raise ValueError(
                f"Cache expiration threshold must be set for expiration mode {self._expiration_mode}"
            )

        log.debug(
            "Cache configuration:\n- root: %s\n- expiration mode: %s",
            self._cache_directory if self._cache_directory else "tmpdir",
            f"{self._expiration_mode}={self._expiration_threshold}" if self._expiration_mode else "disabled",
        )

        # Files in cache, indexed by path within the cache directory.
        self._cache_entries = CacheRegistry()

    @property
    def cache_directory(self) -> ResourcePath:
        if self._cache_directory is None:
            # Create on demand. Allow the override environment variable
            # to be used in case it got set after this object was created
            # but before a cache was used.
            if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
                # Someone else will clean this up.
                isTemporary = False
                msg = "deferred fallback"
            else:
                cache_dir = tempfile.mkdtemp(prefix=self._tmpdir_prefix)
                isTemporary = True
                msg = "temporary"

            self._cache_directory = ResourcePath(cache_dir, forceDirectory=True, isTemporary=isTemporary)
            log.debug("Using %s cache directory at %s", msg, self._cache_directory)

            # Remove when we no longer need it.
            if isTemporary:
                atexit.register(remove_cache_directory, self._cache_directory.ospath)
        return self._cache_directory

    @property
    def _temp_exempt_directory(self) -> ResourcePath:
        """Return the directory in which to store temporary cache files that
        should not be expired.
        """
        return self.cache_directory.join(self._temp_exemption_prefix)

    @property
    def cache_size(self) -> int:
        return self._cache_entries.cache_size

    @property
    def file_count(self) -> int:
        return len(self._cache_entries)

    @classmethod
    def set_fallback_cache_directory_if_unset(cls) -> tuple[bool, str]:
        """Define a fallback cache directory if one is not already set.

        Returns
        -------
        defined : `bool`
            `True` if the fallback directory was newly-defined in this method.
            `False` if it had already been set.
        cache_dir : `str`
            Returns the path to the cache directory that will be used if it's
            needed. This can allow the caller to run a directory cleanup
            when it's no longer needed (something that the cache manager
            can not do because forks should not clean up directories defined
            by the parent process).

        Notes
        -----
        The fallback directory will not be defined if one has already been
        defined. This method sets the ``DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET``
        environment variable only if a value has not previously been stored
        in that environment variable. Setting the environment variable allows
        this value to survive into spawned subprocesses. Calling this method
        will lead to all subsequently created cache managers sharing the same
        cache.
        """
        if cache_dir := os.environ.get("DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"):
            # A value has already been set.
            return (False, cache_dir)

        # As a class method, we do not know at this point whether a cache
        # directory will be needed so it would be impolite to create a
        # directory that will never be used.

        # Construct our own temp name -- 16 characters should have a fairly
        # low chance of clashing when combined with the process ID.
        characters = "abcdefghijklmnopqrstuvwxyz0123456789_"
        rng = Random()
        tempchars = "".join(rng.choice(characters) for _ in range(16))

        tempname = f"{cls._tmpdir_prefix}{os.getpid()}-{tempchars}"

        cache_dir = os.path.join(tempfile.gettempdir(), tempname)
        os.environ["DAF_BUTLER_CACHE_DIRECTORY_IF_UNSET"] = cache_dir
        return (True, cache_dir)
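
    # A hedged usage sketch (``run_forked_workers`` is hypothetical, not part of
    # this module): a parent process can pin a shared fallback cache before
    # forking workers, then clean it up itself once the workers are done.
    #
    #     defined, cache_dir = DatastoreCacheManager.set_fallback_cache_directory_if_unset()
    #     try:
    #         run_forked_workers()  # subprocesses inherit the environment variable
    #     finally:
    #         if defined:
    #             shutil.rmtree(cache_dir, ignore_errors=True)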

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        # Docstring inherited
        matchName: Union[LookupKey, str] = "{} (via default)".format(entity)
        should_cache = self._caching_default

        for key in entity._lookupNames():
            if key in self._lut:
                should_cache = bool(self._lut[key])
                matchName = key
                break

        if not isinstance(should_cache, bool):
            raise TypeError(
                f"Got cache value {should_cache!r} for config entry {matchName!r}; expected bool."
            )

        log.debug("%s (match: %s) should%s be cached", entity, matchName, "" if should_cache else " not")
        return should_cache
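
    # Illustrative lookup behaviour (hypothetical names, not from this package's
    # default configuration): with ``cacheable: {Exposure: true}``, a DatasetType
    # whose StorageClass is "Exposure" matches via its storage class lookup name,
    # while an entity with no matching lookup name falls back to
    # ``self._caching_default``.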

    def _construct_cache_name(self, ref: DatasetRef, extension: str) -> ResourcePath:
        """Construct the name to use for this dataset in the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset to look up in or write to the cache.
        extension : `str`
            File extension to use for this file. Should include the
            leading "``.``".

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI to use for this dataset in the cache.
        """
        return _construct_cache_path(self.cache_directory, ref, extension)

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        # Docstring inherited
        if ref.id is None:
            raise ValueError(f"Can not cache a file associated with an unresolved reference ({ref})")

        if not self.should_be_cached(ref):
            return None

        # Write the file using the id of the dataset ref and the file
        # extension.
        cached_location = self._construct_cache_name(ref, uri.getExtension())

        # Run cache expiry to ensure that we have room for this
        # item.
        self._expire_cache()

        # The above reset the in-memory cache status. It's entirely possible
        # that another process has just cached this file (if multiple
        # processes are caching on read), so check our in-memory cache
        # before attempting to cache the dataset.
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache and path_in_cache in self._cache_entries:
            return cached_location

        # Move into the cache. Given that multiple processes might be
        # sharing a single cache directory, and the file we need might have
        # been copied in whilst we were checking, allow overwrite without
        # complaint. Even for a private cache directory it is possible that
        # a second butler in a subprocess could be writing to it.
        cached_location.transfer_from(uri, transfer="move", overwrite=True)
        log.debug("Cached dataset %s to %s", ref, cached_location)

        self._register_cache_entry(cached_location)

        return cached_location

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        # Docstring inherited
        # Short circuit this if the cache directory has not been created yet.
        if self._cache_directory is None:
            yield None
            return

        cached_location = self._construct_cache_name(ref, extension)
        if cached_location.exists():
            log.debug("Found cached file %s for dataset %s.", cached_location, ref)

            # The cached file could be removed by another process doing
            # cache expiration so we need to protect against that by making
            # a copy in a different tree. Use hardlinks to ensure that
            # we either have the cached file or we don't. This is robust
            # against race conditions that can be caused by using soft links
            # and the other end of the link being deleted just after it
            # is created.
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None, f"Somehow {cached_location} not in cache directory"
            temp_location: Optional[ResourcePath] = self._temp_exempt_directory.join(path_in_cache)
            try:
                if temp_location is not None:
                    temp_location.transfer_from(cached_location, transfer="hardlink")
            except Exception as e:
                log.debug("Detected error creating hardlink for dataset %s: %s", ref, e)
                # Any failure will be treated as if the file was not
                # in the cache. Yielding the original cache location
                # is too dangerous.
                temp_location = None

            try:
                log.debug("Yielding temporary cache location %s for dataset %s", temp_location, ref)
                yield temp_location
            finally:
                try:
                    if temp_location:
                        temp_location.remove()
                except FileNotFoundError:
                    pass
            return

        log.debug("Dataset %s not found in cache.", ref)
        yield None
        return

    def remove_from_cache(self, refs: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        # Docstring inherited.

        # Stop early if there are no cache entries anyhow.
        if len(self._cache_entries) == 0:
            return

        if isinstance(refs, DatasetRef):
            refs = [refs]

        # Create a set of all the IDs
        all_ids = {ref.getCheckedId() for ref in refs}

        keys_to_remove = []
        for key, entry in self._cache_entries.items():
            if entry.ref in all_ids:
                keys_to_remove.append(key)
        self._remove_from_cache(keys_to_remove)

    def _register_cache_entry(self, cached_location: ResourcePath, can_exist: bool = False) -> str:
        """Record the file in the cache registry.

        Parameters
        ----------
        cached_location : `lsst.resources.ResourcePath`
            Location of the file to be registered.
        can_exist : `bool`, optional
            If `True` the item being registered can already be listed.
            This can allow a cache refresh to run without checking the
            file again. If `False` it is an error for the registry to
            already know about this file.

        Returns
        -------
        cache_key : `str`
            The key used in the registry for this file.
        """
        path_in_cache = cached_location.relative_to(self.cache_directory)
        if path_in_cache is None:
            raise ValueError(
                f"Can not register cached file {cached_location} that is not within"
                f" the cache directory at {self.cache_directory}."
            )
        if path_in_cache in self._cache_entries:
            if can_exist:
                return path_in_cache
            else:
                raise ValueError(
                    f"Cached file {cached_location} is already known to the registry"
                    " but this was expected to be a new file."
                )
        details = CacheEntry.from_file(cached_location, root=self.cache_directory)
        self._cache_entries[path_in_cache] = details
        return path_in_cache

    def scan_cache(self) -> None:
        """Scan the cache directory and record information about files."""
        found = set()
        for file in ResourcePath.findFileResources([self.cache_directory]):
            assert isinstance(file, ResourcePath), "Unexpectedly did not get ResourcePath from iterator"

            # Skip any that are found in an exempt part of the hierarchy
            # since they should not be part of the registry.
            if file.relative_to(self._temp_exempt_directory) is not None:
                continue

            path_in_cache = self._register_cache_entry(file, can_exist=True)
            found.add(path_in_cache)

        # Find any files that were recorded in the cache but are no longer
        # on disk. (something else cleared them out?)
        known_to_cache = set(self._cache_entries)
        missing = known_to_cache - found

        if missing:
            log.debug(
                "Entries no longer on disk but thought to be in cache and so removed: %s", ",".join(missing)
            )
            for path_in_cache in missing:
                self._cache_entries.pop(path_in_cache, None)

    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if the dataset is known to the cache.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to check for in the cache.
        extension : `str`, optional
            File extension expected. Should include the leading "``.``".
            If `None` the extension is ignored and the dataset ID alone is
            used to check in the cache. The extension must be defined if
            a specific component is being checked.

        Returns
        -------
        known : `bool`
            Returns `True` if the dataset is currently known to the cache
            and `False` otherwise. If the dataset refers to a component and
            an extension is given then only that component is checked.

        Notes
        -----
        This method can only report if the dataset is known to the cache
        in this specific instant and does not indicate whether the file
        can be read from the cache later. `find_in_cache()` should be called
        if the cached file is to be used.

        This method does not force the cache to be re-scanned and so can miss
        cached datasets that have recently been written by other processes.
        """
        if self._cache_directory is None:
            return False
        if self.file_count == 0:
            return False

        if extension is None:
            # Look solely for matching dataset ref ID and not specific
            # components.
            cached_paths = self._cache_entries.get_dataset_keys(ref.id)
            return True if cached_paths else False
        else:
            # Extension is known so we can do an explicit look up for the
            # cache entry.
            cached_location = self._construct_cache_name(ref, extension)
            path_in_cache = cached_location.relative_to(self.cache_directory)
            assert path_in_cache is not None  # For mypy
            return path_in_cache in self._cache_entries

    def _remove_from_cache(self, cache_entries: Iterable[str]) -> None:
        """Remove the specified cache entries from cache.

        Parameters
        ----------
        cache_entries : iterable of `str`
            The entries to remove from the cache. The values are the path
            within the cache.
        """
        for entry in cache_entries:
            path = self.cache_directory.join(entry)

            self._cache_entries.pop(entry, None)
            log.debug("Removing file from cache: %s", path)
            try:
                path.remove()
            except FileNotFoundError:
                pass

    def _expire_cache(self) -> None:
        """Expire the files in the cache.

        Notes
        -----
        The expiration modes are defined by the config or can be overridden.
        Available options:

        * ``files``: Number of files.
        * ``datasets``: Number of datasets.
        * ``size``: Total size of files.
        * ``age``: Age of files.

        The first three remove entries in time order, oldest first.
        Number of files is complicated by the possibility of disassembled
        composites where 10 small files can be created for each dataset.

        Additionally there is a use case for an external user to explicitly
        state the dataset refs that should be cached and then when to
        remove them, overriding any global configuration.
        """
        if self._expiration_mode is None:
            # Expiration has been disabled.
            return

        # mypy can't be sure we have set a threshold properly
        if self._expiration_threshold is None:
            log.warning(
                "Requesting cache expiry of mode %s but no threshold set in config.", self._expiration_mode
            )
            return

        # Sync up cache. There is no file locking involved so for a shared
        # cache multiple processes may be racing to delete files. Deleting
        # a file that no longer exists is not an error.
        self.scan_cache()

        if self._expiration_mode == "files":
            n_files = len(self._cache_entries)
            n_over = n_files - self._expiration_threshold
            if n_over > 0:
                sorted_keys = self._sort_cache()
                keys_to_remove = sorted_keys[:n_over]
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "datasets":
            # Count the datasets, in ascending timestamp order,
            # so that oldest turn up first.
            datasets = defaultdict(list)
            for key in self._sort_cache():
                entry = self._cache_entries[key]
                datasets[entry.ref].append(key)

            n_datasets = len(datasets)
            n_over = n_datasets - self._expiration_threshold
            if n_over > 0:
                # Keys will be read out in insert order which
                # will be date order so oldest ones are removed.
                ref_ids = list(datasets.keys())[:n_over]
                keys_to_remove = list(itertools.chain.from_iterable(datasets[ref_id] for ref_id in ref_ids))
                self._remove_from_cache(keys_to_remove)
            return

        if self._expiration_mode == "size":
            if self.cache_size > self._expiration_threshold:
                for key in self._sort_cache():
                    self._remove_from_cache([key])
                    if self.cache_size <= self._expiration_threshold:
                        break
            return

        if self._expiration_mode == "age":
            now = datetime.datetime.utcnow()
            for key in self._sort_cache():
                delta = now - self._cache_entries[key].ctime
                # Use total_seconds() so that ages longer than a day are not
                # truncated to the sub-day remainder of the timedelta.
                if delta.total_seconds() > self._expiration_threshold:
                    self._remove_from_cache([key])
                else:
                    # We're already in date order.
                    break
            return

        raise ValueError(f"Unrecognized cache expiration mode of {self._expiration_mode}")

    def _sort_cache(self) -> List[str]:
        """Sort the cache entries by time and return the sorted keys.

        Returns
        -------
        sorted : `list` of `str`
            Keys into the cache, sorted by time with oldest first.
        """

        def sort_by_time(key: str) -> datetime.datetime:
            """Sorter key function using cache entry details."""
            return self._cache_entries[key].ctime

        return sorted(self._cache_entries, key=sort_by_time)

    def __str__(self) -> str:
        cachedir = self._cache_directory if self._cache_directory else "<tempdir>"
        return (
            f"{type(self).__name__}@{cachedir} ({self._expiration_mode}={self._expiration_threshold},"
            f"default={self._caching_default}) "
            f"n_files={self.file_count}, n_bytes={self.cache_size}"
        )


class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
    """A variant of the datastore cache where no cache is enabled.

    Parameters
    ----------
    config : `str` or `DatastoreCacheManagerConfig`
        Configuration to control caching.
    universe : `DimensionUniverse`
        Set of all known dimensions, used to expand and validate any used
        in lookup keys.
    """

    def __init__(self, config: Union[str, DatastoreCacheManagerConfig], universe: DimensionUniverse):
        return

    def should_be_cached(self, entity: Union[DatasetRef, DatasetType, StorageClass]) -> bool:
        """Indicate whether the entity should be added to the cache.

        Always returns `False`.
        """
        return False

    def move_to_cache(self, uri: ResourcePath, ref: DatasetRef) -> Optional[ResourcePath]:
        """Move a dataset to the cache but always refuse, returning `None`."""
        return None

    @contextlib.contextmanager
    def find_in_cache(self, ref: DatasetRef, extension: str) -> Iterator[Optional[ResourcePath]]:
        """Look for a dataset in the cache and return its location.

        Never finds a file.
        """
        yield None

    def remove_from_cache(self, ref: Union[DatasetRef, Iterable[DatasetRef]]) -> None:
        """Remove datasets from the cache.

        Always does nothing.
        """
        return

    def known_to_cache(self, ref: DatasetRef, extension: Optional[str] = None) -> bool:
        """Report if a dataset is known to the cache.

        Always returns `False`.
        """
        return False

    def __str__(self) -> str:
        return f"{type(self).__name__}()"