Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py : 85%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""In-memory datastore."""
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import time
29import logging
30from dataclasses import dataclass
31from typing import (
32 TYPE_CHECKING,
33 Any,
34 Dict,
35 Iterable,
36 List,
37 Mapping,
38 Optional,
39 Set,
40 Tuple,
41 Union,
42)
44from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
45from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
46from .genericDatastore import GenericBaseDatastore
48if TYPE_CHECKING:  48 ↛ 49: line 48 didn't jump to line 49, because the condition on line 48 was never true
49 from lsst.daf.butler import (Config, DatasetRef, DatasetType,
50 LookupKey)
51 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
53log = logging.getLogger(__name__)
56@dataclass(frozen=True)
57class StoredMemoryItemInfo(StoredDatastoreItemInfo):
58 """Internal InMemoryDatastore Metadata associated with a stored
59 DatasetRef.
60 """
61 __slots__ = {"timestamp", "storageClass", "parentID"}
63 timestamp: float
64 """Unix timestamp indicating the time the dataset was stored."""
66 storageClass: StorageClass
67 """StorageClass associated with the dataset."""
69 parentID: int
70 """ID of the parent `DatasetRef` if this entry is a concrete
71 composite. Not used if the dataset being stored is not a
72 virtual component of a composite.
73 """
76class InMemoryDatastore(GenericBaseDatastore):
77 """Basic Datastore for writing to an in memory cache.
79 This datastore is ephemeral in that the contents of the datastore
80 disappear when the Python process completes. This also means that
81 other processes can not access this datastore.
83 Parameters
84 ----------
85 config : `DatastoreConfig` or `str`
86 Configuration.
87 bridgeManager : `DatastoreRegistryBridgeManager`
88 Object that manages the interface between `Registry` and datastores.
89 butlerRoot : `str`, optional
90 Unused parameter.
92 Notes
93 -----
94 InMemoryDatastore does not support any file-based ingest.
95 """
97 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
98 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
99 absolute path. Can be None if no defaults specified.
100 """
102 isEphemeral = True
103 """A new datastore is created every time and datasets disappear when
104 the process shuts down."""
106 datasets: Dict[int, Any]
107 """Internal storage of datasets indexed by dataset ID."""
109 records: Dict[int, StoredMemoryItemInfo]
110 """Internal records about stored datasets."""
112 def __init__(self, config: Union[Config, str],
113 bridgeManager: DatastoreRegistryBridgeManager,
114 butlerRoot: Optional[str] = None):
115 super().__init__(config, bridgeManager)
117 # Name ourselves with the timestamp at which the datastore
118 # was created.
119 self.name = "{}@{}".format(type(self).__name__, time.time())
120 log.debug("Creating datastore %s", self.name)
122 # Storage of datasets, keyed by dataset_id
123 self.datasets: Dict[int, Any] = {}
125 # Records is distinct in order to track concrete composite components
126 # where we register multiple components for a single dataset.
127 self.records: Dict[int, StoredMemoryItemInfo] = {}
129 # Related records that share the same parent
130 self.related: Dict[int, Set[int]] = {}
132 self._bridge = bridgeManager.register(self.name, ephemeral=True)
134 @classmethod
135 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
136 """Set any filesystem-dependent config options for this Datastore to
137 be appropriate for a new empty repository with the given root.
139 Does nothing in this implementation.
141 Parameters
142 ----------
143 root : `str`
144 Filesystem path to the root of the data repository.
145 config : `Config`
146 A `Config` to update. Only the subset understood by
147 this component will be updated. Will not expand
148 defaults.
149 full : `Config`
150 A complete config with all defaults expanded that can be
151 converted to a `DatastoreConfig`. Read-only and will not be
152 modified by this method.
153 Repository-specific options that should not be obtained
154 from defaults when Butler instances are constructed
155 should be copied from ``full`` to ``config``.
156 overwrite : `bool`, optional
157 If `False`, do not modify a value in ``config`` if the value
158 already exists. Default is always to overwrite with the provided
159 ``root``.
161 Notes
162 -----
163 If a keyword is explicitly defined in the supplied ``config`` it
164 will not be overridden by this method if ``overwrite`` is `False`.
165 This allows explicit values set in external configs to be retained.
166 """
167 return
169 @property
170 def bridge(self) -> DatastoreRegistryBridge:
171 # Docstring inherited from GenericBaseDatastore.
172 return self._bridge
174 def addStoredItemInfo(self, refs: Iterable[DatasetRef],
175 infos: Iterable[StoredMemoryItemInfo]) -> None:
176 # Docstring inherited from GenericBaseDatastore.
177 for ref, info in zip(refs, infos):
178 if ref.id is None: 178 ↛ 179: line 178 didn't jump to line 179, because the condition on line 178 was never true
179 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
180 self.records[ref.id] = info
181 self.related.setdefault(info.parentID, set()).add(ref.id)
183 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
184 # Docstring inherited from GenericBaseDatastore.
185 if ref.id is None: 185 ↛ 186: line 185 didn't jump to line 186, because the condition on line 185 was never true
186 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
187 return self.records[ref.id]
189 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
190 # Docstring inherited from GenericBaseDatastore.
191 return [self.getStoredItemInfo(ref)]
193 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
194 # Docstring inherited from GenericBaseDatastore.
195 # If a component has been removed previously then we can sometimes
196 # be asked to remove it again. Other datastores ignore this
197 # so also ignore here
198 if ref.id is None: 198 ↛ 199: line 198 didn't jump to line 199, because the condition on line 198 was never true
199 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
200 if ref.id not in self.records: 200 ↛ 201: line 200 didn't jump to line 201, because the condition on line 200 was never true
201 return
202 record = self.records[ref.id]
203 del self.records[ref.id]
204 self.related[record.parentID].remove(ref.id)
206 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[int, StoredMemoryItemInfo]:
207 """Check that the dataset is present and return the real ID and
208 associated information.
210 Parameters
211 ----------
212 ref : `DatasetRef`
213 Target `DatasetRef`
215 Returns
216 -------
217 realID : `int`
218 The dataset ID associated with this ref that should be used. This
219 could either be the ID of the supplied `DatasetRef` or the parent.
220 storageInfo : `StoredMemoryItemInfo`
221 Associated storage information.
223 Raises
224 ------
225 FileNotFoundError
226 Raised if the dataset is not present in this datastore.
227 """
228 try:
229 storedItemInfo = self.getStoredItemInfo(ref)
230 except KeyError:
231 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
232 realID = ref.id
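        # Every record carries a parentID: a dataset stored directly via put()
        # is its own parent, while a virtual component's record points at the
        # parent composite, so the lookup is redirected to the dataset ID under
        # which the in-memory object was actually stored.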
233 if storedItemInfo.parentID is not None: 233 ↛ 236: line 233 didn't jump to line 236, because the condition on line 233 was never false
234 realID = storedItemInfo.parentID
236 if realID not in self.datasets: 236 ↛ 237: line 236 didn't jump to line 237, because the condition on line 236 was never true
237 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
239 return realID, storedItemInfo
241 def exists(self, ref: DatasetRef) -> bool:
242 """Check if the dataset exists in the datastore.
244 Parameters
245 ----------
246 ref : `DatasetRef`
247 Reference to the required dataset.
249 Returns
250 -------
251 exists : `bool`
252 `True` if the entity exists in the `Datastore`.
253 """
254 try:
255 self._get_dataset_info(ref)
256 except FileNotFoundError:
257 return False
258 return True
260 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
261 """Load an InMemoryDataset from the store.
263 Parameters
264 ----------
265 ref : `DatasetRef`
266 Reference to the required Dataset.
267 parameters : `dict`
268 `StorageClass`-specific parameters that specify, for example,
269 a slice of the dataset to be loaded.
271 Returns
272 -------
273 inMemoryDataset : `object`
274 Requested dataset or slice thereof as an InMemoryDataset.
276 Raises
277 ------
278 FileNotFoundError
279 Requested dataset can not be retrieved.
280 TypeError
281 Return value from formatter has unexpected type.
282 ValueError
283 Formatter failed to process the dataset.
284 """
286 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
288 realID, storedItemInfo = self._get_dataset_info(ref)
290 # We have a write storage class and a read storage class and they
291 # can be different for concrete composites.
292 readStorageClass = ref.datasetType.storageClass
293 writeStorageClass = storedItemInfo.storageClass
295 # Check that the supplied parameters are suitable for the type read
296 readStorageClass.validateParameters(parameters)
298 inMemoryDataset = self.datasets[realID]
300 component = ref.datasetType.component()
302 # Different storage classes implies a component request
303 if readStorageClass != writeStorageClass:
305 if component is None: 305 ↛ 306: line 305 didn't jump to line 306, because the condition on line 305 was never true
306 raise ValueError("Storage class inconsistency ({} vs {}) but no"
307 " component requested".format(readStorageClass.name,
308 writeStorageClass.name))
310 # Concrete composite written as a single object (we hope)
311 inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component)
313 # Since there is no formatter to process parameters, they all must be
314 # passed to the assembler.
315 return self._post_process_get(inMemoryDataset, readStorageClass, parameters,
316 isComponent=component is not None)
318 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
319 """Write a InMemoryDataset with a given `DatasetRef` to the store.
321 Parameters
322 ----------
323 inMemoryDataset : `object`
324 The dataset to store.
325 ref : `DatasetRef`
326 Reference to the associated Dataset.
328 Raises
329 ------
330 TypeError
331 Supplied object and storage class are inconsistent.
332 DatasetTypeNotSupportedError
333 The associated `DatasetType` is not handled by this datastore.
335 Notes
336 -----
337 If the datastore is configured to reject certain dataset types it
338 is possible that the put will fail and raise a
339 `DatasetTypeNotSupportedError`. The main use case for this is to
340 allow `ChainedDatastore` to put to multiple datastores without
341 requiring that every datastore accepts the dataset.
342 """
344 if ref.id is None: 344 ↛ 345: line 344 didn't jump to line 345, because the condition on line 344 was never true
345 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
347 self._validate_put_parameters(inMemoryDataset, ref)
349 self.datasets[ref.id] = inMemoryDataset
350 log.debug("Store %s in %s", ref, self.name)
352 # Store time we received this content, to allow us to optionally
353 # expire it. Instead of storing a filename here, we include the
354 # ID of this datasetRef so we can find it from components.
355 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
356 parentID=ref.id)
358 # We have to register this content with registry.
359 # Currently this assumes we have a file so we need to use stub entries
360 # TODO: Add to ephemeral part of registry
361 self._register_datasets([(ref, itemInfo)])
363 if self._transaction is not None:
364 self._transaction.registerUndo("put", self.remove, ref)
366 def getUri(self, ref: DatasetRef, predict: bool = False) -> str:
367 """URI to the Dataset.
369 Always uses "mem://" URI prefix.
371 Parameters
372 ----------
373 ref : `DatasetRef`
374 Reference to the required Dataset.
375 predict : `bool`
376 If `True`, allow URIs to be returned for datasets that have not
377 been written.
379 Returns
380 -------
381 uri : `str`
382 URI string pointing to the dataset within the datastore. If the
383 dataset does not exist in the datastore, and if ``predict`` is
384 `True`, the URI will be a prediction and will include a URI
385 fragment "#predicted".
386 If the datastore does not have entities that relate well
387 to the concept of a URI, the returned URI string will be
388 descriptive. The returned URI is not guaranteed to be obtainable.
390 Raises
391 ------
392 FileNotFoundError
393 A URI has been requested for a dataset that does not exist and
394 guessing is not allowed.
396 """
398 # if this has never been written then we have to guess
399 if not self.exists(ref):
400 if not predict:
401 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
402 name = "{}#predicted".format(ref.datasetType.name)
403 else:
404 realID, _ = self._get_dataset_info(ref)
405 name = '{}'.format(id(self.datasets[realID]))
407 return "mem://{}".format(name)
409 def trash(self, ref: DatasetRef, ignore_errors: bool = False) -> None:
410 """Indicate to the Datastore that a dataset can be removed.
412 Parameters
413 ----------
414 ref : `DatasetRef`
415 Reference to the required Dataset.
416 ignore_errors : `bool`, optional
417 Indicate that errors should be ignored.
419 Raises
420 ------
421 FileNotFoundError
422 Attempt to remove a dataset that does not exist.
424 Notes
425 -----
426 Concurrency should not normally be an issue for the in-memory datastore
427 since all internal changes are isolated solely to this process and
428 the registry only changes rows associated with this process.
429 """
431 log.debug("Trash %s in datastore %s", ref, self.name)
433 # Check that this dataset is known to datastore
434 try:
435 self._get_dataset_info(ref)
437 # Move datasets to trash table
438 self._move_to_trash_in_registry(ref)
439 except Exception as e:
440 if ignore_errors:
441 log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
442 ref, self.name, e)
443 else:
444 raise
446 def emptyTrash(self, ignore_errors: bool = False) -> None:
447 """Remove all datasets from the trash.
449 Parameters
450 ----------
451 ignore_errors : `bool`, optional
452 Ignore errors.
454 Notes
455 -----
456 This method modifies the internal tracking of datasets; transaction
457 handling is not supported if a problem occurs before the datasets
458 themselves are deleted.
460 Concurrency should not normally be an issue for the in-memory datastore
461 since all internal changes are isolated solely to this process and
462 the registry only changes rows associated with this process.
463 """
464 log.debug("Emptying trash in datastore %s", self.name)
465 with self._bridge.emptyTrash() as trashed:
466 for ref in trashed:
467 try:
468 realID, _ = self._get_dataset_info(ref)
469 except Exception as e:
470 if ignore_errors:
471 log.warning("Emptying trash in datastore %s but encountered an "
472 "error with dataset %s: %s",
473 self.name, ref.id, e)
474 continue
475 else:
476 raise
478 # Determine whether all references to this dataset have been
479 # removed and we can delete the dataset itself
480 allRefs = self.related[realID]
481 theseRefs = {r.id for r in ref.allRefs()}
482 remainingRefs = allRefs - theseRefs
483 if not remainingRefs:
484 log.debug("Removing artifact %s from datastore %s", realID, self.name)
485 del self.datasets[realID]
487 # Remove this entry
488 self.removeStoredItemInfo(ref)
490 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
491 logFailures: bool = False) -> None:
492 """Validate some of the configuration for this datastore.
494 Parameters
495 ----------
496 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
497 Entities to test against this configuration. Can be differing
498 types.
499 logFailures : `bool`, optional
500 If `True`, output a log message for every validation error
501 detected.
503 Raises
504 ------
505 DatastoreValidationError
506 Raised if there is a validation problem with a configuration.
507 All the problems are reported in a single exception.
509 Notes
510 -----
511 This method is a no-op.
512 """
513 return
515 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
516 # Docstring is inherited from base class
517 return transfer
519 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
520 # Docstring is inherited from base class
521 return
523 def getLookupKeys(self) -> Set[LookupKey]:
524 # Docstring is inherited from base class
525 return self.constraints.getLookupKeys()
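
A minimal usage sketch of the public surface listed above. The `config`, `bridge_manager`, `ref`, and `in_memory_object` arguments are hypothetical placeholders assumed to come from an existing Butler repository setup; only the call pattern shown by this module is illustrated, not a definitive recipe.

from lsst.daf.butler.datastores.inMemoryDatastore import InMemoryDatastore


def round_trip(config, bridge_manager, ref, in_memory_object):
    """Store, inspect, and retrieve a dataset in an ephemeral datastore."""
    # config may be a DatastoreConfig or a str; bridge_manager is a
    # DatastoreRegistryBridgeManager (both assumed to exist already).
    datastore = InMemoryDatastore(config, bridge_manager)

    # put() requires a resolved DatasetRef (ref.id must not be None).
    datastore.put(in_memory_object, ref)
    assert datastore.exists(ref)

    # URIs always use the "mem://" prefix and are purely descriptive.
    print(datastore.getUri(ref))

    # get() returns the stored object, or a component of it when ref is a
    # component DatasetRef of a concrete composite.
    retrieved = datastore.get(ref)

    # Removal is two-step: mark the dataset as trash, then empty the trash.
    datastore.trash(ref)
    datastore.emptyTrash()
    assert not datastore.exists(ref)
    return retrieved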