Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py : 87%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import time
import logging
from dataclasses import dataclass
from urllib.parse import urlencode
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from lsst.daf.butler import DatasetId, StoredDatastoreItemInfo, StorageClass, ButlerURI
from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
from .genericDatastore import GenericBaseDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import (Config, DatasetRef, DatasetType,
                                 LookupKey)
    from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
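
    Examples
    --------
    A minimal sketch of how `InMemoryDatastore.put` builds a record; the
    resolved ``ref`` is assumed to come from the caller::

        info = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
                                    parentID=ref.id)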
61 """
62 __slots__ = {"timestamp", "storageClass", "parentID"}
64 timestamp: float
65 """Unix timestamp indicating the time the dataset was stored."""
67 storageClass: StorageClass
68 """StorageClass associated with the dataset."""
70 parentID: DatasetId
71 """ID of the parent `DatasetRef` if this entry is a concrete
72 composite. Not used if the dataset being stored is not a
73 virtual component of a composite
74 """
77class InMemoryDatastore(GenericBaseDatastore):
78 """Basic Datastore for writing to an in memory cache.
80 This datastore is ephemeral in that the contents of the datastore
81 disappear when the Python process completes. This also means that
82 other processes can not access this datastore.
84 Parameters
85 ----------
86 config : `DatastoreConfig` or `str`
87 Configuration.
88 bridgeManager : `DatastoreRegistryBridgeManager`
89 Object that manages the interface between `Registry` and datastores.
90 butlerRoot : `str`, optional
91 Unused parameter.
93 Notes
94 -----
95 InMemoryDatastore does not support any file-based ingest.
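
    Examples
    --------
    A minimal sketch of direct use. In practice the datastore is usually
    constructed from Butler configuration; ``config``, ``bridgeManager``,
    ``inMemoryDataset`` and the resolved ``ref`` below are assumed to
    already exist::

        datastore = InMemoryDatastore(config, bridgeManager)
        datastore.put(inMemoryDataset, ref)
        assert datastore.exists(ref)
        retrieved = datastore.get(ref)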
96 """
98 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
99 """Path to configuration defaults. Accessed within the ``configs`` resource
100 or relative to a search path. Can be None if no defaults specified.
101 """
103 isEphemeral = True
104 """A new datastore is created every time and datasets disappear when
105 the process shuts down."""
107 datasets: Dict[DatasetId, Any]
108 """Internal storage of datasets indexed by dataset ID."""
110 records: Dict[DatasetId, StoredMemoryItemInfo]
111 """Internal records about stored datasets."""
113 def __init__(self, config: Union[Config, str],
114 bridgeManager: DatastoreRegistryBridgeManager,
115 butlerRoot: Optional[str] = None):
116 super().__init__(config, bridgeManager)
118 # Name ourselves with the timestamp the datastore
119 # was created.
120 self.name = "{}@{}".format(type(self).__name__, time.time())
121 log.debug("Creating datastore %s", self.name)
123 # Storage of datasets, keyed by dataset_id
124 self.datasets: Dict[DatasetId, Any] = {}
126 # Records is distinct in order to track concrete composite components
127 # where we register multiple components for a single dataset.
128 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}
130 # Related records that share the same parent
131 self.related: Dict[DatasetId, Set[DatasetId]] = {}
133 self._bridge = bridgeManager.register(self.name, ephemeral=True)
135 @classmethod
136 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
137 """Set any filesystem-dependent config options for this Datastore to
138 be appropriate for a new empty repository with the given root.
140 Does nothing in this implementation.
142 Parameters
143 ----------
144 root : `str`
145 Filesystem path to the root of the data repository.
146 config : `Config`
147 A `Config` to update. Only the subset understood by
148 this component will be updated. Will not expand
149 defaults.
150 full : `Config`
151 A complete config with all defaults expanded that can be
152 converted to a `DatastoreConfig`. Read-only and will not be
153 modified by this method.
154 Repository-specific options that should not be obtained
155 from defaults when Butler instances are constructed
156 should be copied from ``full`` to ``config``.
157 overwrite : `bool`, optional
158 If `False`, do not modify a value in ``config`` if the value
159 already exists. Default is always to overwrite with the provided
160 ``root``.
162 Notes
163 -----
164 If a keyword is explicitly defined in the supplied ``config`` it
165 will not be overridden by this method if ``overwrite`` is `False`.
166 This allows explicit values set in external configs to be retained.
167 """
168 return
170 @property
171 def bridge(self) -> DatastoreRegistryBridge:
172 # Docstring inherited from GenericBaseDatastore.
173 return self._bridge
175 def addStoredItemInfo(self, refs: Iterable[DatasetRef],
176 infos: Iterable[StoredMemoryItemInfo]) -> None:
177 # Docstring inherited from GenericBaseDatastore.
178 for ref, info in zip(refs, infos):
179 if ref.id is None: 179 ↛ 180line 179 didn't jump to line 180, because the condition on line 179 was never true
180 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
181 self.records[ref.id] = info
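            # Group refs that share a parent ID so that emptyTrash() can
            # tell when the last reference to a stored object has gone.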
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
        # Docstring inherited from GenericBaseDatastore.
        if ref.id is None:
            raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this
        # so also ignore here.
        if ref.id is None:
            raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)

    def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
        realID = ref.id
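        # Component refs share the record of their parent, so when a parent
        # ID is recorded the stored object is looked up under that ID.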
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
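
        Examples
        --------
        A hedged sketch; the ``bbox`` parameter name is purely illustrative
        and must be one of the parameters defined by the dataset's
        `StorageClass`::

            full = datastore.get(ref)
            cutout = datastore.get(ref, parameters={"bbox": bbox})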
285 """
287 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
289 realID, storedItemInfo = self._get_dataset_info(ref)
291 # We have a write storage class and a read storage class and they
292 # can be different for concrete composites.
293 readStorageClass = ref.datasetType.storageClass
294 writeStorageClass = storedItemInfo.storageClass
296 component = ref.datasetType.component()
298 # Check that the supplied parameters are suitable for the type read
299 # If this is a derived component we validate against the composite
300 isDerivedComponent = False
301 if component in writeStorageClass.derivedComponents:
302 writeStorageClass.validateParameters(parameters)
303 isDerivedComponent = True
304 else:
305 readStorageClass.validateParameters(parameters)
307 inMemoryDataset = self.datasets[realID]
309 # if this is a read only component we need to apply parameters
310 # before we retrieve the component. We assume that the parameters
311 # will affect the data globally, before the derived component
312 # is selected.
313 if isDerivedComponent:
314 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
315 # Then disable parameters for later
316 parameters = {}
318 # Different storage classes implies a component request
319 if readStorageClass != writeStorageClass:
321 if component is None: 321 ↛ 322line 321 didn't jump to line 322, because the condition on line 321 was never true
322 raise ValueError("Storage class inconsistency ({} vs {}) but no"
323 " component requested".format(readStorageClass.name,
324 writeStorageClass.name))
326 # Concrete composite written as a single object (we hope)
327 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
329 # Since there is no formatter to process parameters, they all must be
330 # passed to the assembler.
331 return self._post_process_get(inMemoryDataset, readStorageClass, parameters,
332 isComponent=component is not None)
334 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
335 """Write a InMemoryDataset with a given `DatasetRef` to the store.
337 Parameters
338 ----------
339 inMemoryDataset : `object`
340 The dataset to store.
341 ref : `DatasetRef`
342 Reference to the associated Dataset.
344 Raises
345 ------
346 TypeError
347 Supplied object and storage class are inconsistent.
348 DatasetTypeNotSupportedError
349 The associated `DatasetType` is not handled by this datastore.
351 Notes
352 -----
353 If the datastore is configured to reject certain dataset types it
354 is possible that the put will fail and raise a
355 `DatasetTypeNotSupportedError`. The main use case for this is to
356 allow `ChainedDatastore` to put to multiple datastores without
357 requiring that every datastore accepts the dataset.
358 """
360 if ref.id is None: 360 ↛ 361line 360 didn't jump to line 361, because the condition on line 360 was never true
361 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
363 self._validate_put_parameters(inMemoryDataset, ref)
365 self.datasets[ref.id] = inMemoryDataset
366 log.debug("Store %s in %s", ref, self.name)
368 # Store time we received this content, to allow us to optionally
369 # expire it. Instead of storing a filename here, we include the
370 # ID of this datasetRef so we can find it from components.
371 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
372 parentID=ref.id)
374 # We have to register this content with registry.
375 # Currently this assumes we have a file so we need to use stub entries
376 # TODO: Add to ephemeral part of registry
377 self._register_datasets([(ref, itemInfo)])
379 if self._transaction is not None:
380 self._transaction.registerUndo("put", self.remove, ref)
382 def getURIs(self, ref: DatasetRef,
383 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
384 """Return URIs associated with dataset.
386 Parameters
387 ----------
388 ref : `DatasetRef`
389 Reference to the required dataset.
390 predict : `bool`, optional
391 If the datastore does not know about the dataset, should it
392 return a predicted URI or not?
394 Returns
395 -------
396 primary : `ButlerURI`
397 The URI to the primary artifact associated with this dataset.
398 If the dataset was disassembled within the datastore this
399 may be `None`.
400 components : `dict`
401 URIs to any components associated with the dataset artifact.
402 Can be empty if there are no components.
404 Notes
405 -----
406 The URIs returned for in-memory datastores are not usable but
407 provide an indication of the associated dataset.
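
        Examples
        --------
        A sketch, assuming ``ref`` refers to a dataset already stored here::

            primary, components = datastore.getURIs(ref)
            # primary.scheme == "mem"; components is always empty here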
408 """
410 # Include the dataID as a URI query
411 query = urlencode(ref.dataId)
413 # if this has never been written then we have to guess
414 if not self.exists(ref):
415 if not predict:
416 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
417 name = f"{ref.datasetType.name}"
418 fragment = "#predicted"
419 else:
420 realID, _ = self._get_dataset_info(ref)
421 name = f"{id(self.datasets[realID])}?{query}"
422 fragment = ""
424 return ButlerURI(f"mem://{name}?{query}{fragment}"), {}
426 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
427 """URI to the Dataset.
429 Always uses "mem://" URI prefix.
431 Parameters
432 ----------
433 ref : `DatasetRef`
434 Reference to the required Dataset.
435 predict : `bool`
436 If `True`, allow URIs to be returned of datasets that have not
437 been written.
439 Returns
440 -------
441 uri : `str`
442 URI pointing to the dataset within the datastore. If the
443 dataset does not exist in the datastore, and if ``predict`` is
444 `True`, the URI will be a prediction and will include a URI
445 fragment "#predicted".
446 If the datastore does not have entities that relate well
447 to the concept of a URI the returned URI string will be
448 descriptive. The returned URI is not guaranteed to be obtainable.
450 Raises
451 ------
452 FileNotFoundError
453 A URI has been requested for a dataset that does not exist and
454 guessing is not allowed.
455 AssertionError
456 Raised if an internal error occurs.
457 """
458 primary, _ = self.getURIs(ref, predict)
459 if primary is None: 459 ↛ 462line 459 didn't jump to line 462, because the condition on line 459 was never true
460 # This should be impossible since this datastore does
461 # not disassemble. This check also helps mypy.
462 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
463 return primary
465 def forget(self, refs: Iterable[DatasetRef]) -> None:
466 # Docstring inherited.
467 refs = list(refs)
468 self._bridge.forget(refs)
469 for ref in refs:
470 self.removeStoredItemInfo(ref)
472 def trash(self, ref: DatasetRef, ignore_errors: bool = False) -> None:
473 """Indicate to the Datastore that a dataset can be removed.
475 Parameters
476 ----------
477 ref : `DatasetRef`
478 Reference to the required Dataset.
479 ignore_errors: `bool`, optional
480 Indicate that errors should be ignored.
482 Raises
483 ------
484 FileNotFoundError
485 Attempt to remove a dataset that does not exist.
487 Notes
488 -----
489 Concurrency should not normally be an issue for the in memory datastore
490 since all internal changes are isolated to solely this process and
491 the registry only changes rows associated with this process.
492 """
494 log.debug("Trash %s in datastore %s", ref, self.name)
496 # Check that this dataset is known to datastore
497 try:
498 self._get_dataset_info(ref)
500 # Move datasets to trash table
501 self._move_to_trash_in_registry(ref)
502 except Exception as e:
503 if ignore_errors:
504 log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
505 ref, self.name, e)
506 else:
507 raise
509 def emptyTrash(self, ignore_errors: bool = False) -> None:
510 """Remove all datasets from the trash.
512 Parameters
513 ----------
514 ignore_errors : `bool`, optional
515 Ignore errors.
517 Notes
518 -----
519 The internal tracking of datasets is affected by this method and
520 transaction handling is not supported if there is a problem before
521 the datasets themselves are deleted.
523 Concurrency should not normally be an issue for the in memory datastore
524 since all internal changes are isolated to solely this process and
525 the registry only changes rows associated with this process.
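
        Examples
        --------
        A typical two-step removal, assuming ``ref`` is currently stored::

            datastore.trash(ref)
            datastore.emptyTrash()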
526 """
527 log.debug("Emptying trash in datastore %s", self.name)
528 with self._bridge.emptyTrash() as trashed:
529 for ref in trashed:
530 try:
531 realID, _ = self._get_dataset_info(ref)
532 except Exception as e:
533 if ignore_errors:
534 log.warning("Emptying trash in datastore %s but encountered an "
535 "error with dataset %s: %s",
536 self.name, ref.id, e)
537 continue
538 else:
539 raise
541 # Determine whether all references to this dataset have been
542 # removed and we can delete the dataset itself
543 allRefs = self.related[realID]
544 remainingRefs = allRefs - {ref.id}
545 if not remainingRefs: 545 ↛ 550line 545 didn't jump to line 550, because the condition on line 545 was never false
546 log.debug("Removing artifact %s from datastore %s", realID, self.name)
547 del self.datasets[realID]
549 # Remove this entry
550 self.removeStoredItemInfo(ref)
552 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
553 logFailures: bool = False) -> None:
554 """Validate some of the configuration for this datastore.
556 Parameters
557 ----------
558 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
559 Entities to test against this configuration. Can be differing
560 types.
561 logFailures : `bool`, optional
562 If `True`, output a log message for every validation error
563 detected.
565 Raises
566 ------
567 DatastoreValidationError
568 Raised if there is a validation problem with a configuration.
569 All the problems are reported in a single exception.
571 Notes
572 -----
573 This method is a no-op.
574 """
575 return
577 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
578 # Docstring is inherited from base class
579 return transfer
581 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
582 # Docstring is inherited from base class
583 return
585 def getLookupKeys(self) -> Set[LookupKey]:
586 # Docstring is inherited from base class
587 return self.constraints.getLookupKeys()
589 def needs_expanded_data_ids(
590 self,
591 transfer: Optional[str],
592 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
593 ) -> bool:
594 # Docstring inherited.
595 return False