Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 88%
193 statements
coverage.py v6.5.0, created at 2023-02-23 11:07 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""In-memory datastore."""
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import logging
29import time
30from dataclasses import dataclass
31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
32from urllib.parse import urlencode
34from lsst.daf.butler import (
35 DatasetId,
36 DatasetRef,
37 DatasetRefURIs,
38 DatastoreRecordData,
39 StorageClass,
40 StoredDatastoreItemInfo,
41)
42from lsst.daf.butler.core.utils import transactional
43from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
44from lsst.resources import ResourcePath
46from .genericDatastore import GenericBaseDatastore
48if TYPE_CHECKING:  # coverage 48 ↛ 49: the condition on line 48 was never true
49 from lsst.daf.butler import Config, DatasetType, LookupKey
50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
52log = logging.getLogger(__name__)
55@dataclass(frozen=True)
56class StoredMemoryItemInfo(StoredDatastoreItemInfo):
57 """Internal InMemoryDatastore Metadata associated with a stored
58 DatasetRef.
59 """
61 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}
63 timestamp: float
64 """Unix timestamp indicating the time the dataset was stored."""
66 storageClass: StorageClass
67 """StorageClass associated with the dataset."""
69 parentID: DatasetId
70 """ID of the parent `DatasetRef` if this entry is a concrete
71 composite. Not used if the dataset being stored is not a
72 virtual component of a composite.
73 """
75 dataset_id: DatasetId
76 """DatasetId associated with this record."""
79class InMemoryDatastore(GenericBaseDatastore):
80 """Basic Datastore for writing to an in memory cache.
82 This datastore is ephemeral in that the contents of the datastore
83 disappear when the Python process completes. This also means that
84 other processes cannot access this datastore.
86 Parameters
87 ----------
88 config : `DatastoreConfig` or `str`
89 Configuration.
90 bridgeManager : `DatastoreRegistryBridgeManager`
91 Object that manages the interface between `Registry` and datastores.
92 butlerRoot : `str`, optional
93 Unused parameter.
95 Notes
96 -----
97 InMemoryDatastore does not support any file-based ingest.
98 """
100 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
101 """Path to configuration defaults. Accessed within the ``configs`` resource
102 or relative to a search path. Can be None if no defaults specified.
103 """
105 isEphemeral = True
106 """A new datastore is created every time and datasets disappear when
107 the process shuts down."""
109 datasets: Dict[DatasetId, Any]
110 """Internal storage of datasets indexed by dataset ID."""
112 records: Dict[DatasetId, StoredMemoryItemInfo]
113 """Internal records about stored datasets."""
115 def __init__(
116 self,
117 config: Union[Config, str],
118 bridgeManager: DatastoreRegistryBridgeManager,
119 butlerRoot: Optional[str] = None,
120 ):
121 super().__init__(config, bridgeManager)
123 # Name ourselves with the timestamp the datastore
124 # was created.
125 self.name = "{}@{}".format(type(self).__name__, time.time())
126 log.debug("Creating datastore %s", self.name)
128 # Storage of datasets, keyed by dataset_id
129 self.datasets: Dict[DatasetId, Any] = {}
131 # Records is distinct in order to track concrete composite components
132 # where we register multiple components for a single dataset.
133 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}
135 # Related records that share the same parent
136 self.related: Dict[DatasetId, Set[DatasetId]] = {}
138 self._bridge = bridgeManager.register(self.name, ephemeral=True)
140 @classmethod
141 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
142 """Set any filesystem-dependent config options for this Datastore to
143 be appropriate for a new empty repository with the given root.
145 Does nothing in this implementation.
147 Parameters
148 ----------
149 root : `str`
150 Filesystem path to the root of the data repository.
151 config : `Config`
152 A `Config` to update. Only the subset understood by
153 this component will be updated. Will not expand
154 defaults.
155 full : `Config`
156 A complete config with all defaults expanded that can be
157 converted to a `DatastoreConfig`. Read-only and will not be
158 modified by this method.
159 Repository-specific options that should not be obtained
160 from defaults when Butler instances are constructed
161 should be copied from ``full`` to ``config``.
162 overwrite : `bool`, optional
163 If `False`, do not modify a value in ``config`` if the value
164 already exists. Default is always to overwrite with the provided
165 ``root``.
167 Notes
168 -----
169 If a keyword is explicitly defined in the supplied ``config`` it
170 will not be overridden by this method if ``overwrite`` is `False`.
171 This allows explicit values set in external configs to be retained.
172 """
173 return
175 @property
176 def bridge(self) -> DatastoreRegistryBridge:
177 # Docstring inherited from GenericBaseDatastore.
178 return self._bridge
180 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
181 # Docstring inherited from GenericBaseDatastore.
182 for ref, info in zip(refs, infos):
183 if ref.id is None:  # coverage 183 ↛ 184: the condition on line 183 was never true
184 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
185 self.records[ref.id] = info
186 self.related.setdefault(info.parentID, set()).add(ref.id)
188 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
189 # Docstring inherited from GenericBaseDatastore.
190 if ref.id is None:  # coverage 190 ↛ 191: the condition on line 190 was never true
191 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
192 return self.records[ref.id]
194 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
195 # Docstring inherited from GenericBaseDatastore.
196 return [self.getStoredItemInfo(ref)]
198 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
199 # Docstring inherited from GenericBaseDatastore.
200 # If a component has been removed previously then we can sometimes
201 # be asked to remove it again. Other datastores ignore this
202 # so we also ignore it here.
203 if ref.id is None:  # coverage 203 ↛ 204: the condition on line 203 was never true
204 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
205 if ref.id not in self.records:
206 return
207 record = self.records[ref.id]
208 del self.records[ref.id]
209 self.related[record.parentID].remove(ref.id)
211 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
212 """Check that the dataset is present and return the real ID and
213 associated information.
215 Parameters
216 ----------
217 ref : `DatasetRef`
218 Target `DatasetRef`
220 Returns
221 -------
222 realID : `int`
223 The dataset ID associated with this ref that should be used. This
224 could either be the ID of the supplied `DatasetRef` or the parent.
225 storageInfo : `StoredMemoryItemInfo`
226 Associated storage information.
228 Raises
229 ------
230 FileNotFoundError
231 Raised if the dataset is not present in this datastore.
232 """
233 try:
234 storedItemInfo = self.getStoredItemInfo(ref)
235 except KeyError:
236 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
237 realID = ref.id
238 if storedItemInfo.parentID is not None:  # coverage 238 ↛ 241: the condition on line 238 was never false
239 realID = storedItemInfo.parentID
241 if realID not in self.datasets:  # coverage 241 ↛ 242: the condition on line 241 was never true
242 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
244 return realID, storedItemInfo
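# Illustrative sketch (not part of the original source): for a dataset written
# directly via put(), the stored record carries ``parentID=ref.id``, so the
# "real" ID returned here is the ref's own ID; for a component registered
# against a composite it would be the parent's ID instead. ``datastore`` and
# ``ref`` below are hypothetical, pre-existing objects.
#
#     real_id, info = datastore._get_dataset_info(ref)
#     assert real_id == info.parentID
#     assert real_id in datastore.datasets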
246 def knows(self, ref: DatasetRef) -> bool:
247 """Check if the dataset is known to the datastore.
249 This datastore does not distinguish dataset existence from knowledge
250 of a dataset.
252 Parameters
253 ----------
254 ref : `DatasetRef`
255 Reference to the required dataset.
257 Returns
258 -------
259 exists : `bool`
260 `True` if the dataset is known to the datastore.
261 """
262 return self.exists(ref)
264 def exists(self, ref: DatasetRef) -> bool:
265 """Check if the dataset exists in the datastore.
267 Parameters
268 ----------
269 ref : `DatasetRef`
270 Reference to the required dataset.
272 Returns
273 -------
274 exists : `bool`
275 `True` if the entity exists in the `Datastore`.
276 """
277 try:
278 self._get_dataset_info(ref)
279 except FileNotFoundError:
280 return False
281 return True
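# Usage note (illustrative, not part of the original source): this datastore
# does not distinguish knowledge of a dataset from its existence, so the two
# checks below always agree. ``datastore`` and ``ref`` are hypothetical.
#
#     assert datastore.knows(ref) == datastore.exists(ref)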
283 def get(
284 self,
285 ref: DatasetRef,
286 parameters: Optional[Mapping[str, Any]] = None,
287 storageClass: Optional[Union[StorageClass, str]] = None,
288 ) -> Any:
289 """Load an InMemoryDataset from the store.
291 Parameters
292 ----------
293 ref : `DatasetRef`
294 Reference to the required Dataset.
295 parameters : `dict`
296 `StorageClass`-specific parameters that specify, for example,
297 a slice of the dataset to be loaded.
298 storageClass : `StorageClass` or `str`, optional
299 The storage class to be used to override the Python type
300 returned by this method. By default the returned type matches
301 the dataset type definition for this dataset. Specifying a
302 read `StorageClass` can force a different type to be returned.
303 This type must be compatible with the original type.
305 Returns
306 -------
307 inMemoryDataset : `object`
308 Requested dataset or slice thereof as an InMemoryDataset.
310 Raises
311 ------
312 FileNotFoundError
313 Requested dataset can not be retrieved.
314 TypeError
315 Return value from formatter has unexpected type.
316 ValueError
317 Formatter failed to process the dataset.
318 """
320 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
322 realID, storedItemInfo = self._get_dataset_info(ref)
324 # We have a write storage class and a read storage class and they
325 # can be different for concrete composites or if overridden.
326 if storageClass is not None:
327 ref = ref.overrideStorageClass(storageClass)
328 refStorageClass = ref.datasetType.storageClass
329 writeStorageClass = storedItemInfo.storageClass
331 component = ref.datasetType.component()
333 # Check that the supplied parameters are suitable for the type read
334 # If this is a derived component we validate against the composite
335 isDerivedComponent = False
336 if component in writeStorageClass.derivedComponents:
337 writeStorageClass.validateParameters(parameters)
338 isDerivedComponent = True
339 else:
340 refStorageClass.validateParameters(parameters)
342 inMemoryDataset = self.datasets[realID]
344 # If this is a derived component we need to apply parameters
345 # before we retrieve the component. We assume that the parameters
346 # will affect the data globally, before the derived component
347 # is selected.
348 if isDerivedComponent:
349 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
350 # Then disable parameters for later
351 parameters = {}
353 # Check if we have a component.
354 if component:
355 # In-memory datastore must have stored the dataset as a single
356 # object in the write storage class. We therefore use that
357 # storage class delegate to obtain the component.
358 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
360 # Since there is no formatter to process parameters, they all must be
361 # passed to the assembler.
362 inMemoryDataset = self._post_process_get(
363 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None
364 )
366 # Last minute type conversion.
367 return refStorageClass.coerce_type(inMemoryDataset)
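# Illustrative sketch (not part of the original source): parameters and an
# optional read storage class are applied on the way out of the store. The
# storage class name "DataFrame" and the ``columns`` parameter below are
# assumptions for the sake of the example; valid values depend on the
# dataset type being read. ``datastore`` and ``ref`` are hypothetical.
#
#     subset = datastore.get(ref, parameters={"columns": ["a", "b"]})
#     converted = datastore.get(ref, storageClass="DataFrame")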
369 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
370 """Write a InMemoryDataset with a given `DatasetRef` to the store.
372 Parameters
373 ----------
374 inMemoryDataset : `object`
375 The dataset to store.
376 ref : `DatasetRef`
377 Reference to the associated Dataset.
379 Raises
380 ------
381 TypeError
382 Supplied object and storage class are inconsistent.
383 DatasetTypeNotSupportedError
384 The associated `DatasetType` is not handled by this datastore.
386 Notes
387 -----
388 If the datastore is configured to reject certain dataset types it
389 is possible that the put will fail and raise a
390 `DatasetTypeNotSupportedError`. The main use case for this is to
391 allow `ChainedDatastore` to put to multiple datastores without
392 requiring that every datastore accepts the dataset.
393 """
395 if ref.id is None:  # coverage 395 ↛ 396: the condition on line 395 was never true
396 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
398 # May need to coerce the in memory dataset to the correct
399 # python type, otherwise parameters may not work.
400 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
402 self._validate_put_parameters(inMemoryDataset, ref)
404 self.datasets[ref.id] = inMemoryDataset
405 log.debug("Store %s in %s", ref, self.name)
407 # Store time we received this content, to allow us to optionally
408 # expire it. Instead of storing a filename here, we include the
409 # ID of this datasetRef so we can find it from components.
410 itemInfo = StoredMemoryItemInfo(
411 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.getCheckedId()
412 )
414 # We have to register this content with registry.
415 # Currently this assumes we have a file so we need to use stub entries
416 # TODO: Add to ephemeral part of registry
417 self._register_datasets([(ref, itemInfo)])
419 if self._transaction is not None:
420 self._transaction.registerUndo("put", self.remove, ref)
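# Round-trip sketch (illustrative, not part of the original source): objects are
# kept in a plain dict keyed by dataset ID, so a put followed by a get returns
# the (possibly type-coerced) object with no serialization step. ``datastore``,
# ``ref`` and ``my_object`` are hypothetical.
#
#     datastore.put(my_object, ref)
#     assert datastore.exists(ref)
#     round_tripped = datastore.get(ref)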
422 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
423 """Return URIs associated with dataset.
425 Parameters
426 ----------
427 ref : `DatasetRef`
428 Reference to the required dataset.
429 predict : `bool`, optional
430 If the datastore does not know about the dataset, should it
431 return a predicted URI or not?
433 Returns
434 -------
435 uris : `DatasetRefURIs`
436 The URI to the primary artifact associated with this dataset (if
437 the dataset was disassembled within the datastore this may be
438 `None`), and the URIs to any components associated with the dataset
439 artifact (can be empty if there are no components).
441 Notes
442 -----
443 The URIs returned for in-memory datastores are not usable but
444 provide an indication of the associated dataset.
445 """
447 # Include the dataID as a URI query
448 query = urlencode(ref.dataId)
450 # if this has never been written then we have to guess
451 if not self.exists(ref):
452 if not predict:
453 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
454 name = f"{ref.datasetType.name}"
455 fragment = "#predicted"
456 else:
457 realID, _ = self._get_dataset_info(ref)
458 name = f"{id(self.datasets[realID])}?{query}"
459 fragment = ""
461 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
463 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
464 """URI to the Dataset.
466 Always uses the "mem://" URI prefix.
468 Parameters
469 ----------
470 ref : `DatasetRef`
471 Reference to the required Dataset.
472 predict : `bool`
473 If `True`, allow URIs to be returned of datasets that have not
474 been written.
476 Returns
477 -------
478 uri : `str`
479 URI pointing to the dataset within the datastore. If the
480 dataset does not exist in the datastore, and if ``predict`` is
481 `True`, the URI will be a prediction and will include a URI
482 fragment "#predicted".
483 If the datastore does not have entities that relate well
484 to the concept of a URI, the returned URI string will be
485 descriptive. The returned URI is not guaranteed to be obtainable.
487 Raises
488 ------
489 FileNotFoundError
490 A URI has been requested for a dataset that does not exist and
491 guessing is not allowed.
492 AssertionError
493 Raised if an internal error occurs.
494 """
495 primary, _ = self.getURIs(ref, predict)
496 if primary is None:  # coverage 496 ↛ 499: the condition on line 496 was never true
497 # This should be impossible since this datastore does
498 # not disassemble. This check also helps mypy.
499 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
500 return primary
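# Illustrative sketch (not part of the original source): the URI returned for a
# stored dataset is a synthetic "mem://" identifier built from the Python id()
# of the stored object plus the dataId query string; it cannot be dereferenced.
# ``datastore`` and ``ref`` are hypothetical.
#
#     uri = datastore.getURI(ref)
#     assert uri.scheme == "mem"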
502 def retrieveArtifacts(
503 self,
504 refs: Iterable[DatasetRef],
505 destination: ResourcePath,
506 transfer: str = "auto",
507 preserve_path: bool = True,
508 overwrite: Optional[bool] = False,
509 ) -> List[ResourcePath]:
510 """Retrieve the file artifacts associated with the supplied refs.
512 Notes
513 -----
514 Not implemented by this datastore.
515 """
516 # Could conceivably launch a FileDatastore to use formatters to write
517 # the data but this is fraught with problems.
518 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
520 def forget(self, refs: Iterable[DatasetRef]) -> None:
521 # Docstring inherited.
522 refs = list(refs)
523 self._bridge.forget(refs)
524 for ref in refs:
525 self.removeStoredItemInfo(ref)
527 @transactional
528 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None:
529 """Indicate to the Datastore that a dataset can be removed.
531 Parameters
532 ----------
533 ref : `DatasetRef` or iterable thereof
534 Reference to the required Dataset(s).
535 ignore_errors : `bool`, optional
536 Indicate that errors should be ignored.
538 Raises
539 ------
540 FileNotFoundError
541 Attempt to remove a dataset that does not exist. Only relevant
542 if a single dataset ref is given.
544 Notes
545 -----
546 Concurrency should not normally be an issue for the in-memory datastore
547 since all internal changes are isolated solely to this process and
548 the registry only changes rows associated with this process.
549 """
550 if not isinstance(ref, DatasetRef):
551 log.debug("Bulk trashing of datasets in datastore %s", self.name)
552 self.bridge.moveToTrash(ref, transaction=self._transaction)
553 return
555 log.debug("Trash %s in datastore %s", ref, self.name)
557 # Check that this dataset is known to datastore
558 try:
559 self._get_dataset_info(ref)
561 # Move datasets to trash table
562 self.bridge.moveToTrash([ref], transaction=self._transaction)
563 except Exception as e:
564 if ignore_errors:  # coverage 564 ↛ 565: the condition on line 564 was never true
565 log.warning(
566 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
567 )
568 else:
569 raise
571 def emptyTrash(self, ignore_errors: bool = False) -> None:
572 """Remove all datasets from the trash.
574 Parameters
575 ----------
576 ignore_errors : `bool`, optional
577 Ignore errors.
579 Notes
580 -----
581 The internal tracking of datasets is affected by this method and
582 transaction handling is not supported if there is a problem before
583 the datasets themselves are deleted.
585 Concurrency should not normally be an issue for the in-memory datastore
586 since all internal changes are isolated solely to this process and
587 the registry only changes rows associated with this process.
588 """
589 log.debug("Emptying trash in datastore %s", self.name)
590 with self._bridge.emptyTrash() as trash_data:
591 trashed, _ = trash_data
592 for ref, _ in trashed:
593 try:
594 realID, _ = self._get_dataset_info(ref)
595 except FileNotFoundError:  # coverage 595 ↛ 598: line 595 didn't jump to line 598
596 # Dataset already removed so ignore it
597 continue
598 except Exception as e:
599 if ignore_errors:
600 log.warning(
601 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
602 self.name,
603 ref.id,
604 e,
605 )
606 continue
607 else:
608 raise
610 # Determine whether all references to this dataset have been
611 # removed and we can delete the dataset itself
612 allRefs = self.related[realID]
613 remainingRefs = allRefs - {ref.id}
614 if not remainingRefs:  # coverage 614 ↛ 619: the condition on line 614 was never false
615 log.debug("Removing artifact %s from datastore %s", realID, self.name)
616 del self.datasets[realID]
618 # Remove this entry
619 self.removeStoredItemInfo(ref)
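# Deletion sketch (illustrative, not part of the original source): removal is a
# two-step operation, mirroring the file-based datastores. ``datastore`` and
# ``ref`` are hypothetical.
#
#     datastore.trash(ref)       # move the dataset to the registry trash table
#     datastore.emptyTrash()     # drop the object from self.datasets
#     assert not datastore.exists(ref)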
621 def validateConfiguration(
622 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
623 ) -> None:
624 """Validate some of the configuration for this datastore.
626 Parameters
627 ----------
628 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
629 Entities to test against this configuration. Can be differing
630 types.
631 logFailures : `bool`, optional
632 If `True`, output a log message for every validation error
633 detected.
635 Raises
636 ------
637 DatastoreValidationError
638 Raised if there is a validation problem with a configuration.
639 All the problems are reported in a single exception.
641 Notes
642 -----
643 This method is a no-op.
644 """
645 return
647 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
648 # Docstring is inherited from base class
649 return transfer
651 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
652 # Docstring is inherited from base class
653 return
655 def getLookupKeys(self) -> Set[LookupKey]:
656 # Docstring is inherited from base class
657 return self.constraints.getLookupKeys()
659 def needs_expanded_data_ids(
660 self,
661 transfer: Optional[str],
662 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
663 ) -> bool:
664 # Docstring inherited.
665 return False
667 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
668 # Docstring inherited from the base class.
669 return
671 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
672 # Docstring inherited from the base class.
674 # In-memory Datastore records cannot be exported or imported
675 return {}