Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 87% (192 statements)
coverage.py v6.4.4, created at 2022-09-27 08:58 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations
24"""In-memory datastore."""
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import logging
29import time
30from dataclasses import dataclass
31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
32from urllib.parse import urlencode
34from lsst.daf.butler import (
35 DatasetId,
36 DatasetRef,
37 DatasetRefURIs,
38 DatastoreRecordData,
39 StorageClass,
40 StoredDatastoreItemInfo,
41)
42from lsst.daf.butler.core.utils import transactional
43from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
44from lsst.resources import ResourcePath
46from .genericDatastore import GenericBaseDatastore
48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true
49 from lsst.daf.butler import Config, DatasetType, LookupKey
50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
52log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
    """

    __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: DatasetId
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """

    dataset_id: DatasetId
    """DatasetId associated with this record."""


class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
    """

    defaultConfigFile = "datastores/inMemoryDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    isEphemeral = True
    """A new datastore is created every time and datasets disappear when
    the process shuts down."""

    datasets: Dict[DatasetId, Any]
    """Internal storage of datasets indexed by dataset ID."""

    records: Dict[DatasetId, StoredMemoryItemInfo]
    """Internal records about stored datasets."""

    def __init__(
        self,
        config: Union[Config, str],
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: Optional[str] = None,
    ):
        super().__init__(config, bridgeManager)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = "{}@{}".format(type(self).__name__, time.time())
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets: Dict[DatasetId, Any] = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}

        # Related records that share the same parent
        self.related: Dict[DatasetId, Set[DatasetId]] = {}

        self._bridge = bridgeManager.register(self.name, ephemeral=True)
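
        # Illustrative note: after a successful ``put`` of a resolved ``ref``
        # the three mappings relate roughly as
        #
        #     self.datasets[ref.id] -> the stored Python object
        #     self.records[ref.id]  -> StoredMemoryItemInfo for that object
        #     self.related[parent]  -> {ids of records sharing that parent}
        #
        # where ``parent`` is ``self.records[ref.id].parentID``.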

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        # Docstring inherited from GenericBaseDatastore.
        return self._bridge

    def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos):
            if ref.id is None:
                raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
        # Docstring inherited from GenericBaseDatastore.
        if ref.id is None:
            raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this
        # so also ignore here.
        if ref.id is None:
            raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)

    def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `DatasetId`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def knows(self, ref: DatasetRef) -> bool:
        """Check if the dataset is known to the datastore.

        This datastore does not distinguish dataset existence from knowledge
        of a dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the dataset is known to the datastore.
        """
        return self.exists(ref)

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        realID, storedItemInfo = self._get_dataset_info(ref)

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites.
        readStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedItemInfo.storageClass

        component = ref.datasetType.component()

        # Check that the supplied parameters are suitable for the type read.
        # If this is a derived component we validate against the composite.
        isDerivedComponent = False
        if component in writeStorageClass.derivedComponents:
            writeStorageClass.validateParameters(parameters)
            isDerivedComponent = True
        else:
            readStorageClass.validateParameters(parameters)

        inMemoryDataset = self.datasets[realID]

        # If this is a derived component we need to apply parameters
        # before we retrieve the component. We assume that the parameters
        # will affect the data globally, before the derived component
        # is selected.
        if isDerivedComponent:
            inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
            # Then disable parameters for later
            parameters = {}

        # Different storage classes implies a component request
        if readStorageClass != writeStorageClass:
            if component is None:
                raise ValueError(
                    "Storage class inconsistency ({} vs {}) but no"
                    " component requested".format(readStorageClass.name, writeStorageClass.name)
                )

            # Concrete composite written as a single object (we hope)
            inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)

        # Since there is no formatter to process parameters, they all must be
        # passed to the assembler.
        return self._post_process_get(
            inMemoryDataset, readStorageClass, parameters, isComponent=component is not None
        )
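
    # Illustrative example (names assumed): for a composite written with one
    # storage class, a component request such as ``butler.get("calexp.wcs", ...)``
    # arrives here with ``readStorageClass != writeStorageClass``; the parent
    # object is fetched from ``self.datasets`` and the component extracted via
    # ``writeStorageClass.delegate().getComponent(parent, "wcs")``.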

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        if ref.id is None:
            raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")

        # May need to coerce the in memory dataset to the correct
        # python type, otherwise parameters may not work.
        inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)

        self._validate_put_parameters(inMemoryDataset, ref)

        self.datasets[ref.id] = inMemoryDataset
        log.debug("Store %s in %s", ref, self.name)

        # Store time we received this content, to allow us to optionally
        # expire it. Instead of storing a filename here, we include the
        # ID of this datasetRef so we can find it from components.
        itemInfo = StoredMemoryItemInfo(
            time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.getCheckedId()
        )

        # We have to register this content with registry.
        # Currently this assumes we have a file so we need to use stub entries
        # TODO: Add to ephemeral part of registry
        self._register_datasets([(ref, itemInfo)])

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)
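
    # Illustrative round trip (a sketch; ``ref`` is assumed to be a resolved
    # `DatasetRef` whose dataset type uses a dict-like storage class):
    #
    #     datastore.put({"a": 1}, ref)
    #     assert datastore.exists(ref)
    #     assert datastore.get(ref) == {"a": 1}
    #
    # In normal use these calls are made indirectly through ``Butler.put`` and
    # ``Butler.get`` rather than on the datastore itself.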

    def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the
            dataset artifact (can be empty if there are no components).

        Notes
        -----
        The URIs returned for in-memory datastores are not usable but
        provide an indication of the associated dataset.
        """
        # Include the dataID as a URI query
        query = urlencode(ref.dataId)

        # If this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
            name = f"{ref.datasetType.name}"
            fragment = "#predicted"
        else:
            realID, _ = self._get_dataset_info(ref)
            name = f"{id(self.datasets[realID])}?{query}"
            fragment = ""

        return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
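
    # Illustrative example: a predicted URI for an unstored dataset takes the
    # form ``mem://<datasetTypeName>?<urlencoded dataId>#predicted``, while a
    # stored dataset yields a URI built from ``id()`` of the in-memory object
    # plus the encoded dataId. Neither URI is retrievable; they only serve to
    # identify the dataset.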

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        AssertionError
            Raised if an internal error occurs.
        """
        primary, _ = self.getURIs(ref, predict)
        if primary is None:
            # This should be impossible since this datastore does
            # not disassemble. This check also helps mypy.
            raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
        return primary

    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
        destination: ResourcePath,
        transfer: str = "auto",
        preserve_path: bool = True,
        overwrite: Optional[bool] = False,
    ) -> List[ResourcePath]:
        """Retrieve the file artifacts associated with the supplied refs.

        Notes
        -----
        Not implemented by this datastore.
        """
        # Could conceivably launch a FileDatastore to use formatters to write
        # the data but this is fraught with problems.
        raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")

    def forget(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited.
        refs = list(refs)
        self._bridge.forget(refs)
        for ref in refs:
            self.removeStoredItemInfo(ref)

    @transactional
    def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None:
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef` or iterable thereof
            Reference to the required Dataset(s).
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Only relevant
            if a single dataset ref is given.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        if not isinstance(ref, DatasetRef):
            log.debug("Bulk trashing of datasets in datastore %s", self.name)
            self.bridge.moveToTrash(ref, transaction=self._transaction)
            return

        log.debug("Trash %s in datastore %s", ref, self.name)

        # Check that this dataset is known to datastore
        try:
            self._get_dataset_info(ref)

            # Move datasets to trash table
            self.bridge.moveToTrash([ref], transaction=self._transaction)
        except Exception as e:
            if ignore_errors:
                log.warning(
                    "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
                )
            else:
                raise

    def emptyTrash(self, ignore_errors: bool = False) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        The internal tracking of datasets is affected by this method and
        transaction handling is not supported if there is a problem before
        the datasets themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        with self._bridge.emptyTrash() as trash_data:
            trashed, _ = trash_data
            for ref, _ in trashed:
                try:
                    realID, _ = self._get_dataset_info(ref)
                except FileNotFoundError:
                    # Dataset already removed so ignore it
                    continue
                except Exception as e:
                    if ignore_errors:
                        log.warning(
                            "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
                            self.name,
                            ref.id,
                            e,
                        )
                        continue
                    else:
                        raise

                # Determine whether all references to this dataset have been
                # removed and we can delete the dataset itself
                allRefs = self.related[realID]
                remainingRefs = allRefs - {ref.id}
                if not remainingRefs:
                    log.debug("Removing artifact %s from datastore %s", realID, self.name)
                    del self.datasets[realID]

                # Remove this entry
                self.removeStoredItemInfo(ref)
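
    # Illustrative lifecycle note: removal is two-phase. ``trash(ref)`` only
    # records the dataset in the registry bridge's trash table;
    # ``emptyTrash()`` then drops the Python object from ``self.datasets``
    # once no other recorded component still refers to the same parent ID.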

    def validateConfiguration(
        self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
    ) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring is inherited from base class
        return transfer

    def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class
        return

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class
        return self.constraints.getLookupKeys()

    def needs_expanded_data_ids(
        self,
        transfer: Optional[str],
        entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
    ) -> bool:
        # Docstring inherited.
        return False

    def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
        # Docstring inherited from the base class.
        return

    def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
        # Docstring inherited from the base class.

        # In-memory Datastore records cannot be exported or imported
        return {}