# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""In-memory datastore."""

from __future__ import annotations

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import logging
import time
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from urllib.parse import urlencode

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetRefURIs,
    DatastoreRecordData,
    StorageClass,
    StoredDatastoreItemInfo,
)
from lsst.daf.butler.core.utils import transactional
from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
from lsst.resources import ResourcePath

from .genericDatastore import GenericBaseDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import Config, DatasetType, LookupKey
    from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    `DatasetRef`.
    """

    __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: DatasetId
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """

    dataset_id: DatasetId
    """DatasetId associated with this record."""


class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
    """

    defaultConfigFile = "datastores/inMemoryDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be `None` if no defaults are specified.
    """

    isEphemeral = True
    """A new datastore is created every time and datasets disappear when
    the process shuts down."""

    datasets: dict[DatasetId, Any]
    """Internal storage of datasets indexed by dataset ID."""

    records: dict[DatasetId, StoredMemoryItemInfo]
    """Internal records about stored datasets."""

    def __init__(
        self,
        config: Config | str,
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: str | None = None,
    ):
        super().__init__(config, bridgeManager)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = f"{type(self).__name__}@{time.time()}"
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets: dict[DatasetId, Any] = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records: dict[DatasetId, StoredMemoryItemInfo] = {}

        # Related records that share the same parent
        self.related: dict[DatasetId, set[DatasetId]] = {}

        self._bridge = bridgeManager.register(self.name, ephemeral=True)
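
    # Internal bookkeeping sketch (illustrative): after a plain ``put`` of a
    # dataset with ID ``d``, the three mappings relate as:
    #
    #     self.datasets[d] -> the stored Python object
    #     self.records[d]  -> StoredMemoryItemInfo(parentID=d, ...)
    #     self.related[d]  -> {d}  (grows as component records share a parent)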

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        # Docstring inherited from GenericBaseDatastore.
        return self._bridge

    def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos, strict=True):
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
        # Docstring inherited from GenericBaseDatastore.
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredMemoryItemInfo]:
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this,
        # so we ignore it here as well.
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)

    def _get_dataset_info(self, ref: DatasetIdRef) -> tuple[DatasetId, StoredMemoryItemInfo]:
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `DatasetId`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def knows(self, ref: DatasetRef) -> bool:
        """Check if the dataset is known to the datastore.

        This datastore does not distinguish dataset existence from knowledge
        of a dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the dataset is known to the datastore.
        """
        return self.exists(ref)

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True
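
    # Usage sketch (illustrative only): ``knows`` simply delegates to
    # ``exists``, so for any resolved `DatasetRef` that was previously put:
    #
    #     assert datastore.knows(ref) == datastore.exists(ref)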

    def get(
        self,
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        realID, storedItemInfo = self._get_dataset_info(ref)

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites or if overridden.
        if storageClass is not None:
            ref = ref.overrideStorageClass(storageClass)
        refStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedItemInfo.storageClass

        component = ref.datasetType.component()

        # Check that the supplied parameters are suitable for the type read.
        # If this is a derived component we validate against the composite.
        isDerivedComponent = False
        if component in writeStorageClass.derivedComponents:
            writeStorageClass.validateParameters(parameters)
            isDerivedComponent = True
        else:
            refStorageClass.validateParameters(parameters)

        inMemoryDataset = self.datasets[realID]

        # If this is a derived component we need to apply parameters
        # before we retrieve the component. We assume that the parameters
        # will affect the data globally, before the derived component
        # is selected.
        if isDerivedComponent:
            inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
            # Then disable parameters for later
            parameters = {}

        # Check if we have a component.
        if component:
            # In-memory datastore must have stored the dataset as a single
            # object in the write storage class. We therefore use that
            # storage class delegate to obtain the component.
            inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)

        # Since there is no formatter to process parameters, they all must be
        # passed to the assembler.
        inMemoryDataset = self._post_process_get(
            inMemoryDataset, refStorageClass, parameters, isComponent=component is not None
        )

        # Last-minute type conversion.
        return refStorageClass.coerce_type(inMemoryDataset)
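
    # Usage sketch (illustrative only; the storage class name below is
    # hypothetical and depends on repository configuration). A read storage
    # class override returns a compatible alternative Python type:
    #
    #     data = datastore.get(ref)
    #     as_dict = datastore.get(ref, storageClass="StructuredDataDict")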

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        # May need to coerce the in-memory dataset to the correct
        # Python type, otherwise parameters may not work.
        inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)

        self._validate_put_parameters(inMemoryDataset, ref)

        self.datasets[ref.id] = inMemoryDataset
        log.debug("Store %s in %s", ref, self.name)

        # Store the time we received this content, to allow us to optionally
        # expire it. Instead of storing a filename here, we include the
        # ID of this datasetRef so we can find it from components.
        itemInfo = StoredMemoryItemInfo(
            time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.id
        )

        # We have to register this content with registry.
        # Currently this assumes we have a file so we need to use stub entries
        # TODO: Add to ephemeral part of registry
        self._register_datasets([(ref, itemInfo)])

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)
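
    # Usage sketch (illustrative only): a put/get round trip, assuming
    # ``ref`` is a resolved `DatasetRef` whose storage class accepts a dict:
    #
    #     datastore.put({"a": 1}, ref)
    #     assert datastore.exists(ref)
    #     retrieved = datastore.get(ref)  # typically the same object back,
    #                                     # since nothing is serialized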

    def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the
            dataset artifact (can be empty if there are no components).

        Notes
        -----
        The URIs returned for in-memory datastores are not usable but
        provide an indication of the associated dataset.
        """
        # Include the dataId as a URI query
        query = urlencode(ref.dataId)

        # If this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError(f"Dataset {ref} not in this datastore")
            name = f"{ref.datasetType.name}"
            fragment = "#predicted"
        else:
            realID, _ = self._get_dataset_info(ref)
            # The query is appended once below, so the name is just the
            # id() of the stored object.
            name = f"{id(self.datasets[realID])}"
            fragment = ""

        return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
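
    # The resulting primary URI has the shape (illustrative values):
    #
    #     mem://140234567890123?instrument=HSC&visit=903334
    #
    # for a stored dataset (the authority is the ``id()`` of the Python
    # object), or ``mem://<datasetTypeName>?...#predicted`` when predicting.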

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        AssertionError
            Raised if an internal error occurs.
        """
        primary, _ = self.getURIs(ref, predict)
        if primary is None:
            # This should be impossible since this datastore does
            # not disassemble. This check also helps mypy.
            raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
        return primary

    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
        destination: ResourcePath,
        transfer: str = "auto",
        preserve_path: bool = True,
        overwrite: bool | None = False,
    ) -> list[ResourcePath]:
        """Retrieve the file artifacts associated with the supplied refs.

        Notes
        -----
        Not implemented by this datastore.
        """
        # Could conceivably launch a FileDatastore to use formatters to write
        # the data but this is fraught with problems.
        raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")

    def forget(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited.
        refs = list(refs)
        self._bridge.forget(refs)
        for ref in refs:
            self.removeStoredItemInfo(ref)

    @transactional
    def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None:
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef` or iterable thereof
            Reference to the required Dataset(s).
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Only relevant
            if a single dataset ref is given.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        if not isinstance(ref, DatasetRef):
            log.debug("Bulk trashing of datasets in datastore %s", self.name)
            self.bridge.moveToTrash(ref, transaction=self._transaction)
            return

        log.debug("Trash %s in datastore %s", ref, self.name)

        # Check that this dataset is known to datastore
        try:
            self._get_dataset_info(ref)

            # Move datasets to trash table
            self.bridge.moveToTrash([ref], transaction=self._transaction)
        except Exception as e:
            if ignore_errors:
                log.warning(
                    "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
                )
            else:
                raise

    def emptyTrash(self, ignore_errors: bool = False) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        The internal tracking of datasets is affected by this method and
        transaction handling is not supported if there is a problem before
        the datasets themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        with self._bridge.emptyTrash() as trash_data:
            trashed, _ = trash_data
            for ref, _ in trashed:
                try:
                    realID, _ = self._get_dataset_info(ref)
                except FileNotFoundError:
                    # Dataset already removed so ignore it
                    continue
                except Exception as e:
                    if ignore_errors:
                        log.warning(
                            "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
                            self.name,
                            ref.id,
                            e,
                        )
                        continue
                    else:
                        raise

                # Determine whether all references to this dataset have been
                # removed and we can delete the dataset itself
                allRefs = self.related[realID]
                remainingRefs = allRefs - {ref.id}
                if not remainingRefs:
                    log.debug("Removing artifact %s from datastore %s", realID, self.name)
                    del self.datasets[realID]

                # Remove this entry
                self.removeStoredItemInfo(ref)
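
    # Usage sketch (illustrative only): deletion is two-phase, mirroring the
    # other datastores. Assuming ``ref`` was previously put:
    #
    #     datastore.trash(ref)    # mark for removal via the registry bridge
    #     datastore.emptyTrash()  # actually drop the stored Python object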

    def validateConfiguration(
        self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
    ) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
        # Docstring is inherited from base class
        return transfer

    def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
        # Docstring is inherited from base class
        return

    def getLookupKeys(self) -> set[LookupKey]:
        # Docstring is inherited from base class
        return self.constraints.getLookupKeys()

    def needs_expanded_data_ids(
        self,
        transfer: str | None,
        entity: DatasetRef | DatasetType | StorageClass | None = None,
    ) -> bool:
        # Docstring inherited.
        return False

    def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
        # Docstring inherited from the base class.
        return

    def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
        # Docstring inherited from the base class.

        # In-memory Datastore records cannot be exported or imported.
        return {}