Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 93%
182 statements
coverage.py v7.3.2, created at 2023-10-25 15:13 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""In-memory datastore."""
24from __future__ import annotations
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import logging
29import time
30from collections.abc import Iterable, Mapping
31from dataclasses import dataclass
32from typing import TYPE_CHECKING, Any
33from urllib.parse import urlencode
35from lsst.daf.butler import (
36 DatasetId,
37 DatasetRef,
38 DatasetRefURIs,
39 DatastoreRecordData,
40 StorageClass,
41 StoredDatastoreItemInfo,
42)
43from lsst.daf.butler.core.utils import transactional
44from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
45from lsst.resources import ResourcePath
47from ..registry.interfaces import DatabaseInsertMode
48from .genericDatastore import GenericBaseDatastore
50if TYPE_CHECKING:
51 from lsst.daf.butler import Config, DatasetType, LookupKey
52 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
54log = logging.getLogger(__name__)
57@dataclass(frozen=True)
58class StoredMemoryItemInfo(StoredDatastoreItemInfo):
59 """Internal InMemoryDatastore Metadata associated with a stored
60 DatasetRef.
61 """
63 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}
65 timestamp: float
66 """Unix timestamp indicating the time the dataset was stored."""
68 storageClass: StorageClass
69 """StorageClass associated with the dataset."""
71 parentID: DatasetId
72 """ID of the parent `DatasetRef` if this entry is a concrete
73 composite. Not used if the dataset being stored is not a
74 virtual component of a composite.
75 """
77 dataset_id: DatasetId
78 """DatasetId associated with this record."""
81class InMemoryDatastore(GenericBaseDatastore):
82 """Basic Datastore for writing to an in memory cache.
84 This datastore is ephemeral in that the contents of the datastore
85 disappear when the Python process completes. This also means that
86 other processes cannot access this datastore.
88 Parameters
89 ----------
90 config : `DatastoreConfig` or `str`
91 Configuration.
92 bridgeManager : `DatastoreRegistryBridgeManager`
93 Object that manages the interface between `Registry` and datastores.
94 butlerRoot : `str`, optional
95 Unused parameter.
97 Notes
98 -----
99 InMemoryDatastore does not support any file-based ingest.
100 """
102 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
103 """Path to configuration defaults. Accessed within the ``configs`` resource
104 or relative to a search path. Can be None if no defaults specified.
105 """
107 isEphemeral = True
108 """A new datastore is created every time and datasets disappear when
109 the process shuts down."""
111 datasets: dict[DatasetId, Any]
112 """Internal storage of datasets indexed by dataset ID."""
114 records: dict[DatasetId, StoredMemoryItemInfo]
115 """Internal records about stored datasets."""
117 def __init__(
118 self,
119 config: Config | str,
120 bridgeManager: DatastoreRegistryBridgeManager,
121 butlerRoot: str | None = None,
122 ):
123 super().__init__(config, bridgeManager)
125 # Name ourselves with the timestamp at which the datastore
126 # was created.
127 self.name = f"{type(self).__name__}@{time.time()}"
128 log.debug("Creating datastore %s", self.name)
130 # Storage of datasets, keyed by dataset_id
131 self.datasets: dict[DatasetId, Any] = {}
133 # Records are kept distinct in order to track concrete composite
134 # components, where we register multiple components for a single dataset.
135 self.records: dict[DatasetId, StoredMemoryItemInfo] = {}
137 # Related records that share the same parent
138 self.related: dict[DatasetId, set[DatasetId]] = {}
140 self._bridge = bridgeManager.register(self.name, ephemeral=True)
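# Illustrative sketch: construction needs a configuration and a registry
# bridge manager; ``bridge_manager`` is an assumed name for a
# DatastoreRegistryBridgeManager obtained from an existing Butler/Registry
# setup.
#
#     datastore = InMemoryDatastore(
#         "datastores/inMemoryDatastore.yaml",   # or a DatastoreConfig instance
#         bridgeManager=bridge_manager,
#     )
#     print(datastore.name)   # e.g. "InMemoryDatastore@1698246809.12"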
142 @classmethod
143 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
144 """Set any filesystem-dependent config options for this Datastore to
145 be appropriate for a new empty repository with the given root.
147 Does nothing in this implementation.
149 Parameters
150 ----------
151 root : `str`
152 Filesystem path to the root of the data repository.
153 config : `Config`
154 A `Config` to update. Only the subset understood by
155 this component will be updated. Will not expand
156 defaults.
157 full : `Config`
158 A complete config with all defaults expanded that can be
159 converted to a `DatastoreConfig`. Read-only and will not be
160 modified by this method.
161 Repository-specific options that should not be obtained
162 from defaults when Butler instances are constructed
163 should be copied from ``full`` to ``config``.
164 overwrite : `bool`, optional
165 If `False`, do not modify a value in ``config`` if the value
166 already exists. Default is always to overwrite with the provided
167 ``root``.
169 Notes
170 -----
171 If a keyword is explicitly defined in the supplied ``config`` it
172 will not be overridden by this method if ``overwrite`` is `False`.
173 This allows explicit values set in external configs to be retained.
174 """
175 return
177 @property
178 def bridge(self) -> DatastoreRegistryBridge:
179 # Docstring inherited from GenericBaseDatastore.
180 return self._bridge
182 def addStoredItemInfo(
183 self,
184 refs: Iterable[DatasetRef],
185 infos: Iterable[StoredMemoryItemInfo],
186 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
187 ) -> None:
188 # Docstring inherited from GenericBaseDatastore.
189 for ref, info in zip(refs, infos, strict=True):
190 self.records[ref.id] = info
191 self.related.setdefault(info.parentID, set()).add(ref.id)
193 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
194 # Docstring inherited from GenericBaseDatastore.
195 return self.records[ref.id]
197 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredMemoryItemInfo]:
198 # Docstring inherited from GenericBaseDatastore.
199 return [self.getStoredItemInfo(ref)]
201 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
202 # Docstring inherited from GenericBaseDatastore.
203 # If a component has been removed previously then we can sometimes
204 # be asked to remove it again. Other datastores ignore this
205 # so also ignore here
206 if ref.id not in self.records:
207 return
208 record = self.records[ref.id]
209 del self.records[ref.id]
210 self.related[record.parentID].remove(ref.id)
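# Illustrative sketch: the three methods above keep ``self.records`` and
# ``self.related`` in step.  ``ref_a``, ``ref_b``, ``info_a`` and ``info_b``
# are assumed names, with both records sharing the same ``parentID``:
#
#     datastore.addStoredItemInfo([ref_a, ref_b], [info_a, info_b])
#     # records == {ref_a.id: info_a, ref_b.id: info_b}
#     # related[info_a.parentID] == {ref_a.id, ref_b.id}
#     datastore.removeStoredItemInfo(ref_a)
#     # related[info_a.parentID] == {ref_b.id}; removing ref_a again is a no-op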
212 def _get_dataset_info(self, ref: DatasetIdRef) -> tuple[DatasetId, StoredMemoryItemInfo]:
213 """Check that the dataset is present and return the real ID and
214 associated information.
216 Parameters
217 ----------
218 ref : `DatasetRef`
218 Target `DatasetRef`.
221 Returns
222 -------
223 realID : `DatasetId`
224 The dataset ID associated with this ref that should be used. This
225 could either be the ID of the supplied `DatasetRef` or the parent.
226 storageInfo : `StoredMemoryItemInfo`
227 Associated storage information.
229 Raises
230 ------
231 FileNotFoundError
232 Raised if the dataset is not present in this datastore.
233 """
234 try:
235 storedItemInfo = self.getStoredItemInfo(ref)
236 except KeyError:
237 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
238 realID = ref.id
239 if storedItemInfo.parentID is not None:  # branch 239 ↛ 242 never taken (condition was never false)
240 realID = storedItemInfo.parentID
242 if realID not in self.datasets:  # branch 242 ↛ 243 never taken (condition was never true)
243 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
245 return realID, storedItemInfo
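# Illustrative sketch: for a stored dataset the returned ID follows the
# record's ``parentID``, so a component ref of a concrete composite resolves
# to the composite's ID, while an unstored dataset raises FileNotFoundError.
# ``ref`` is assumed to have been stored already:
#
#     real_id, info = datastore._get_dataset_info(ref)
#     assert real_id == info.parentID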
247 def knows(self, ref: DatasetRef) -> bool:
248 """Check if the dataset is known to the datastore.
250 This datastore does not distinguish dataset existence from knowledge
251 of a dataset.
253 Parameters
254 ----------
255 ref : `DatasetRef`
256 Reference to the required dataset.
258 Returns
259 -------
260 exists : `bool`
261 `True` if the dataset is known to the datastore.
262 """
263 return self.exists(ref)
265 def exists(self, ref: DatasetRef) -> bool:
266 """Check if the dataset exists in the datastore.
268 Parameters
269 ----------
270 ref : `DatasetRef`
271 Reference to the required dataset.
273 Returns
274 -------
275 exists : `bool`
276 `True` if the entity exists in the `Datastore`.
277 """
278 try:
279 self._get_dataset_info(ref)
280 except FileNotFoundError:
281 return False
282 return True
284 def get(
285 self,
286 ref: DatasetRef,
287 parameters: Mapping[str, Any] | None = None,
288 storageClass: StorageClass | str | None = None,
289 ) -> Any:
290 """Load an InMemoryDataset from the store.
292 Parameters
293 ----------
294 ref : `DatasetRef`
295 Reference to the required Dataset.
296 parameters : `dict`
297 `StorageClass`-specific parameters that specify, for example,
298 a slice of the dataset to be loaded.
299 storageClass : `StorageClass` or `str`, optional
300 The storage class to be used to override the Python type
301 returned by this method. By default the returned type matches
302 the dataset type definition for this dataset. Specifying a
303 read `StorageClass` can force a different type to be returned.
304 This type must be compatible with the original type.
306 Returns
307 -------
308 inMemoryDataset : `object`
309 Requested dataset or slice thereof as an InMemoryDataset.
311 Raises
312 ------
313 FileNotFoundError
314 Requested dataset cannot be retrieved.
315 TypeError
316 Return value from formatter has unexpected type.
317 ValueError
318 Formatter failed to process the dataset.
319 """
320 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
322 realID, storedItemInfo = self._get_dataset_info(ref)
324 # We have a write storage class and a read storage class and they
325 # can be different for concrete composites or if overridden.
326 if storageClass is not None:
327 ref = ref.overrideStorageClass(storageClass)
328 refStorageClass = ref.datasetType.storageClass
329 writeStorageClass = storedItemInfo.storageClass
331 component = ref.datasetType.component()
333 # Check that the supplied parameters are suitable for the type read
334 # If this is a derived component we validate against the composite
335 isDerivedComponent = False
336 if component in writeStorageClass.derivedComponents:
337 writeStorageClass.validateParameters(parameters)
338 isDerivedComponent = True
339 else:
340 refStorageClass.validateParameters(parameters)
342 inMemoryDataset = self.datasets[realID]
344 # If this is a derived component we need to apply parameters
345 # before we retrieve the component. We assume that the parameters
346 # will affect the data globally, before the derived component
347 # is selected.
348 if isDerivedComponent:
349 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
350 # Then disable parameters for later
351 parameters = {}
353 # Check if we have a component.
354 if component:
355 # In-memory datastore must have stored the dataset as a single
356 # object in the write storage class. We therefore use that
357 # storage class delegate to obtain the component.
358 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
360 # Since there is no formatter to process parameters, they all must be
361 # passed to the assembler.
362 inMemoryDataset = self._post_process_get(
363 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None
364 )
366 # Last minute type conversion.
367 return refStorageClass.coerce_type(inMemoryDataset)
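# Illustrative sketch: assuming ``ref`` was stored earlier with ``put()``,
# parameters and a read storage class override can be supplied on read.  The
# parameter name and the "StructuredDataDict" storage class below are
# illustrative; valid values depend on the dataset type's storage class.
#
#     subset = datastore.get(ref, parameters={"slice": slice(0, 10)})
#     as_dict = datastore.get(ref, storageClass="StructuredDataDict")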
369 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
370 """Write a InMemoryDataset with a given `DatasetRef` to the store.
372 Parameters
373 ----------
374 inMemoryDataset : `object`
375 The dataset to store.
376 ref : `DatasetRef`
377 Reference to the associated Dataset.
379 Raises
380 ------
381 TypeError
382 Supplied object and storage class are inconsistent.
383 DatasetTypeNotSupportedError
384 The associated `DatasetType` is not handled by this datastore.
386 Notes
387 -----
388 If the datastore is configured to reject certain dataset types it
389 is possible that the put will fail and raise a
390 `DatasetTypeNotSupportedError`. The main use case for this is to
391 allow `ChainedDatastore` to put to multiple datastores without
392 requiring that every datastore accepts the dataset.
393 """
394 # May need to coerce the in memory dataset to the correct
395 # python type, otherwise parameters may not work.
396 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
398 self._validate_put_parameters(inMemoryDataset, ref)
400 self.datasets[ref.id] = inMemoryDataset
401 log.debug("Store %s in %s", ref, self.name)
403 # Store time we received this content, to allow us to optionally
404 # expire it. Instead of storing a filename here, we include the
405 # ID of this datasetRef so we can find it from components.
406 itemInfo = StoredMemoryItemInfo(
407 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.id
408 )
410 # We have to register this content with registry.
411 # Currently this assumes we have a file so we need to use stub entries
412 # TODO: Add to ephemeral part of registry
413 self._register_datasets([(ref, itemInfo)])
415 if self._transaction is not None:
416 self._transaction.registerUndo("put", self.remove, ref)
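# Illustrative sketch: a basic round trip, assuming ``catalog`` is an object
# matching the storage class declared by ``ref.datasetType``:
#
#     datastore.put(catalog, ref)
#     assert datastore.knows(ref) and datastore.exists(ref)
#     again = datastore.get(ref)   # same object, subject to type coercion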
418 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
419 """Return URIs associated with dataset.
421 Parameters
422 ----------
423 ref : `DatasetRef`
424 Reference to the required dataset.
425 predict : `bool`, optional
426 If the datastore does not know about the dataset, should it
427 return a predicted URI or not?
429 Returns
430 -------
431 uris : `DatasetRefURIs`
432 The URI to the primary artifact associated with this dataset (if
433 the dataset was disassembled within the datastore this may be
434 `None`), and the URIs to any components associated with the dataset
435 artifact (can be empty if there are no components).
437 Notes
438 -----
439 The URIs returned for in-memory datastores are not usable but
440 provide an indication of the associated dataset.
441 """
442 # Include the dataID as a URI query
443 query = urlencode(ref.dataId)
445 # if this has never been written then we have to guess
446 if not self.exists(ref):
447 if not predict:
448 raise FileNotFoundError(f"Dataset {ref} not in this datastore")
449 name = f"{ref.datasetType.name}"
450 fragment = "#predicted"
451 else:
452 realID, _ = self._get_dataset_info(ref)
453 name = f"{id(self.datasets[realID])}?{query}"
454 fragment = ""
456 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
458 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
459 """URI to the Dataset.
461 Always uses "mem://" URI prefix.
463 Parameters
464 ----------
465 ref : `DatasetRef`
466 Reference to the required Dataset.
467 predict : `bool`
468 If `True`, allow URIs to be returned of datasets that have not
469 been written.
471 Returns
472 -------
473 uri : `lsst.resources.ResourcePath`
474 URI pointing to the dataset within the datastore. If the
475 dataset does not exist in the datastore, and if ``predict`` is
476 `True`, the URI will be a prediction and will include a URI
477 fragment "#predicted".
478 If the datastore does not have entities that relate well
479 to the concept of a URI, the returned URI string will be
480 descriptive. The returned URI is not guaranteed to be obtainable.
482 Raises
483 ------
484 FileNotFoundError
485 A URI has been requested for a dataset that does not exist and
486 guessing is not allowed.
487 AssertionError
488 Raised if an internal error occurs.
489 """
490 primary, _ = self.getURIs(ref, predict)
491 if primary is None:
492 # This should be impossible since this datastore does
493 # not disassemble. This check also helps mypy.
494 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
495 return primary
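# Illustrative sketch: the returned URIs use the "mem://" scheme and are
# descriptive only, not retrievable.  ``missing_ref`` is an assumed name for
# a ref that has not been stored.
#
#     uri = datastore.getURI(ref)   # "mem://..." built from the object id and data ID
#     predicted = datastore.getURI(missing_ref, predict=True)
#     # str(predicted) ends with "#predicted"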
497 def retrieveArtifacts(
498 self,
499 refs: Iterable[DatasetRef],
500 destination: ResourcePath,
501 transfer: str = "auto",
502 preserve_path: bool = True,
503 overwrite: bool | None = False,
504 ) -> list[ResourcePath]:
505 """Retrieve the file artifacts associated with the supplied refs.
507 Notes
508 -----
509 Not implemented by this datastore.
510 """
511 # Could conceivably launch a FileDatastore to use formatters to write
512 # the data but this is fraught with problems.
513 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
515 def forget(self, refs: Iterable[DatasetRef]) -> None:
516 # Docstring inherited.
517 refs = list(refs)
518 self._bridge.forget(refs)
519 for ref in refs:
520 self.removeStoredItemInfo(ref)
522 @transactional
523 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None:
524 """Indicate to the Datastore that a dataset can be removed.
526 Parameters
527 ----------
528 ref : `DatasetRef` or iterable thereof
529 Reference to the required Dataset(s).
530 ignore_errors : `bool`, optional
531 Indicate that errors should be ignored.
533 Raises
534 ------
535 FileNotFoundError
536 Attempt to remove a dataset that does not exist. Only relevant
537 if a single dataset ref is given.
539 Notes
540 -----
541 Concurrency should not normally be an issue for the in-memory datastore
542 since all internal changes are isolated solely to this process and
543 the registry only changes rows associated with this process.
544 """
545 if not isinstance(ref, DatasetRef):
546 log.debug("Bulk trashing of datasets in datastore %s", self.name)
547 self.bridge.moveToTrash(ref, transaction=self._transaction)
548 return
550 log.debug("Trash %s in datastore %s", ref, self.name)
552 # Check that this dataset is known to datastore
553 try:
554 self._get_dataset_info(ref)
556 # Move datasets to trash table
557 self.bridge.moveToTrash([ref], transaction=self._transaction)
558 except Exception as e:
559 if ignore_errors:  # branch 559 ↛ 560 never taken (condition was never true)
560 log.warning(
561 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
562 )
563 else:
564 raise
566 def emptyTrash(self, ignore_errors: bool = False) -> None:
567 """Remove all datasets from the trash.
569 Parameters
570 ----------
571 ignore_errors : `bool`, optional
572 Ignore errors.
574 Notes
575 -----
576 The internal tracking of datasets is affected by this method and
577 transaction handling is not supported if there is a problem before
578 the datasets themselves are deleted.
580 Concurrency should not normally be an issue for the in-memory datastore
581 since all internal changes are isolated solely to this process and
582 the registry only changes rows associated with this process.
583 """
584 log.debug("Emptying trash in datastore %s", self.name)
585 with self._bridge.emptyTrash() as trash_data:
586 trashed, _ = trash_data
587 for ref, _ in trashed:
588 try:
589 realID, _ = self._get_dataset_info(ref)
590 except FileNotFoundError:  # branch 590 ↛ 593 never taken
591 # Dataset already removed so ignore it
592 continue
593 except Exception as e:
594 if ignore_errors:
595 log.warning(
596 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
597 self.name,
598 ref.id,
599 e,
600 )
601 continue
602 else:
603 raise
605 # Determine whether all references to this dataset have been
606 # removed and we can delete the dataset itself
607 allRefs = self.related[realID]
608 remainingRefs = allRefs - {ref.id}
609 if not remainingRefs:  # branch 609 ↛ 614 never taken (condition was never false)
610 log.debug("Removing artifact %s from datastore %s", realID, self.name)
611 del self.datasets[realID]
613 # Remove this entry
614 self.removeStoredItemInfo(ref)
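# Illustrative sketch: removal is a two-step process; ``trash()`` marks
# datasets via the registry bridge and ``emptyTrash()`` drops the cached
# objects.
#
#     datastore.trash(ref)      # or an iterable of refs for bulk trashing
#     datastore.emptyTrash()
#     assert not datastore.exists(ref)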
616 def validateConfiguration(
617 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
618 ) -> None:
619 """Validate some of the configuration for this datastore.
621 Parameters
622 ----------
623 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
624 Entities to test against this configuration. Can be differing
625 types.
626 logFailures : `bool`, optional
627 If `True`, output a log message for every validation error
628 detected.
630 Raises
631 ------
632 DatastoreValidationError
633 Raised if there is a validation problem with a configuration.
634 All the problems are reported in a single exception.
636 Notes
637 -----
638 This method is a no-op.
639 """
640 return
642 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
643 # Docstring is inherited from base class
644 return transfer
646 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
647 # Docstring is inherited from base class
648 return
650 def getLookupKeys(self) -> set[LookupKey]:
651 # Docstring is inherited from base class
652 return self.constraints.getLookupKeys()
654 def needs_expanded_data_ids(
655 self,
656 transfer: str | None,
657 entity: DatasetRef | DatasetType | StorageClass | None = None,
658 ) -> bool:
659 # Docstring inherited.
660 return False
662 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
663 # Docstring inherited from the base class.
664 return
666 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
667 # Docstring inherited from the base class.
669 # In-memory Datastore records cannot be exported or imported
670 return {}