Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 92%
192 statements
coverage.py v7.5.0, created at 2024-04-25 10:23 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""In-memory datastore."""
30from __future__ import annotations
32__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
34import logging
35import time
36from collections.abc import Iterable, Mapping
37from dataclasses import dataclass
38from typing import TYPE_CHECKING, Any
39from urllib.parse import urlencode
41from lsst.daf.butler import DatasetId, DatasetRef, StorageClass
42from lsst.daf.butler.datastore import DatasetRefURIs, DatastoreConfig
43from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore, post_process_get
44from lsst.daf.butler.datastore.record_data import DatastoreRecordData
45from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo
46from lsst.daf.butler.utils import transactional
47from lsst.resources import ResourcePath, ResourcePathExpression
49if TYPE_CHECKING:
50 from lsst.daf.butler import Config, DatasetType, LookupKey
51 from lsst.daf.butler.datastore import DatastoreOpaqueTable
52 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
54log = logging.getLogger(__name__)
57@dataclass(frozen=True, slots=True)
58class StoredMemoryItemInfo(StoredDatastoreItemInfo):
59 """Internal InMemoryDatastore Metadata associated with a stored
60 DatasetRef.
61 """
63 timestamp: float
64 """Unix timestamp indicating the time the dataset was stored."""
66 storageClass: StorageClass
67 """StorageClass associated with the dataset."""
69 parentID: DatasetId
70 """ID of the parent `DatasetRef` if this entry is a concrete
71 composite. Not used if the dataset being stored is not a
72 virtual component of a composite.
73 """
76class InMemoryDatastore(GenericBaseDatastore[StoredMemoryItemInfo]):
77 """Basic Datastore for writing to an in memory cache.
79 This datastore is ephemeral in that the contents of the datastore
80 disappear when the Python process completes. This also means that
81 other processes cannot access this datastore.
83 Parameters
84 ----------
85 config : `DatastoreConfig` or `str`
86 Configuration.
87 bridgeManager : `DatastoreRegistryBridgeManager`
88 Object that manages the interface between `Registry` and datastores.
90 Notes
91 -----
92 InMemoryDatastore does not support any file-based ingest.
93 """
95 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
96 """Path to configuration defaults. Accessed within the ``configs`` resource
97 or relative to a search path. Can be None if no defaults specified.
98 """
100 isEphemeral = True
101 """A new datastore is created every time and datasets disappear when
102 the process shuts down."""
104 datasets: dict[DatasetId, Any]
105 """Internal storage of datasets indexed by dataset ID."""
107 records: dict[DatasetId, StoredMemoryItemInfo]
108 """Internal records about stored datasets."""
110 def __init__(
111 self,
112 config: DatastoreConfig,
113 bridgeManager: DatastoreRegistryBridgeManager,
114 ):
115 super().__init__(config, bridgeManager)
117 # Name ourselves with the timestamp at which the datastore
118 # was created.
119 self.name = f"{type(self).__name__}@{time.time()}"
120 log.debug("Creating datastore %s", self.name)
122 # Storage of datasets, keyed by dataset_id
123 self.datasets: dict[DatasetId, Any] = {}
125 # The records dict is kept distinct in order to track concrete composite
126 # components, where we register multiple components for a single dataset.
127 self.records: dict[DatasetId, StoredMemoryItemInfo] = {}
129 # Related records that share the same parent
130 self.related: dict[DatasetId, set[DatasetId]] = {}
132 self._trashedIds: set[DatasetId] = set()
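    # Bookkeeping sketch (illustrative values only, not produced by this
    # module): after a successful ``put(obj, ref)`` the three mappings hold
    #
    #   self.datasets == {ref.id: obj}
    #   self.records  == {ref.id: StoredMemoryItemInfo(..., parentID=ref.id)}
    #   self.related  == {ref.id: {ref.id}}
    #
    # so component records registered later against the same parent share a
    # single ``related`` entry keyed by the parent ID.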
134 @classmethod
135 def _create_from_config(
136 cls,
137 config: DatastoreConfig,
138 bridgeManager: DatastoreRegistryBridgeManager,
139 butlerRoot: ResourcePathExpression | None,
140 ) -> InMemoryDatastore:
141 return InMemoryDatastore(config, bridgeManager)
143 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> InMemoryDatastore:
144 clone = InMemoryDatastore(self.config, bridgeManager)
145 # Sharing these objects is not thread-safe, but this class is only used
146 # in single-threaded test code.
147 clone.datasets = self.datasets
148 clone.records = self.records
149 clone.related = self.related
150 clone._trashedIds = self._trashedIds
151 return clone
153 @classmethod
154 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
155 """Set any filesystem-dependent config options for this Datastore to
156 be appropriate for a new empty repository with the given root.
158 Does nothing in this implementation.
160 Parameters
161 ----------
162 root : `str`
163 Filesystem path to the root of the data repository.
164 config : `Config`
165 A `Config` to update. Only the subset understood by
166 this component will be updated. Will not expand
167 defaults.
168 full : `Config`
169 A complete config with all defaults expanded that can be
170 converted to a `DatastoreConfig`. Read-only and will not be
171 modified by this method.
172 Repository-specific options that should not be obtained
173 from defaults when Butler instances are constructed
174 should be copied from ``full`` to ``config``.
175 overwrite : `bool`, optional
176 If `False`, do not modify a value in ``config`` if the value
177 already exists. Default is always to overwrite with the provided
178 ``root``.
180 Notes
181 -----
182 If a keyword is explicitly defined in the supplied ``config`` it
183 will not be overridden by this method if ``overwrite`` is `False`.
184 This allows explicit values set in external configs to be retained.
185 """
186 return
188 def _get_stored_item_info(self, dataset_id: DatasetId) -> StoredMemoryItemInfo:
189 # Docstring inherited from GenericBaseDatastore.
190 return self.records[dataset_id]
192 def _remove_stored_item_info(self, dataset_id: DatasetId) -> None:
193 # Docstring inherited from GenericBaseDatastore.
194 # If a component has been removed previously then we can sometimes
195 # be asked to remove it again. Other datastores ignore this,
196 # so we ignore it here as well.
197 if dataset_id not in self.records:
198 return
199 record = self.records[dataset_id]
200 del self.records[dataset_id]
201 self.related[record.parentID].remove(dataset_id)
203 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
204 """Remove information about the file associated with this dataset.
206 Parameters
207 ----------
208 ref : `DatasetRef`
209 The dataset that has been removed.
211 Notes
212 -----
213 This method is actually not used by this implementation, but there are
214 some tests that check that this method works, so we keep it for now.
215 """
216 self._remove_stored_item_info(ref.id)
218 def _get_dataset_info(self, dataset_id: DatasetId) -> tuple[DatasetId, StoredMemoryItemInfo]:
219 """Check that the dataset is present and return the real ID and
220 associated information.
222 Parameters
223 ----------
224 dataset_id : `DatasetId`
225 ID of the dataset to look up.
227 Returns
228 -------
229 realID : `DatasetId`
230 The dataset ID that should be used. This could either be the
231 supplied dataset ID or the ID of its parent composite.
232 storageInfo : `StoredMemoryItemInfo`
233 Associated storage information.
235 Raises
236 ------
237 FileNotFoundError
238 Raised if the dataset is not present in this datastore.
239 """
240 try:
241 storedItemInfo = self._get_stored_item_info(dataset_id)
242 except KeyError:
243 raise FileNotFoundError(f"No such file dataset in memory: {dataset_id}") from None
244 realID = dataset_id
245 if storedItemInfo.parentID is not None:  # coverage 245 ↛ 248: the condition on line 245 was never false
246 realID = storedItemInfo.parentID
248 if realID not in self.datasets:  # coverage 248 ↛ 249: the condition on line 248 was never true
249 raise FileNotFoundError(f"No such file dataset in memory: {dataset_id}")
251 return realID, storedItemInfo
253 def knows(self, ref: DatasetRef) -> bool:
254 """Check if the dataset is known to the datastore.
256 This datastore does not distinguish dataset existence from knowledge
257 of a dataset.
259 Parameters
260 ----------
261 ref : `DatasetRef`
262 Reference to the required dataset.
264 Returns
265 -------
266 exists : `bool`
267 `True` if the dataset is known to the datastore.
268 """
269 return self.exists(ref)
271 def exists(self, ref: DatasetRef) -> bool:
272 """Check if the dataset exists in the datastore.
274 Parameters
275 ----------
276 ref : `DatasetRef`
277 Reference to the required dataset.
279 Returns
280 -------
281 exists : `bool`
282 `True` if the entity exists in the `Datastore`.
283 """
284 try:
285 self._get_dataset_info(ref.id)
286 except FileNotFoundError:
287 return False
288 return True
290 def get(
291 self,
292 ref: DatasetRef,
293 parameters: Mapping[str, Any] | None = None,
294 storageClass: StorageClass | str | None = None,
295 ) -> Any:
296 """Load an InMemoryDataset from the store.
298 Parameters
299 ----------
300 ref : `DatasetRef`
301 Reference to the required Dataset.
302 parameters : `dict`
303 `StorageClass`-specific parameters that specify, for example,
304 a slice of the dataset to be loaded.
305 storageClass : `StorageClass` or `str`, optional
306 The storage class to be used to override the Python type
307 returned by this method. By default the returned type matches
308 the dataset type definition for this dataset. Specifying a
309 read `StorageClass` can force a different type to be returned.
310 This type must be compatible with the original type.
312 Returns
313 -------
314 inMemoryDataset : `object`
315 Requested dataset or slice thereof as an InMemoryDataset.
317 Raises
318 ------
319 FileNotFoundError
320 Requested dataset cannot be retrieved.
321 TypeError
322 Return value from formatter has unexpected type.
323 ValueError
324 Formatter failed to process the dataset.
325 """
326 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
328 realID, storedItemInfo = self._get_dataset_info(ref.id)
330 # We have a write storage class and a read storage class and they
331 # can be different for concrete composites or if overridden.
332 if storageClass is not None:
333 ref = ref.overrideStorageClass(storageClass)
334 refStorageClass = ref.datasetType.storageClass
335 writeStorageClass = storedItemInfo.storageClass
337 component = ref.datasetType.component()
339 # Check that the supplied parameters are suitable for the type read
340 # If this is a derived component we validate against the composite
341 isDerivedComponent = False
342 if component in writeStorageClass.derivedComponents:
343 writeStorageClass.validateParameters(parameters)
344 isDerivedComponent = True
345 else:
346 refStorageClass.validateParameters(parameters)
348 inMemoryDataset = self.datasets[realID]
350 # If this is a derived (read-only) component we need to apply parameters
351 # before we retrieve the component. We assume that the parameters
352 # will affect the data globally, before the derived component
353 # is selected.
354 if isDerivedComponent:
355 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
356 # Then disable parameters for later
357 parameters = {}
359 # Check if we have a component.
360 if component:
361 # In-memory datastore must have stored the dataset as a single
362 # object in the write storage class. We therefore use that
363 # storage class delegate to obtain the component.
364 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
366 # Since there is no formatter to process parameters, they all must be
367 # passed to the assembler.
368 inMemoryDataset = post_process_get(
369 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None
370 )
372 # Last minute type conversion.
373 return refStorageClass.coerce_type(inMemoryDataset)
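    # Usage sketch (hypothetical caller code, not part of this module):
    # assuming ``ref`` resolves to a dataset stored here whose storage class
    # defines a "metadata" component,
    #
    #   meta = datastore.get(ref.makeComponentRef("metadata"))
    #   as_dict = datastore.get(ref, storageClass="StructuredDataDict")
    #
    # the first call extracts the component with the write storage class
    # delegate; the second coerces the stored object to a compatible read
    # type ("StructuredDataDict" is only an example name).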
375 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
376 """Write a InMemoryDataset with a given `DatasetRef` to the store.
378 Parameters
379 ----------
380 inMemoryDataset : `object`
381 The dataset to store.
382 ref : `DatasetRef`
383 Reference to the associated Dataset.
385 Raises
386 ------
387 TypeError
388 Supplied object and storage class are inconsistent.
389 DatasetTypeNotSupportedError
390 The associated `DatasetType` is not handled by this datastore.
392 Notes
393 -----
394 If the datastore is configured to reject certain dataset types it
395 is possible that the put will fail and raise a
396 `DatasetTypeNotSupportedError`. The main use case for this is to
397 allow `ChainedDatastore` to put to multiple datastores without
398 requiring that every datastore accepts the dataset.
399 """
400 # May need to coerce the in-memory dataset to the correct
401 # Python type, otherwise parameters may not work.
402 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
404 self._validate_put_parameters(inMemoryDataset, ref)
406 self.datasets[ref.id] = inMemoryDataset
407 log.debug("Store %s in %s", ref, self.name)
409 # Store the time we received this content, to allow us to optionally
410 # expire it. Instead of storing a filename here, we include the
411 # ID of this datasetRef so we can find it from components.
412 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, parentID=ref.id)
414 # We have to register this content with the registry.
415 # Currently this assumes we have a file, so we need to use stub entries.
416 self.records[ref.id] = itemInfo
417 self.related.setdefault(itemInfo.parentID, set()).add(ref.id)
419 if self._transaction is not None:
420 self._transaction.registerUndo("put", self.remove, ref)
422 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
423 # It is OK to call put() here because the registry does not populate
424 # bridges, as we return an empty dict from this method.
425 self.put(in_memory_dataset, ref)
426 # As ephemeral we return empty dict.
427 return {}
429 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
430 """Return URIs associated with dataset.
432 Parameters
433 ----------
434 ref : `DatasetRef`
435 Reference to the required dataset.
436 predict : `bool`, optional
437 If the datastore does not know about the dataset, controls whether
438 it should return a predicted URI or not.
440 Returns
441 -------
442 uris : `DatasetRefURIs`
443 The URI to the primary artifact associated with this dataset (if
444 the dataset was disassembled within the datastore this may be
445 `None`), and the URIs to any components associated with the dataset
446 artifact (which can be empty if there are no components).
448 Notes
449 -----
450 The URIs returned for in-memory datastores are not usable but
451 provide an indication of the associated dataset.
452 """
453 # Include the dataId as a URI query.
454 query = urlencode(ref.dataId.required)
456 # if this has never been written then we have to guess
457 if not self.exists(ref):
458 if not predict:
459 raise FileNotFoundError(f"Dataset {ref} not in this datastore")
460 name = f"{ref.datasetType.name}"
461 fragment = "#predicted"
462 else:
463 realID, _ = self._get_dataset_info(ref.id)
464 name = f"{id(self.datasets[realID])}?{query}"
465 fragment = ""
467 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
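    # Example of the URI shapes produced above (values are illustrative):
    # an existing dataset yields roughly
    # ``mem://<id(object)>?<urlencoded data ID>`` while a predicted URI for a
    # missing dataset uses the dataset type name and ends in ``#predicted``.
    # Neither URI can actually be dereferenced.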
469 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
470 """URI to the Dataset.
472 Always uses "mem://" URI prefix.
474 Parameters
475 ----------
476 ref : `DatasetRef`
477 Reference to the required Dataset.
478 predict : `bool`
479 If `True`, allow URIs to be returned of datasets that have not
480 been written.
482 Returns
483 -------
484 uri : `lsst.resources.ResourcePath`
485 URI pointing to the dataset within the datastore. If the
486 dataset does not exist in the datastore, and if ``predict`` is
487 `True`, the URI will be a prediction and will include a URI
488 fragment "#predicted".
489 If the datastore does not have entities that relate well
490 to the concept of a URI the returned URI string will be
491 descriptive. The returned URI is not guaranteed to be obtainable.
493 Raises
494 ------
495 FileNotFoundError
496 A URI has been requested for a dataset that does not exist and
497 guessing is not allowed.
498 AssertionError
499 Raised if an internal error occurs.
500 """
501 primary, _ = self.getURIs(ref, predict)
502 if primary is None:
503 # This should be impossible since this datastore does
504 # not disassemble. This check also helps mypy.
505 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
506 return primary
508 def retrieveArtifacts(
509 self,
510 refs: Iterable[DatasetRef],
511 destination: ResourcePath,
512 transfer: str = "auto",
513 preserve_path: bool = True,
514 overwrite: bool | None = False,
515 ) -> list[ResourcePath]:
516 """Retrieve the file artifacts associated with the supplied refs.
518 Parameters
519 ----------
520 refs : iterable of `DatasetRef`
521 The datasets for which artifacts are to be retrieved.
522 A single ref can result in multiple artifacts. The refs must
523 be resolved.
524 destination : `lsst.resources.ResourcePath`
525 Location to write the artifacts.
526 transfer : `str`, optional
527 Method to use to transfer the artifacts. Must be one of the options
528 supported by `lsst.resources.ResourcePath.transfer_from()`.
529 "move" is not allowed.
530 preserve_path : `bool`, optional
531 If `True` the full path of the artifact within the datastore
532 is preserved. If `False` the final file component of the path
533 is used.
534 overwrite : `bool`, optional
535 If `True` allow transfers to overwrite existing files at the
536 destination.
538 Notes
539 -----
540 Not implemented by this datastore.
541 """
542 # Could conceivably launch a FileDatastore to use formatters to write
543 # the data but this is fraught with problems.
544 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
546 def forget(self, refs: Iterable[DatasetRef]) -> None:
547 # Docstring inherited.
548 refs = list(refs)
549 for ref in refs:
550 self._remove_stored_item_info(ref.id)
552 @transactional
553 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None:
554 """Indicate to the Datastore that a dataset can be removed.
556 Parameters
557 ----------
558 ref : `DatasetRef` or iterable thereof
559 Reference to the required Dataset(s).
560 ignore_errors : `bool`, optional
561 Indicate that errors should be ignored.
563 Raises
564 ------
565 FileNotFoundError
566 Attempt to remove a dataset that does not exist. Only relevant
567 if a single dataset ref is given.
569 Notes
570 -----
571 Concurrency should not normally be an issue for the in-memory datastore
572 since all internal changes are isolated solely to this process and
573 the registry only changes rows associated with this process.
574 """
575 if isinstance(ref, DatasetRef):
576 # Check that this dataset is known to datastore
577 try:
578 self._get_dataset_info(ref.id)
579 except Exception as e:
580 if ignore_errors:  # coverage 580 ↛ 581: the condition on line 580 was never true
581 log.warning(
582 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
583 )
584 else:
585 raise
586 log.debug("Trash %s in datastore %s", ref, self.name)
587 ref_list = [ref]
588 else:
589 ref_list = list(ref)
590 log.debug("Bulk trashing of datasets in datastore %s", self.name)
592 def _rollbackMoveToTrash(refs: Iterable[DatasetIdRef]) -> None:
593 for ref in refs:  # coverage 593 ↛ exit: the loop on line 593 didn't complete
594 self._trashedIds.remove(ref.id)
596 assert self._transaction is not None, "Must be in transaction"
597 with self._transaction.undoWith(f"Trash {len(ref_list)} datasets", _rollbackMoveToTrash, ref_list):
598 self._trashedIds.update(ref.id for ref in ref_list)
600 def emptyTrash(self, ignore_errors: bool = False) -> None:
601 """Remove all datasets from the trash.
603 Parameters
604 ----------
605 ignore_errors : `bool`, optional
606 Ignore errors.
608 Notes
609 -----
610 The internal tracking of datasets is affected by this method and
611 transaction handling is not supported if there is a problem before
612 the datasets themselves are deleted.
614 Concurrency should not normally be an issue for the in-memory datastore
615 since all internal changes are isolated solely to this process and
616 the registry only changes rows associated with this process.
617 """
618 log.debug("Emptying trash in datastore %s", self.name)
620 for dataset_id in self._trashedIds:
621 try:
622 realID, _ = self._get_dataset_info(dataset_id)
623 except FileNotFoundError:  # coverage 623 ↛ 626: line 623 didn't jump to line 626
624 # Dataset already removed so ignore it
625 continue
626 except Exception as e:
627 if ignore_errors:
628 log.warning(
629 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
630 self.name,
631 dataset_id,
632 e,
633 )
634 continue
635 else:
636 raise
638 # Determine whether all references to this dataset have been
639 # removed and we can delete the dataset itself
640 allRefs = self.related[realID]
641 remainingRefs = allRefs - {dataset_id}
642 if not remainingRefs:  # coverage 642 ↛ 647: the condition on line 642 was never false
643 log.debug("Removing artifact %s from datastore %s", realID, self.name)
644 del self.datasets[realID]
646 # Remove this entry
647 self._remove_stored_item_info(dataset_id)
649 # Empty the trash table
650 self._trashedIds = set()
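    # Sketch of the two-step removal protocol (hypothetical caller code):
    #
    #   datastore.trash(ref)      # only records ref.id in self._trashedIds
    #   datastore.exists(ref)     # still True; nothing has been deleted yet
    #   datastore.emptyTrash()    # drops the records and the stored object
    #
    # ``forget()`` by contrast removes the stored item info immediately
    # without touching ``self.datasets``.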
652 def validateConfiguration(
653 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
654 ) -> None:
655 """Validate some of the configuration for this datastore.
657 Parameters
658 ----------
659 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
660 Entities to test against this configuration. Can be differing
661 types.
662 logFailures : `bool`, optional
663 If `True`, output a log message for every validation error
664 detected.
666 Raises
667 ------
668 DatastoreValidationError
669 Raised if there is a validation problem with a configuration.
670 All the problems are reported in a single exception.
672 Notes
673 -----
674 This method is a no-op.
675 """
676 return
678 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
679 # Docstring is inherited from base class
680 return transfer
682 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
683 # Docstring is inherited from base class
684 return
686 def getLookupKeys(self) -> set[LookupKey]:
687 # Docstring is inherited from base class
688 return self.constraints.getLookupKeys()
690 def needs_expanded_data_ids(
691 self,
692 transfer: str | None,
693 entity: DatasetRef | DatasetType | StorageClass | None = None,
694 ) -> bool:
695 # Docstring inherited.
696 return False
698 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
699 # Docstring inherited from the base class.
700 return
702 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
703 # Docstring inherited from the base class.
705 # In-memory Datastore records cannot be exported or imported
706 return {}
708 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
709 # Docstring inherited from the base class.
710 return {}