Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 92% (182 statements)
coverage.py v7.3.2, created at 2023-12-06 10:52 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""In-memory datastore."""
30from __future__ import annotations
32__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
34import logging
35import time
36from collections.abc import Iterable, Mapping
37from dataclasses import dataclass
38from typing import TYPE_CHECKING, Any
39from urllib.parse import urlencode
41from lsst.daf.butler import DatasetId, DatasetRef, StorageClass
42from lsst.daf.butler.datastore import DatasetRefURIs
43from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore
44from lsst.daf.butler.datastore.record_data import DatastoreRecordData
45from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo
46from lsst.daf.butler.utils import transactional
47from lsst.resources import ResourcePath
49if TYPE_CHECKING:
50 from lsst.daf.butler import Config, DatasetType, LookupKey
51 from lsst.daf.butler.datastore import DatastoreOpaqueTable
52 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
54log = logging.getLogger(__name__)


@dataclass(frozen=True, slots=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal metadata associated with a `DatasetRef` stored in an
    `InMemoryDatastore`.
    """

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: DatasetId
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """


class InMemoryDatastore(GenericBaseDatastore[StoredMemoryItemInfo]):
    """Basic datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
    """

    defaultConfigFile = "datastores/inMemoryDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be `None` if no defaults are specified.
    """

    isEphemeral = True
    """A new datastore is created every time and datasets disappear when
    the process shuts down."""

    datasets: dict[DatasetId, Any]
    """Internal storage of datasets indexed by dataset ID."""

    records: dict[DatasetId, StoredMemoryItemInfo]
    """Internal records about stored datasets."""

    def __init__(
        self,
        config: Config | str,
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: str | None = None,
    ):
        super().__init__(config, bridgeManager)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = f"{type(self).__name__}@{time.time()}"
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id.
        self.datasets: dict[DatasetId, Any] = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records: dict[DatasetId, StoredMemoryItemInfo] = {}

        # Related records that share the same parent.
        self.related: dict[DatasetId, set[DatasetId]] = {}

        self._trashedIds: set[DatasetId] = set()
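
    # Illustrative shape of the bookkeeping after a single put() (the ID is
    # hypothetical; put() records each dataset as its own parent):
    #
    #     self.datasets == {ref.id: <stored object>}
    #     self.records  == {ref.id: StoredMemoryItemInfo(..., parentID=ref.id)}
    #     self.related  == {ref.id: {ref.id}}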

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    def _get_stored_item_info(self, dataset_id: DatasetId) -> StoredMemoryItemInfo:
        # Docstring inherited from GenericBaseDatastore.
        return self.records[dataset_id]

    def _remove_stored_item_info(self, dataset_id: DatasetId) -> None:
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this,
        # so we ignore it here too.
        if dataset_id not in self.records:
            return
        record = self.records[dataset_id]
        del self.records[dataset_id]
        self.related[record.parentID].remove(dataset_id)

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        """Remove information about the file associated with this dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset that has been removed.

        Notes
        -----
        This method is not actually used by this implementation, but some
        tests check that it works, so it is kept for now.
        """
        self._remove_stored_item_info(ref.id)

    def _get_dataset_info(self, dataset_id: DatasetId) -> tuple[DatasetId, StoredMemoryItemInfo]:
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        dataset_id : `DatasetId`
            ID of the target dataset.

        Returns
        -------
        realID : `DatasetId`
            The dataset ID that should be used. This could either be the
            supplied ID or the ID of the parent composite.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self._get_stored_item_info(dataset_id)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {dataset_id}") from None
        realID = dataset_id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {dataset_id}")

        return realID, storedItemInfo
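
    # Hedged sketch of the resolution above (names assumed): a component
    # record whose ``parentID`` points at its composite resolves to the
    # composite's ID, so the stored object is looked up via the parent.
    #
    #     real_id, info = self._get_dataset_info(component_ref.id)
    #     composite = self.datasets[real_id]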

    def knows(self, ref: DatasetRef) -> bool:
        """Check if the dataset is known to the datastore.

        This datastore does not distinguish dataset existence from knowledge
        of a dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the dataset is known to the datastore.
        """
        return self.exists(ref)

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref.id)
        except FileNotFoundError:
            return False
        return True

    def get(
        self,
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Raised if the requested dataset cannot be retrieved.
        TypeError
            Raised if the return value from the formatter has an unexpected
            type.
        ValueError
            Raised if the formatter failed to process the dataset.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        realID, storedItemInfo = self._get_dataset_info(ref.id)

        # We have a write storage class and a read storage class and they
        # can be different for concrete composites or if overridden.
        if storageClass is not None:
            ref = ref.overrideStorageClass(storageClass)
        refStorageClass = ref.datasetType.storageClass
        writeStorageClass = storedItemInfo.storageClass

        component = ref.datasetType.component()

        # Check that the supplied parameters are suitable for the type read.
        # If this is a derived component we validate against the composite.
        isDerivedComponent = False
        if component in writeStorageClass.derivedComponents:
            writeStorageClass.validateParameters(parameters)
            isDerivedComponent = True
        else:
            refStorageClass.validateParameters(parameters)

        inMemoryDataset = self.datasets[realID]

        # If this is a derived component we need to apply parameters
        # before we retrieve the component. We assume that the parameters
        # will affect the data globally, before the derived component
        # is selected.
        if isDerivedComponent:
            inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
            # Then disable parameters for later.
            parameters = {}

        # Check if we have a component.
        if component:
            # An in-memory datastore must have stored the dataset as a single
            # object in the write storage class. We therefore use that
            # storage class delegate to obtain the component.
            inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)

        # Since there is no formatter to process parameters, they all must be
        # passed to the assembler.
        inMemoryDataset = self._post_process_get(
            inMemoryDataset, refStorageClass, parameters, isComponent=component is not None
        )

        # Last minute type conversion.
        return refStorageClass.coerce_type(inMemoryDataset)
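
    # Hedged usage sketch for get() (``datastore`` and ``ref`` are assumed to
    # exist, and "StructuredDataDict" to name a compatible read storage
    # class):
    #
    #     obj = datastore.get(ref)                      # native Python type
    #     as_dict = datastore.get(ref, storageClass="StructuredDataDict")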

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Raised if the supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            Raised if the associated `DatasetType` is not handled by this
            datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        # May need to coerce the in-memory dataset to the correct
        # Python type, otherwise parameters may not work.
        inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)

        self._validate_put_parameters(inMemoryDataset, ref)

        self.datasets[ref.id] = inMemoryDataset
        log.debug("Store %s in %s", ref, self.name)

        # Store the time we received this content, to allow us to optionally
        # expire it. Instead of storing a filename here, we include the
        # ID of this datasetRef so we can find it from components.
        itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, parentID=ref.id)

        # We have to register this content with registry.
        # Currently this assumes we have a file so we need to use stub entries.
        self.records[ref.id] = itemInfo
        self.related.setdefault(itemInfo.parentID, set()).add(ref.id)

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)
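
    # Hedged round-trip sketch (``datastore`` and ``ref`` are assumed to
    # exist): nothing is serialized, so an equal object comes straight back.
    #
    #     payload = {"a": 1}
    #     datastore.put(payload, ref)
    #     assert datastore.knows(ref)
    #     assert datastore.get(ref) == payload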

    def put_new(self, inMemoryDataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
        # It is OK to call put() here because the registry is not populating
        # bridges, as we return an empty dict from this method.
        self.put(inMemoryDataset, ref)
        # As we are ephemeral we return an empty dict.
        return {}

    def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the
            dataset artifact (can be empty if there are no components).

        Notes
        -----
        The URIs returned for in-memory datastores are not usable but
        provide an indication of the associated dataset.
        """
        # Include the data ID as a URI query.
        query = urlencode(ref.dataId.required)

        # If this has never been written then we have to guess.
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError(f"Dataset {ref} not in this datastore")
            name = f"{ref.datasetType.name}"
            fragment = "#predicted"
        else:
            realID, _ = self._get_dataset_info(ref.id)
            name = f"{id(self.datasets[realID])}"
            fragment = ""

        return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
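
    # Illustrative URI shapes produced above (all values hypothetical):
    #
    #     mem://140031234567?instrument=HSC&visit=903334        (stored)
    #     mem://raw?instrument=HSC&visit=903334#predicted       (predicted)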

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
        """URI to the Dataset.

        Always uses the "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            Raised if a URI has been requested for a dataset that does not
            exist and guessing is not allowed.
        AssertionError
            Raised if an internal error occurs.
        """
        primary, _ = self.getURIs(ref, predict)
        if primary is None:
            # This should be impossible since this datastore does
            # not disassemble. This check also helps mypy.
            raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
        return primary

    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
        destination: ResourcePath,
        transfer: str = "auto",
        preserve_path: bool = True,
        overwrite: bool | None = False,
    ) -> list[ResourcePath]:
        """Retrieve the file artifacts associated with the supplied refs.

        Notes
        -----
        Not implemented by this datastore.
        """
        # Could conceivably launch a FileDatastore to use formatters to write
        # the data but this is fraught with problems.
        raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")

    def forget(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited.
        refs = list(refs)
        for ref in refs:
            self._remove_stored_item_info(ref.id)

    @transactional
    def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None:
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef` or iterable thereof
            Reference to the required Dataset(s).
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Raised on an attempt to remove a dataset that does not exist.
            Only relevant when a single dataset ref is given.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        if isinstance(ref, DatasetRef):
            # Check that this dataset is known to the datastore.
            try:
                self._get_dataset_info(ref.id)
            except Exception as e:
                if ignore_errors:
                    log.warning(
                        "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
                    )
                else:
                    raise
            log.debug("Trash %s in datastore %s", ref, self.name)
            ref_list = [ref]
        else:
            ref_list = list(ref)
            log.debug("Bulk trashing of datasets in datastore %s", self.name)

        def _rollbackMoveToTrash(refs: Iterable[DatasetIdRef]) -> None:
            for ref in refs:
                self._trashedIds.remove(ref.id)

        assert self._transaction is not None, "Must be in transaction"
        with self._transaction.undoWith(f"Trash {len(ref_list)} datasets", _rollbackMoveToTrash, ref_list):
            self._trashedIds.update(ref.id for ref in ref_list)

    def emptyTrash(self, ignore_errors: bool = False) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        This method modifies the internal tracking of datasets; transaction
        handling is not supported if a problem occurs before the datasets
        themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated to solely this
        process and the registry only changes rows associated with this
        process.
        """
        log.debug("Emptying trash in datastore %s", self.name)

        for dataset_id in self._trashedIds:
            try:
                realID, _ = self._get_dataset_info(dataset_id)
            except FileNotFoundError:
                # Dataset already removed so ignore it.
                continue
            except Exception as e:
                if ignore_errors:
                    log.warning(
                        "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
                        self.name,
                        dataset_id,
                        e,
                    )
                    continue
                else:
                    raise

            # Determine whether all references to this dataset have been
            # removed and we can delete the dataset itself.
            allRefs = self.related[realID]
            remainingRefs = allRefs - {dataset_id}
            if not remainingRefs:
                log.debug("Removing artifact %s from datastore %s", realID, self.name)
                del self.datasets[realID]

            # Remove this entry.
            self._remove_stored_item_info(dataset_id)

        # Empty the trash table.
        self._trashedIds = set()
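
    # Hedged two-phase removal sketch (``datastore`` and ``ref`` assumed to
    # exist): trash() only marks the dataset; emptyTrash() drops the object.
    #
    #     datastore.trash(ref)
    #     assert datastore.exists(ref)       # still present, merely trashed
    #     datastore.emptyTrash()
    #     assert not datastore.exists(ref)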

    def validateConfiguration(
        self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
    ) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
        # Docstring is inherited from base class.
        return transfer

    def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
        # Docstring is inherited from base class.
        return

    def getLookupKeys(self) -> set[LookupKey]:
        # Docstring is inherited from base class.
        return self.constraints.getLookupKeys()

    def needs_expanded_data_ids(
        self,
        transfer: str | None,
        entity: DatasetRef | DatasetType | StorageClass | None = None,
    ) -> bool:
        # Docstring inherited.
        return False

    def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
        # Docstring inherited from the base class.
        return

    def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
        # Docstring inherited from the base class.

        # In-memory Datastore records cannot be exported or imported.
        return {}

    def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
        # Docstring inherited from the base class.
        return {}