Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 93%
185 statements
coverage.py v7.3.2, created at 2023-10-12 09:43 +0000
1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This software is dual licensed under the GNU General Public License and also
10 # under a 3-clause BSD license. Recipients may choose which of these licenses
11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12 # respectively. If you choose the GPL option then the following text applies
13 # (but note that there is still no warranty even if you opt for BSD instead):
14 #
15 # This program is free software: you can redistribute it and/or modify
16 # it under the terms of the GNU General Public License as published by
17 # the Free Software Foundation, either version 3 of the License, or
18 # (at your option) any later version.
19 #
20 # This program is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 # GNU General Public License for more details.
24 #
25 # You should have received a copy of the GNU General Public License
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 """In-memory datastore."""
30 from __future__ import annotations
32 __all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
34 import logging
35 import time
36 from collections.abc import Iterable, Mapping
37 from dataclasses import dataclass
38 from typing import TYPE_CHECKING, Any
39 from urllib.parse import urlencode
41 from lsst.daf.butler import DatasetId, DatasetRef, StorageClass
42 from lsst.daf.butler.datastore import DatasetRefURIs
43 from lsst.daf.butler.datastore.record_data import DatastoreRecordData
44 from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo
45 from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
46 from lsst.daf.butler.utils import transactional
47 from lsst.resources import ResourcePath
49 from ..datastore.generic_base import GenericBaseDatastore
50 from ..registry.interfaces import DatabaseInsertMode
52 if TYPE_CHECKING:
53 from lsst.daf.butler import Config, DatasetType, LookupKey
54 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
56 log = logging.getLogger(__name__)
59 @dataclass(frozen=True)
60 class StoredMemoryItemInfo(StoredDatastoreItemInfo):
61 """Internal InMemoryDatastore Metadata associated with a stored
62 DatasetRef.
63 """
65 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}
67 timestamp: float
68 """Unix timestamp indicating the time the dataset was stored."""
70 storageClass: StorageClass
71 """StorageClass associated with the dataset."""
73 parentID: DatasetId
74 """ID of the parent `DatasetRef` if this entry is a concrete
75 composite. Not used if the dataset being stored is not a
76 virtual component of a composite.
77 """
79 dataset_id: DatasetId
80 """DatasetId associated with this record."""
83 class InMemoryDatastore(GenericBaseDatastore):
84 """Basic Datastore for writing to an in memory cache.
86 This datastore is ephemeral in that the contents of the datastore
87 disappear when the Python process completes. This also means that
88 other processes cannot access this datastore.
90 Parameters
91 ----------
92 config : `DatastoreConfig` or `str`
93 Configuration.
94 bridgeManager : `DatastoreRegistryBridgeManager`
95 Object that manages the interface between `Registry` and datastores.
96 butlerRoot : `str`, optional
97 Unused parameter.
99 Notes
100 -----
101 InMemoryDatastore does not support any file-based ingest.
102 """
104 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
105 """Path to configuration defaults. Accessed within the ``configs`` resource
106 or relative to a search path. Can be None if no defaults specified.
107 """
109 isEphemeral = True
110 """A new datastore is created every time and datasets disappear when
111 the process shuts down."""
113 datasets: dict[DatasetId, Any]
114 """Internal storage of datasets indexed by dataset ID."""
116 records: dict[DatasetId, StoredMemoryItemInfo]
117 """Internal records about stored datasets."""
119 def __init__(
120 self,
121 config: Config | str,
122 bridgeManager: DatastoreRegistryBridgeManager,
123 butlerRoot: str | None = None,
124 ):
125 super().__init__(config, bridgeManager)
127 # Name ourselves with the timestamp at which the datastore
128 # was created.
129 self.name = f"{type(self).__name__}@{time.time()}"
130 log.debug("Creating datastore %s", self.name)
132 # Storage of datasets, keyed by dataset_id
133 self.datasets: dict[DatasetId, Any] = {}
135 # Records is distinct in order to track concrete composite components
136 # where we register multiple components for a single dataset.
137 self.records: dict[DatasetId, StoredMemoryItemInfo] = {}
139 # Related records that share the same parent
140 self.related: dict[DatasetId, set[DatasetId]] = {}
142 self._bridge = bridgeManager.register(self.name, ephemeral=True)
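# Editor's sketch (not part of the original source): minimal construction,
# assuming ``cfg`` (a DatastoreConfig or config path) and ``bridge_manager``
# (a DatastoreRegistryBridgeManager) already exist. In practice Butler builds
# this datastore from the packaged defaults named above.
store = InMemoryDatastore(cfg, bridge_manager)
assert store.isEphemeral              # contents vanish when the process exits
print(store.name)                     # e.g. "InMemoryDatastore@1697103791.2"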
144 @classmethod
145 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
146 """Set any filesystem-dependent config options for this Datastore to
147 be appropriate for a new empty repository with the given root.
149 Does nothing in this implementation.
151 Parameters
152 ----------
153 root : `str`
154 Filesystem path to the root of the data repository.
155 config : `Config`
156 A `Config` to update. Only the subset understood by
157 this component will be updated. Will not expand
158 defaults.
159 full : `Config`
160 A complete config with all defaults expanded that can be
161 converted to a `DatastoreConfig`. Read-only and will not be
162 modified by this method.
163 Repository-specific options that should not be obtained
164 from defaults when Butler instances are constructed
165 should be copied from ``full`` to ``config``.
166 overwrite : `bool`, optional
167 If `False`, do not modify a value in ``config`` if the value
168 already exists. Default is always to overwrite with the provided
169 ``root``.
171 Notes
172 -----
173 If a keyword is explicitly defined in the supplied ``config`` it
174 will not be overridden by this method if ``overwrite`` is `False`.
175 This allows explicit values set in external configs to be retained.
176 """
177 return
179 @property
180 def bridge(self) -> DatastoreRegistryBridge:
181 # Docstring inherited from GenericBaseDatastore.
182 return self._bridge
184 def addStoredItemInfo(
185 self,
186 refs: Iterable[DatasetRef],
187 infos: Iterable[StoredMemoryItemInfo],
188 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT,
189 ) -> None:
190 # Docstring inherited from GenericBaseDatastore.
191 for ref, info in zip(refs, infos, strict=True):
192 self.records[ref.id] = info
193 self.related.setdefault(info.parentID, set()).add(ref.id)
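# Editor's sketch (not part of the original source): the two dicts above keep
# one record per ref while grouping refs by parent, so a shared in-memory
# object is only deleted once every related entry is gone. ``component_ref``
# is an assumed ref already registered with this datastore.
info = store.getStoredItemInfo(component_ref)
assert store.records[component_ref.id] is info
assert component_ref.id in store.related[info.parentID]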
195 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
196 # Docstring inherited from GenericBaseDatastore.
197 return self.records[ref.id]
199 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredMemoryItemInfo]:
200 # Docstring inherited from GenericBaseDatastore.
201 return [self.getStoredItemInfo(ref)]
203 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
204 # Docstring inherited from GenericBaseDatastore.
205 # If a component has been removed previously then we can sometimes
206 # be asked to remove it again. Other datastores ignore this
207 # so we also ignore it here.
208 if ref.id not in self.records:
209 return
210 record = self.records[ref.id]
211 del self.records[ref.id]
212 self.related[record.parentID].remove(ref.id)
214 def _get_dataset_info(self, ref: DatasetIdRef) -> tuple[DatasetId, StoredMemoryItemInfo]:
215 """Check that the dataset is present and return the real ID and
216 associated information.
218 Parameters
219 ----------
220 ref : `DatasetRef`
221 Target `DatasetRef`.
223 Returns
224 -------
225 realID : `DatasetId`
226 The dataset ID associated with this ref that should be used. This
227 could either be the ID of the supplied `DatasetRef` or the parent.
228 storageInfo : `StoredMemoryItemInfo`
229 Associated storage information.
231 Raises
232 ------
233 FileNotFoundError
234 Raised if the dataset is not present in this datastore.
235 """
236 try:
237 storedItemInfo = self.getStoredItemInfo(ref)
238 except KeyError:
239 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
240 realID = ref.id
241 if storedItemInfo.parentID is not None:  241 ↛ 244: line 241 didn't jump to line 244, because the condition on line 241 was never false
242 realID = storedItemInfo.parentID
244 if realID not in self.datasets:  244 ↛ 245: line 244 didn't jump to line 245, because the condition on line 244 was never true
245 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
247 return realID, storedItemInfo
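# Editor's sketch (not part of the original source): for a component ref the
# "real" ID resolves to the parent's ID, since the payload is stored once
# under the parent. ``component_ref`` is an assumed, previously stored ref.
real_id, item_info = store._get_dataset_info(component_ref)
assert real_id == item_info.parentID      # the parent's ID, not the component's
assert real_id in store.datasets          # one stored object for the composite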
249 def knows(self, ref: DatasetRef) -> bool:
250 """Check if the dataset is known to the datastore.
252 This datastore does not distinguish dataset existence from knowledge
253 of a dataset.
255 Parameters
256 ----------
257 ref : `DatasetRef`
258 Reference to the required dataset.
260 Returns
261 -------
262 exists : `bool`
263 `True` if the dataset is known to the datastore.
264 """
265 return self.exists(ref)
267 def exists(self, ref: DatasetRef) -> bool:
268 """Check if the dataset exists in the datastore.
270 Parameters
271 ----------
272 ref : `DatasetRef`
273 Reference to the required dataset.
275 Returns
276 -------
277 exists : `bool`
278 `True` if the entity exists in the `Datastore`.
279 """
280 try:
281 self._get_dataset_info(ref)
282 except FileNotFoundError:
283 return False
284 return True
286 def get(
287 self,
288 ref: DatasetRef,
289 parameters: Mapping[str, Any] | None = None,
290 storageClass: StorageClass | str | None = None,
291 ) -> Any:
292 """Load an InMemoryDataset from the store.
294 Parameters
295 ----------
296 ref : `DatasetRef`
297 Reference to the required Dataset.
298 parameters : `dict`
299 `StorageClass`-specific parameters that specify, for example,
300 a slice of the dataset to be loaded.
301 storageClass : `StorageClass` or `str`, optional
302 The storage class to be used to override the Python type
303 returned by this method. By default the returned type matches
304 the dataset type definition for this dataset. Specifying a
305 read `StorageClass` can force a different type to be returned.
306 This type must be compatible with the original type.
308 Returns
309 -------
310 inMemoryDataset : `object`
311 Requested dataset or slice thereof as an InMemoryDataset.
313 Raises
314 ------
315 FileNotFoundError
316 Requested dataset can not be retrieved.
317 TypeError
318 Return value from formatter has unexpected type.
319 ValueError
320 Formatter failed to process the dataset.
321 """
322 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
324 realID, storedItemInfo = self._get_dataset_info(ref)
326 # We have a write storage class and a read storage class and they
327 # can be different for concrete composites or if overridden.
328 if storageClass is not None:
329 ref = ref.overrideStorageClass(storageClass)
330 refStorageClass = ref.datasetType.storageClass
331 writeStorageClass = storedItemInfo.storageClass
333 component = ref.datasetType.component()
335 # Check that the supplied parameters are suitable for the type read
336 # If this is a derived component we validate against the composite
337 isDerivedComponent = False
338 if component in writeStorageClass.derivedComponents:
339 writeStorageClass.validateParameters(parameters)
340 isDerivedComponent = True
341 else:
342 refStorageClass.validateParameters(parameters)
344 inMemoryDataset = self.datasets[realID]
346 # If this is a derived component we need to apply parameters
347 # before we retrieve the component. We assume that the parameters
348 # will affect the data globally, before the derived component
349 # is selected.
350 if isDerivedComponent:
351 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
352 # Then disable parameters for later
353 parameters = {}
355 # Check if we have a component.
356 if component:
357 # In-memory datastore must have stored the dataset as a single
358 # object in the write storage class. We therefore use that
359 # storage class delegate to obtain the component.
360 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
362 # Since there is no formatter to process parameters, they all must be
363 # passed to the assembler.
364 inMemoryDataset = self._post_process_get(
365 inMemoryDataset, refStorageClass, parameters, isComponent=component is not None
366 )
368 # Last minute type conversion.
369 return refStorageClass.coerce_type(inMemoryDataset)
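# Editor's sketch (not part of the original source): typical reads, assuming
# ``ref`` was previously put. The parameter name and the alternative storage
# class below are illustrative only; they must be defined by the dataset's
# storage class and be compatible with the stored Python type.
whole = store.get(ref)
subset = store.get(ref, parameters={"bbox": some_bbox})          # name assumed
converted = store.get(ref, storageClass="SomeCompatibleClass")   # name assumed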
371 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
372 """Write a InMemoryDataset with a given `DatasetRef` to the store.
374 Parameters
375 ----------
376 inMemoryDataset : `object`
377 The dataset to store.
378 ref : `DatasetRef`
379 Reference to the associated Dataset.
381 Raises
382 ------
383 TypeError
384 Supplied object and storage class are inconsistent.
385 DatasetTypeNotSupportedError
386 The associated `DatasetType` is not handled by this datastore.
388 Notes
389 -----
390 If the datastore is configured to reject certain dataset types it
391 is possible that the put will fail and raise a
392 `DatasetTypeNotSupportedError`. The main use case for this is to
393 allow `ChainedDatastore` to put to multiple datastores without
394 requiring that every datastore accepts the dataset.
395 """
396 # May need to coerce the in-memory dataset to the correct
397 # Python type, otherwise parameters may not work.
398 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
400 self._validate_put_parameters(inMemoryDataset, ref)
402 self.datasets[ref.id] = inMemoryDataset
403 log.debug("Store %s in %s", ref, self.name)
405 # Store the time we received this content, to allow us to optionally
406 # expire it. Instead of storing a filename here, we include the
407 # ID of this datasetRef so we can find it from components.
408 itemInfo = StoredMemoryItemInfo(
409 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.id
410 )
412 # We have to register this content with registry.
413 # Currently this assumes we have a file so we need to use stub entries
414 # TODO: Add to ephemeral part of registry
415 self._register_datasets([(ref, itemInfo)])
417 if self._transaction is not None:
418 self._transaction.registerUndo("put", self.remove, ref)
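# Editor's sketch (not part of the original source): a put/get round trip.
# Nothing is serialized; the object is coerced to the write storage class's
# Python type and then held by reference, so the value read back is normally
# the very object that was stored. ``my_object`` and a resolved ``ref`` of a
# matching dataset type are assumed.
store.put(my_object, ref)
retrieved = store.get(ref)      # no formatter or file I/O involved
assert store.exists(ref)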
420 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
421 """Return URIs associated with dataset.
423 Parameters
424 ----------
425 ref : `DatasetRef`
426 Reference to the required dataset.
427 predict : `bool`, optional
428 If the datastore does not know about the dataset, should it
429 return a predicted URI or not?
431 Returns
432 -------
433 uris : `DatasetRefURIs`
434 The URI to the primary artifact associated with this dataset (if
435 the dataset was disassembled within the datastore this may be
436 `None`), and the URIs to any components associated with the dataset
437 artifact (can be empty if there are no components).
439 Notes
440 -----
441 The URIs returned for in-memory datastores are not usable but
442 provide an indication of the associated dataset.
443 """
444 # Include the dataID as a URI query
445 query = urlencode(ref.dataId)
447 # if this has never been written then we have to guess
448 if not self.exists(ref):
449 if not predict:
450 raise FileNotFoundError(f"Dataset {ref} not in this datastore")
451 name = f"{ref.datasetType.name}"
452 fragment = "#predicted"
453 else:
454 realID, _ = self._get_dataset_info(ref)
455 name = f"{id(self.datasets[realID])}?{query}"
456 fragment = ""
458 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
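# Editor's sketch (not part of the original source): the URIs are descriptive
# only (a "mem://" scheme built from the object's id() and the data ID query)
# and cannot be opened. ``ref`` is an assumed stored dataset; ``unstored_ref``
# is an assumed dataset that was never put here.
primary, components = store.getURIs(ref)     # DatasetRefURIs unpacks as a pair
assert str(primary).startswith("mem://")
assert components == {}                      # this datastore never disassembles
predicted, _ = store.getURIs(unstored_ref, predict=True)  # ends in "#predicted"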
460 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
461 """URI to the Dataset.
463 Always uses "mem://" URI prefix.
465 Parameters
466 ----------
467 ref : `DatasetRef`
468 Reference to the required Dataset.
469 predict : `bool`
470 If `True`, allow URIs to be returned for datasets that have not
471 been written.
473 Returns
474 -------
475 uri : `lsst.resources.ResourcePath`
476 URI pointing to the dataset within the datastore. If the
477 dataset does not exist in the datastore, and if ``predict`` is
478 `True`, the URI will be a prediction and will include a URI
479 fragment "#predicted".
480 If the datastore does not have entities that relate well
481 to the concept of a URI the returned URI string will be
482 descriptive. The returned URI is not guaranteed to be obtainable.
484 Raises
485 ------
486 FileNotFoundError
487 A URI has been requested for a dataset that does not exist and
488 guessing is not allowed.
489 AssertionError
490 Raised if an internal error occurs.
491 """
492 primary, _ = self.getURIs(ref, predict)
493 if primary is None:
494 # This should be impossible since this datastore does
495 # not disassemble. This check also helps mypy.
496 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
497 return primary
499 def retrieveArtifacts(
500 self,
501 refs: Iterable[DatasetRef],
502 destination: ResourcePath,
503 transfer: str = "auto",
504 preserve_path: bool = True,
505 overwrite: bool | None = False,
506 ) -> list[ResourcePath]:
507 """Retrieve the file artifacts associated with the supplied refs.
509 Notes
510 -----
511 Not implemented by this datastore.
512 """
513 # Could conceivably launch a FileDatastore to use formatters to write
514 # the data but this is fraught with problems.
515 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
517 def forget(self, refs: Iterable[DatasetRef]) -> None:
518 # Docstring inherited.
519 refs = list(refs)
520 self._bridge.forget(refs)
521 for ref in refs:
522 self.removeStoredItemInfo(ref)
524 @transactional
525 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = False) -> None:
526 """Indicate to the Datastore that a dataset can be removed.
528 Parameters
529 ----------
530 ref : `DatasetRef` or iterable thereof
531 Reference to the required Dataset(s).
532 ignore_errors : `bool`, optional
533 Indicate that errors should be ignored.
535 Raises
536 ------
537 FileNotFoundError
538 Attempt to remove a dataset that does not exist. Only relevant
539 if a single dataset ref is given.
541 Notes
542 -----
543 Concurrency should not normally be an issue for the in-memory datastore
544 since all internal changes are isolated solely to this process and
545 the registry only changes rows associated with this process.
546 """
547 if not isinstance(ref, DatasetRef):
548 log.debug("Bulk trashing of datasets in datastore %s", self.name)
549 self.bridge.moveToTrash(ref, transaction=self._transaction)
550 return
552 log.debug("Trash %s in datastore %s", ref, self.name)
554 # Check that this dataset is known to datastore
555 try:
556 self._get_dataset_info(ref)
558 # Move datasets to trash table
559 self.bridge.moveToTrash([ref], transaction=self._transaction)
560 except Exception as e:
561 if ignore_errors:  561 ↛ 562: line 561 didn't jump to line 562, because the condition on line 561 was never true
562 log.warning(
563 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
564 )
565 else:
566 raise
568 def emptyTrash(self, ignore_errors: bool = False) -> None:
569 """Remove all datasets from the trash.
571 Parameters
572 ----------
573 ignore_errors : `bool`, optional
574 Ignore errors.
576 Notes
577 -----
578 The internal tracking of datasets is affected by this method and
579 transaction handling is not supported if there is a problem before
580 the datasets themselves are deleted.
582 Concurrency should not normally be an issue for the in-memory datastore
583 since all internal changes are isolated solely to this process and
584 the registry only changes rows associated with this process.
585 """
586 log.debug("Emptying trash in datastore %s", self.name)
587 with self._bridge.emptyTrash() as trash_data:
588 trashed, _ = trash_data
589 for ref, _ in trashed:
590 try:
591 realID, _ = self._get_dataset_info(ref)
592 except FileNotFoundError:  592 ↛ 595: line 592 didn't jump to line 595
593 # Dataset already removed so ignore it
594 continue
595 except Exception as e:
596 if ignore_errors:
597 log.warning(
598 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
599 self.name,
600 ref.id,
601 e,
602 )
603 continue
604 else:
605 raise
607 # Determine whether all references to this dataset have been
608 # removed and we can delete the dataset itself
609 allRefs = self.related[realID]
610 remainingRefs = allRefs - {ref.id}
611 if not remainingRefs:  611 ↛ 616: line 611 didn't jump to line 616, because the condition on line 611 was never false
612 log.debug("Removing artifact %s from datastore %s", realID, self.name)
613 del self.datasets[realID]
615 # Remove this entry
616 self.removeStoredItemInfo(ref)
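# Editor's sketch (not part of the original source): removal is a two-step
# process. trash() only records the dataset in the bridge's trash table (the
# payload stays resident); emptyTrash() then drops the object once no related
# entries remain. ``ref`` is an assumed, previously stored dataset.
store.trash(ref)
assert store.exists(ref)        # still resident until the trash is emptied
store.emptyTrash()
assert not store.exists(ref)    # record and payload are now gone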
618 def validateConfiguration(
619 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
620 ) -> None:
621 """Validate some of the configuration for this datastore.
623 Parameters
624 ----------
625 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
626 Entities to test against this configuration. Can be differing
627 types.
628 logFailures : `bool`, optional
629 If `True`, output a log message for every validation error
630 detected.
632 Raises
633 ------
634 DatastoreValidationError
635 Raised if there is a validation problem with a configuration.
636 All the problems are reported in a single exception.
638 Notes
639 -----
640 This method is a no-op.
641 """
642 return
644 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
645 # Docstring is inherited from base class
646 return transfer
648 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
649 # Docstring is inherited from base class
650 return
652 def getLookupKeys(self) -> set[LookupKey]:
653 # Docstring is inherited from base class
654 return self.constraints.getLookupKeys()
656 def needs_expanded_data_ids(
657 self,
658 transfer: str | None,
659 entity: DatasetRef | DatasetType | StorageClass | None = None,
660 ) -> bool:
661 # Docstring inherited.
662 return False
664 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
665 # Docstring inherited from the base class.
666 return
668 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
669 # Docstring inherited from the base class.
671 # In-memory Datastore records cannot be exported or imported
672 return {}