Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 87%
190 statements
coverage.py v7.5.0, created at 2024-04-24 23:49 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""In-memory datastore."""
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import logging
29import time
30from dataclasses import dataclass
31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
32from urllib.parse import urlencode
34from lsst.daf.butler import (
35 DatasetId,
36 DatasetRef,
37 DatasetRefURIs,
38 DatastoreRecordData,
39 StorageClass,
40 StoredDatastoreItemInfo,
41)
42from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
43from lsst.resources import ResourcePath
45from .genericDatastore import GenericBaseDatastore
47if TYPE_CHECKING:
48 from lsst.daf.butler import Config, DatasetType, LookupKey
49 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
51log = logging.getLogger(__name__)
54@dataclass(frozen=True)
55class StoredMemoryItemInfo(StoredDatastoreItemInfo):
56 """Internal InMemoryDatastore metadata associated with a stored
57 DatasetRef.
58 """
60 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}
62 timestamp: float
63 """Unix timestamp indicating the time the dataset was stored."""
65 storageClass: StorageClass
66 """StorageClass associated with the dataset."""
68 parentID: DatasetId
69 """ID of the parent `DatasetRef` if this entry is a concrete
70 composite. Not used if the dataset being stored is not a
71 virtual component of a composite.
72 """
74 dataset_id: DatasetId
75 """DatasetId associated with this record."""
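# Example (illustrative sketch, not part of the module): constructing a record
# by hand.  The StorageClass name and the UUID-style IDs are placeholders;
# put() below builds the same record from a resolved DatasetRef.
#
#     import time
#     import uuid
#     from lsst.daf.butler import StorageClass
#
#     dataset_id = uuid.uuid4()
#     info = StoredMemoryItemInfo(
#         timestamp=time.time(),
#         storageClass=StorageClass("StructuredDataDict"),
#         parentID=dataset_id,
#         dataset_id=dataset_id,
#     )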
78class InMemoryDatastore(GenericBaseDatastore):
79 """Basic Datastore for writing to an in-memory cache.
81 This datastore is ephemeral in that the contents of the datastore
82 disappear when the Python process completes. This also means that
83 other processes cannot access this datastore.
85 Parameters
86 ----------
87 config : `DatastoreConfig` or `str`
88 Configuration.
89 bridgeManager : `DatastoreRegistryBridgeManager`
90 Object that manages the interface between `Registry` and datastores.
91 butlerRoot : `str`, optional
92 Unused parameter.
94 Notes
95 -----
96 InMemoryDatastore does not support any file-based ingest.
97 """
99 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
100 """Path to configuration defaults. Accessed within the ``configs`` resource
101 or relative to a search path. Can be None if no defaults specified.
102 """
104 isEphemeral = True
105 """A new datastore is created every time and datasets disappear when
106 the process shuts down."""
108 datasets: Dict[DatasetId, Any]
109 """Internal storage of datasets indexed by dataset ID."""
111 records: Dict[DatasetId, StoredMemoryItemInfo]
112 """Internal records about stored datasets."""
114 def __init__(
115 self,
116 config: Union[Config, str],
117 bridgeManager: DatastoreRegistryBridgeManager,
118 butlerRoot: Optional[str] = None,
119 ):
120 super().__init__(config, bridgeManager)
122 # Name ourselves with the timestamp the datastore
123 # was created.
124 self.name = "{}@{}".format(type(self).__name__, time.time())
125 log.debug("Creating datastore %s", self.name)
127 # Storage of datasets, keyed by dataset_id
128 self.datasets: Dict[DatasetId, Any] = {}
130 # Records is distinct in order to track concrete composite components
131 # where we register multiple components for a single dataset.
132 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}
134 # Related records that share the same parent
135 self.related: Dict[DatasetId, Set[DatasetId]] = {}
137 self._bridge = bridgeManager.register(self.name, ephemeral=True)
139 @classmethod
140 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
141 """Set any filesystem-dependent config options for this Datastore to
142 be appropriate for a new empty repository with the given root.
144 Does nothing in this implementation.
146 Parameters
147 ----------
148 root : `str`
149 Filesystem path to the root of the data repository.
150 config : `Config`
151 A `Config` to update. Only the subset understood by
152 this component will be updated. Will not expand
153 defaults.
154 full : `Config`
155 A complete config with all defaults expanded that can be
156 converted to a `DatastoreConfig`. Read-only and will not be
157 modified by this method.
158 Repository-specific options that should not be obtained
159 from defaults when Butler instances are constructed
160 should be copied from ``full`` to ``config``.
161 overwrite : `bool`, optional
162 If `False`, do not modify a value in ``config`` if the value
163 already exists. Default is always to overwrite with the provided
164 ``root``.
166 Notes
167 -----
168 If a keyword is explicitly defined in the supplied ``config`` it
169 will not be overridden by this method if ``overwrite`` is `False`.
170 This allows explicit values set in external configs to be retained.
171 """
172 return
174 @property
175 def bridge(self) -> DatastoreRegistryBridge:
176 # Docstring inherited from GenericBaseDatastore.
177 return self._bridge
179 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
180 # Docstring inherited from GenericBaseDatastore.
181 for ref, info in zip(refs, infos):
182 if ref.id is None:
183 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
184 self.records[ref.id] = info
185 self.related.setdefault(info.parentID, set()).add(ref.id)
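# Example (illustrative sketch, not part of the module): the ``related``
# bookkeeping above is ordinary dict/set work.  With made-up string IDs
# standing in for DatasetIds, grouping two refs under one parent looks like:
#
#     related = {}
#     for ref_id in ("composite-id", "component-id"):
#         related.setdefault("composite-id", set()).add(ref_id)
#     assert related == {"composite-id": {"composite-id", "component-id"}}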
187 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
188 # Docstring inherited from GenericBaseDatastore.
189 if ref.id is None:
190 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
191 return self.records[ref.id]
193 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
194 # Docstring inherited from GenericBaseDatastore.
195 return [self.getStoredItemInfo(ref)]
197 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
198 # Docstring inherited from GenericBaseDatastore.
199 # If a component has been removed previously then we can sometimes
200 # be asked to remove it again. Other datastores ignore this
201 # so also ignore here
202 if ref.id is None:
203 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
204 if ref.id not in self.records:
205 return
206 record = self.records[ref.id]
207 del self.records[ref.id]
208 self.related[record.parentID].remove(ref.id)
210 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
211 """Check that the dataset is present and return the real ID and
212 associated information.
214 Parameters
215 ----------
216 ref : `DatasetRef`
217 Target `DatasetRef`.
219 Returns
220 -------
221 realID : `DatasetId`
222 The dataset ID associated with this ref that should be used. This
223 could either be the ID of the supplied `DatasetRef` or the parent.
224 storageInfo : `StoredMemoryItemInfo`
225 Associated storage information.
227 Raises
228 ------
229 FileNotFoundError
230 Raised if the dataset is not present in this datastore.
231 """
232 try:
233 storedItemInfo = self.getStoredItemInfo(ref)
234 except KeyError:
235 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
236 realID = ref.id
237 if storedItemInfo.parentID is not None:
238 realID = storedItemInfo.parentID
240 if realID not in self.datasets:
241 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
243 return realID, storedItemInfo
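# Example (illustrative sketch, not part of the module): the indirection above
# means a component ref is looked up through its parent's ID.  With made-up
# IDs, where ``datasets`` is keyed by the composite's ID only:
#
#     datasets = {"composite-id": {"mean": 5.0, "variance": 1.2}}
#     parent_id = "composite-id"          # storedItemInfo.parentID
#     real_id = parent_id if parent_id is not None else "component-id"
#     assert real_id in datasets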
245 def knows(self, ref: DatasetRef) -> bool:
246 """Check if the dataset is known to the datastore.
248 This datastore does not distinguish dataset existence from knowledge
249 of a dataset.
251 Parameters
252 ----------
253 ref : `DatasetRef`
254 Reference to the required dataset.
256 Returns
257 -------
258 exists : `bool`
259 `True` if the dataset is known to the datastore.
260 """
261 return self.exists(ref)
263 def exists(self, ref: DatasetRef) -> bool:
264 """Check if the dataset exists in the datastore.
266 Parameters
267 ----------
268 ref : `DatasetRef`
269 Reference to the required dataset.
271 Returns
272 -------
273 exists : `bool`
274 `True` if the entity exists in the `Datastore`.
275 """
276 try:
277 self._get_dataset_info(ref)
278 except FileNotFoundError:
279 return False
280 return True
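# Example (illustrative sketch, not part of the module): assuming ``datastore``
# is an InMemoryDatastore attached to a registry bridge and ``ref`` is a
# resolved DatasetRef that has not been written yet, the two checks agree:
#
#     assert datastore.knows(ref) is False
#     assert datastore.exists(ref) is False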
282 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
283 """Load an InMemoryDataset from the store.
285 Parameters
286 ----------
287 ref : `DatasetRef`
288 Reference to the required Dataset.
289 parameters : `dict`
290 `StorageClass`-specific parameters that specify, for example,
291 a slice of the dataset to be loaded.
293 Returns
294 -------
295 inMemoryDataset : `object`
296 Requested dataset or slice thereof as an InMemoryDataset.
298 Raises
299 ------
300 FileNotFoundError
301 Requested dataset cannot be retrieved.
302 TypeError
303 Return value from formatter has unexpected type.
304 ValueError
305 Formatter failed to process the dataset.
306 """
308 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
310 realID, storedItemInfo = self._get_dataset_info(ref)
312 # We have a write storage class and a read storage class and they
313 # can be different for concrete composites.
314 readStorageClass = ref.datasetType.storageClass
315 writeStorageClass = storedItemInfo.storageClass
317 component = ref.datasetType.component()
319 # Check that the supplied parameters are suitable for the type read
320 # If this is a derived component we validate against the composite
321 isDerivedComponent = False
322 if component in writeStorageClass.derivedComponents:
323 writeStorageClass.validateParameters(parameters)
324 isDerivedComponent = True
325 else:
326 readStorageClass.validateParameters(parameters)
328 inMemoryDataset = self.datasets[realID]
330 # If this is a derived component we need to apply parameters
331 # before we retrieve the component. We assume that the parameters
332 # will affect the data globally, before the derived component
333 # is selected.
334 if isDerivedComponent:
335 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
336 # Then disable parameters for later
337 parameters = {}
339 # Differing read and write storage classes imply a component request
340 if readStorageClass != writeStorageClass:
341 if component is None:
342 raise ValueError(
343 "Storage class inconsistency ({} vs {}) but no"
344 " component requested".format(readStorageClass.name, writeStorageClass.name)
345 )
347 # Concrete composite written as a single object (we hope)
348 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
350 # Since there is no formatter to process parameters, they all must be
351 # passed to the assembler.
352 return self._post_process_get(
353 inMemoryDataset, readStorageClass, parameters, isComponent=component is not None
354 )
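# Example (illustrative sketch, not part of the module): assuming ``datastore``
# holds a dataset whose StorageClass defines a ``subset`` parameter and a
# ``summary`` derived component (both names are placeholders), reads might
# look like:
#
#     full = datastore.get(ref)
#     part = datastore.get(ref, parameters={"subset": slice(0, 10)})
#     comp = datastore.get(ref.makeComponentRef("summary"))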
356 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
357 """Write an InMemoryDataset with a given `DatasetRef` to the store.
359 Parameters
360 ----------
361 inMemoryDataset : `object`
362 The dataset to store.
363 ref : `DatasetRef`
364 Reference to the associated Dataset.
366 Raises
367 ------
368 TypeError
369 Supplied object and storage class are inconsistent.
370 DatasetTypeNotSupportedError
371 The associated `DatasetType` is not handled by this datastore.
373 Notes
374 -----
375 If the datastore is configured to reject certain dataset types it
376 is possible that the put will fail and raise a
377 `DatasetTypeNotSupportedError`. The main use case for this is to
378 allow `ChainedDatastore` to put to multiple datastores without
379 requiring that every datastore accepts the dataset.
380 """
382 if ref.id is None:
383 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
385 # May need to coerce the in memory dataset to the correct
386 # python type, otherwise parameters may not work.
387 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
389 self._validate_put_parameters(inMemoryDataset, ref)
391 self.datasets[ref.id] = inMemoryDataset
392 log.debug("Store %s in %s", ref, self.name)
394 # Store time we received this content, to allow us to optionally
395 # expire it. Instead of storing a filename here, we include the
396 # ID of this datasetRef so we can find it from components.
397 itemInfo = StoredMemoryItemInfo(
398 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.getCheckedId()
399 )
401 # We have to register this content with registry.
402 # Currently this assumes we have a file so we need to use stub entries
403 # TODO: Add to ephemeral part of registry
404 self._register_datasets([(ref, itemInfo)])
406 if self._transaction is not None:
407 self._transaction.registerUndo("put", self.remove, ref)
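# Example (illustrative sketch, not part of the module): assuming ``datastore``
# is an InMemoryDatastore attached to a registry bridge and ``ref`` is a
# resolved DatasetRef whose StorageClass accepts a plain dict, a put/get
# round trip returns the stored content unchanged:
#
#     datastore.put({"mean": 5.0}, ref)
#     assert datastore.exists(ref)
#     assert datastore.get(ref) == {"mean": 5.0}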
409 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
410 """Return URIs associated with dataset.
412 Parameters
413 ----------
414 ref : `DatasetRef`
415 Reference to the required dataset.
416 predict : `bool`, optional
417 If the datastore does not know about the dataset, should it
418 return a predicted URI or not?
420 Returns
421 -------
422 uris : `DatasetRefURIs`
423 The URI to the primary artifact associated with this dataset (if
424 the dataset was disassembled within the datastore this may be
425 `None`), and the URIs to any components associated with the dataset
426 artifact (this can be empty if there are no components).
428 Notes
429 -----
430 The URIs returned for in-memory datastores are not usable but
431 provide an indication of the associated dataset.
432 """
434 # Include the dataID as a URI query
435 query = urlencode(ref.dataId)
437 # if this has never been written then we have to guess
438 if not self.exists(ref):
439 if not predict:
440 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
441 name = f"{ref.datasetType.name}"
442 fragment = "#predicted"
443 else:
444 realID, _ = self._get_dataset_info(ref)
445 name = f"{id(self.datasets[realID])}"
446 fragment = ""
448 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})
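# Example (illustrative sketch, not part of the module): the query string is
# just the urlencoded data ID, so the primary URI is only descriptive.
# With a made-up data ID:
#
#     from urllib.parse import urlencode
#     urlencode({"instrument": "HSC", "visit": 903334})
#     # -> 'instrument=HSC&visit=903334'
#
# giving a primary URI of the form mem://<id-of-object>?instrument=HSC&visit=903334,
# while the component dictionary in the returned DatasetRefURIs is always empty.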
450 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
451 """URI to the Dataset.
453 Always uses "mem://" URI prefix.
455 Parameters
456 ----------
457 ref : `DatasetRef`
458 Reference to the required Dataset.
459 predict : `bool`
460 If `True`, allow URIs to be returned of datasets that have not
461 been written.
463 Returns
464 -------
465 uri : `ResourcePath`
466 URI pointing to the dataset within the datastore. If the
467 dataset does not exist in the datastore, and if ``predict`` is
468 `True`, the URI will be a prediction and will include a URI
469 fragment "#predicted".
470 If the datastore does not have entities that relate well
471 to the concept of a URI the returned URI string will be
472 descriptive. The returned URI is not guaranteed to be obtainable.
474 Raises
475 ------
476 FileNotFoundError
477 A URI has been requested for a dataset that does not exist and
478 guessing is not allowed.
479 AssertionError
480 Raised if an internal error occurs.
481 """
482 primary, _ = self.getURIs(ref, predict)
483 if primary is None:
484 # This should be impossible since this datastore does
485 # not disassemble. This check also helps mypy.
486 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
487 return primary
489 def retrieveArtifacts(
490 self,
491 refs: Iterable[DatasetRef],
492 destination: ResourcePath,
493 transfer: str = "auto",
494 preserve_path: bool = True,
495 overwrite: Optional[bool] = False,
496 ) -> List[ResourcePath]:
497 """Retrieve the file artifacts associated with the supplied refs.
499 Notes
500 -----
501 Not implemented by this datastore.
502 """
503 # Could conceivably launch a FileDatastore to use formatters to write
504 # the data but this is fraught with problems.
505 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
507 def forget(self, refs: Iterable[DatasetRef]) -> None:
508 # Docstring inherited.
509 refs = list(refs)
510 self._bridge.forget(refs)
511 for ref in refs:
512 self.removeStoredItemInfo(ref)
514 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None:
515 """Indicate to the Datastore that a dataset can be removed.
517 Parameters
518 ----------
519 ref : `DatasetRef` or iterable thereof
520 Reference to the required Dataset(s).
521 ignore_errors : `bool`, optional
522 Indicate that errors should be ignored.
524 Raises
525 ------
526 FileNotFoundError
527 Attempt to remove a dataset that does not exist. Only relevant
528 if a single dataset ref is given.
530 Notes
531 -----
532 Concurrency should not normally be an issue for the in-memory
533 datastore since all internal changes are isolated to this process,
534 and the registry only changes rows associated with this process.
535 """
536 if not isinstance(ref, DatasetRef):
537 log.debug("Bulk trashing of datasets in datastore %s", self.name)
538 self.bridge.moveToTrash(ref)
539 return
541 log.debug("Trash %s in datastore %s", ref, self.name)
543 # Check that this dataset is known to datastore
544 try:
545 self._get_dataset_info(ref)
547 # Move datasets to trash table
548 self.bridge.moveToTrash([ref])
549 except Exception as e:
550 if ignore_errors:
551 log.warning(
552 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
553 )
554 else:
555 raise
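# Example (illustrative sketch, not part of the module): assuming ``datastore``
# and resolved refs as above, both single and bulk forms are accepted:
#
#     datastore.trash(ref)                # single ref; existence is checked
#     datastore.trash([ref1, ref2])       # bulk; moved to trash without checks
#     datastore.emptyTrash()              # actually drops the stored objects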
557 def emptyTrash(self, ignore_errors: bool = False) -> None:
558 """Remove all datasets from the trash.
560 Parameters
561 ----------
562 ignore_errors : `bool`, optional
563 Ignore errors.
565 Notes
566 -----
567 The internal tracking of datasets is affected by this method and
568 transaction handling is not supported if there is a problem before
569 the datasets themselves are deleted.
571 Concurrency should not normally be an issue for the in-memory
572 datastore since all internal changes are isolated to this process,
573 and the registry only changes rows associated with this process.
574 """
575 log.debug("Emptying trash in datastore %s", self.name)
576 with self._bridge.emptyTrash() as trash_data:
577 trashed, _ = trash_data
578 for ref, _ in trashed:
579 try:
580 realID, _ = self._get_dataset_info(ref)
581 except FileNotFoundError:
582 # Dataset already removed so ignore it
583 continue
584 except Exception as e:
585 if ignore_errors:
586 log.warning(
587 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
588 self.name,
589 ref.id,
590 e,
591 )
592 continue
593 else:
594 raise
596 # Determine whether all references to this dataset have been
597 # removed and we can delete the dataset itself
598 allRefs = self.related[realID]
599 remainingRefs = allRefs - {ref.id}
600 if not remainingRefs:
601 log.debug("Removing artifact %s from datastore %s", realID, self.name)
602 del self.datasets[realID]
604 # Remove this entry
605 self.removeStoredItemInfo(ref)
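# Example (illustrative sketch, not part of the module): the "last reference"
# check above is a set difference.  With made-up IDs, trashing a component
# while the composite's own ref is still registered keeps the stored object:
#
#     related = {"composite-id": {"composite-id", "component-id"}}
#     remaining = related["composite-id"] - {"component-id"}
#     assert remaining == {"composite-id"}   # object not deleted yet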
607 def validateConfiguration(
608 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
609 ) -> None:
610 """Validate some of the configuration for this datastore.
612 Parameters
613 ----------
614 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
615 Entities to test against this configuration. Can be differing
616 types.
617 logFailures : `bool`, optional
618 If `True`, output a log message for every validation error
619 detected.
621 Raises
622 ------
623 DatastoreValidationError
624 Raised if there is a validation problem with a configuration.
625 All the problems are reported in a single exception.
627 Notes
628 -----
629 This method is a no-op.
630 """
631 return
633 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
634 # Docstring is inherited from base class
635 return transfer
637 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
638 # Docstring is inherited from base class
639 return
641 def getLookupKeys(self) -> Set[LookupKey]:
642 # Docstring is inherited from base class
643 return self.constraints.getLookupKeys()
645 def needs_expanded_data_ids(
646 self,
647 transfer: Optional[str],
648 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
649 ) -> bool:
650 # Docstring inherited.
651 return False
653 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
654 # Docstring inherited from the base class.
655 return
657 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
658 # Docstring inherited from the base class.
660 # In-memory Datastore records cannot be exported or imported
661 return {}