Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 86%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""In-memory datastore."""
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import logging
29import time
30from dataclasses import dataclass
31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
32from urllib.parse import urlencode
34from lsst.daf.butler import DatasetId, DatasetRef, StorageClass, StoredDatastoreItemInfo
35from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
36from lsst.resources import ResourcePath
38from .genericDatastore import GenericBaseDatastore
40if TYPE_CHECKING:  [40 ↛ 41: line 40 didn't jump to line 41, because the condition on line 40 was never true]
41 from lsst.daf.butler import Config, DatasetType, LookupKey
42 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
44log = logging.getLogger(__name__)
47@dataclass(frozen=True)
48class StoredMemoryItemInfo(StoredDatastoreItemInfo):
49 """Internal InMemoryDatastore Metadata associated with a stored
50 DatasetRef.
51 """
53 __slots__ = {"timestamp", "storageClass", "parentID"}
55 timestamp: float
56 """Unix timestamp indicating the time the dataset was stored."""
58 storageClass: StorageClass
59 """StorageClass associated with the dataset."""
61 parentID: DatasetId
62 """ID of the parent `DatasetRef` if this entry is a concrete
63 composite. Not used if the dataset being stored is not a
64 virtual component of a composite.
65 """
68class InMemoryDatastore(GenericBaseDatastore):
69 """Basic Datastore for writing to an in memory cache.
71 This datastore is ephemeral in that the contents of the datastore
72 disappear when the Python process completes. This also means that
73 other processes cannot access this datastore.
75 Parameters
76 ----------
77 config : `DatastoreConfig` or `str`
78 Configuration.
79 bridgeManager : `DatastoreRegistryBridgeManager`
80 Object that manages the interface between `Registry` and datastores.
81 butlerRoot : `str`, optional
82 Unused parameter.
84 Notes
85 -----
86 InMemoryDatastore does not support any file-based ingest.
87 """
89 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
90 """Path to configuration defaults. Accessed within the ``configs`` resource
91 or relative to a search path. Can be `None` if no defaults are specified.
92 """
94 isEphemeral = True
95 """A new datastore is created every time and datasets disappear when
96 the process shuts down."""
98 datasets: Dict[DatasetId, Any]
99 """Internal storage of datasets indexed by dataset ID."""
101 records: Dict[DatasetId, StoredMemoryItemInfo]
102 """Internal records about stored datasets."""
104 def __init__(
105 self,
106 config: Union[Config, str],
107 bridgeManager: DatastoreRegistryBridgeManager,
108 butlerRoot: Optional[str] = None,
109 ):
110 super().__init__(config, bridgeManager)
112 # Name ourselves with the timestamp at which the datastore
113 # was created.
114 self.name = "{}@{}".format(type(self).__name__, time.time())
115 log.debug("Creating datastore %s", self.name)
117 # Storage of datasets, keyed by dataset_id
118 self.datasets: Dict[DatasetId, Any] = {}
120 # Records is distinct in order to track concrete composite components
121 # where we register multiple components for a single dataset.
122 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}
124 # Related records that share the same parent
125 self.related: Dict[DatasetId, Set[DatasetId]] = {}
127 self._bridge = bridgeManager.register(self.name, ephemeral=True)
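# Editor's note (illustrative, with made-up IDs): after a composite and one of
# its components have been registered, ``records`` holds one entry per ref,
# each pointing at the same ``parentID``, while ``datasets`` stores the
# composite object once and ``related`` groups the refs sharing that parent:
#
#     datasets = {parent_id: composite_object}
#     records  = {parent_id: info_parent, component_id: info_component}
#     related  = {parent_id: {parent_id, component_id}}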
129 @classmethod
130 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
131 """Set any filesystem-dependent config options for this Datastore to
132 be appropriate for a new empty repository with the given root.
134 Does nothing in this implementation.
136 Parameters
137 ----------
138 root : `str`
139 Filesystem path to the root of the data repository.
140 config : `Config`
141 A `Config` to update. Only the subset understood by
142 this component will be updated. Will not expand
143 defaults.
144 full : `Config`
145 A complete config with all defaults expanded that can be
146 converted to a `DatastoreConfig`. Read-only and will not be
147 modified by this method.
148 Repository-specific options that should not be obtained
149 from defaults when Butler instances are constructed
150 should be copied from ``full`` to ``config``.
151 overwrite : `bool`, optional
152 If `False`, do not modify a value in ``config`` if the value
153 already exists. Default is always to overwrite with the provided
154 ``root``.
156 Notes
157 -----
158 If a keyword is explicitly defined in the supplied ``config`` it
159 will not be overridden by this method if ``overwrite`` is `False`.
160 This allows explicit values set in external configs to be retained.
161 """
162 return
164 @property
165 def bridge(self) -> DatastoreRegistryBridge:
166 # Docstring inherited from GenericBaseDatastore.
167 return self._bridge
169 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
170 # Docstring inherited from GenericBaseDatastore.
171 for ref, info in zip(refs, infos):
172 if ref.id is None:  [172 ↛ 173: line 172 didn't jump to line 173, because the condition on line 172 was never true]
173 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
174 self.records[ref.id] = info
175 self.related.setdefault(info.parentID, set()).add(ref.id)
177 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
178 # Docstring inherited from GenericBaseDatastore.
179 if ref.id is None:  [179 ↛ 180: line 179 didn't jump to line 180, because the condition on line 179 was never true]
180 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
181 return self.records[ref.id]
183 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
184 # Docstring inherited from GenericBaseDatastore.
185 return [self.getStoredItemInfo(ref)]
187 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
188 # Docstring inherited from GenericBaseDatastore.
189 # If a component has been removed previously then we can sometimes
190 # be asked to remove it again. Other datastores ignore this
191 # so also ignore it here.
192 if ref.id is None:  [192 ↛ 193: line 192 didn't jump to line 193, because the condition on line 192 was never true]
193 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
194 if ref.id not in self.records:
195 return
196 record = self.records[ref.id]
197 del self.records[ref.id]
198 self.related[record.parentID].remove(ref.id)
200 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
201 """Check that the dataset is present and return the real ID and
202 associated information.
204 Parameters
205 ----------
206 ref : `DatasetRef`
207 Target `DatasetRef`.
209 Returns
210 -------
211 realID : `int`
212 The dataset ID associated with this ref that should be used. This
213 could either be the ID of the supplied `DatasetRef` or the parent.
214 storageInfo : `StoredMemoryItemInfo`
215 Associated storage information.
217 Raises
218 ------
219 FileNotFoundError
220 Raised if the dataset is not present in this datastore.
221 """
222 try:
223 storedItemInfo = self.getStoredItemInfo(ref)
224 except KeyError:
225 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
226 realID = ref.id
227 if storedItemInfo.parentID is not None:  [227 ↛ 230: line 227 didn't jump to line 230, because the condition on line 227 was never false]
228 realID = storedItemInfo.parentID
230 if realID not in self.datasets:  [230 ↛ 231: line 230 didn't jump to line 231, because the condition on line 230 was never true]
231 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
233 return realID, storedItemInfo
235 def knows(self, ref: DatasetRef) -> bool:
236 """Check if the dataset is known to the datastore.
238 This datastore does not distinguish dataset existence from knowledge
239 of a dataset.
241 Parameters
242 ----------
243 ref : `DatasetRef`
244 Reference to the required dataset.
246 Returns
247 -------
248 exists : `bool`
249 `True` if the dataset is known to the datastore.
250 """
251 return self.exists(ref)
253 def exists(self, ref: DatasetRef) -> bool:
254 """Check if the dataset exists in the datastore.
256 Parameters
257 ----------
258 ref : `DatasetRef`
259 Reference to the required dataset.
261 Returns
262 -------
263 exists : `bool`
264 `True` if the entity exists in the `Datastore`.
265 """
266 try:
267 self._get_dataset_info(ref)
268 except FileNotFoundError:
269 return False
270 return True
272 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
273 """Load an InMemoryDataset from the store.
275 Parameters
276 ----------
277 ref : `DatasetRef`
278 Reference to the required Dataset.
279 parameters : `dict`
280 `StorageClass`-specific parameters that specify, for example,
281 a slice of the dataset to be loaded.
283 Returns
284 -------
285 inMemoryDataset : `object`
286 Requested dataset or slice thereof as an InMemoryDataset.
288 Raises
289 ------
290 FileNotFoundError
291 Requested dataset cannot be retrieved.
292 TypeError
293 Return value from formatter has unexpected type.
294 ValueError
295 Formatter failed to process the dataset.
296 """
298 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
300 realID, storedItemInfo = self._get_dataset_info(ref)
302 # We have a write storage class and a read storage class and they
303 # can be different for concrete composites.
304 readStorageClass = ref.datasetType.storageClass
305 writeStorageClass = storedItemInfo.storageClass
307 component = ref.datasetType.component()
309 # Check that the supplied parameters are suitable for the type read
310 # If this is a derived component we validate against the composite
311 isDerivedComponent = False
312 if component in writeStorageClass.derivedComponents:
313 writeStorageClass.validateParameters(parameters)
314 isDerivedComponent = True
315 else:
316 readStorageClass.validateParameters(parameters)
318 inMemoryDataset = self.datasets[realID]
320 # If this is a derived (read-only) component we need to apply parameters
321 # before we retrieve the component. We assume that the parameters
322 # will affect the data globally, before the derived component
323 # is selected.
324 if isDerivedComponent:
325 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
326 # Then disable parameters for later
327 parameters = {}
329 # Different storage classes imply a component request
330 if readStorageClass != writeStorageClass:
332 if component is None: 332 ↛ 333line 332 didn't jump to line 333, because the condition on line 332 was never true
333 raise ValueError(
334 "Storage class inconsistency ({} vs {}) but no"
335 " component requested".format(readStorageClass.name, writeStorageClass.name)
336 )
338 # Concrete composite written as a single object (we hope)
339 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
341 # Since there is no formatter to process parameters, they all must be
342 # passed to the assembler.
343 return self._post_process_get(
344 inMemoryDataset, readStorageClass, parameters, isComponent=component is not None
345 )
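# Editor's sketch (illustrative; assumes the composite stored earlier and a
# storage class that defines a "wcs" component): component reads go through
# the same ``get`` call, with the StorageClass delegate extracting the piece:
#
#     wcs_ref = ref.makeComponentRef("wcs")
#     wcs = datastore.get(wcs_ref)   # delegate().getComponent(...) internally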
347 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
348 """Write a InMemoryDataset with a given `DatasetRef` to the store.
350 Parameters
351 ----------
352 inMemoryDataset : `object`
353 The dataset to store.
354 ref : `DatasetRef`
355 Reference to the associated Dataset.
357 Raises
358 ------
359 TypeError
360 Supplied object and storage class are inconsistent.
361 DatasetTypeNotSupportedError
362 The associated `DatasetType` is not handled by this datastore.
364 Notes
365 -----
366 If the datastore is configured to reject certain dataset types it
367 is possible that the put will fail and raise a
368 `DatasetTypeNotSupportedError`. The main use case for this is to
369 allow `ChainedDatastore` to put to multiple datastores without
370 requiring that every datastore accepts the dataset.
371 """
373 if ref.id is None:  [373 ↛ 374: line 373 didn't jump to line 374, because the condition on line 373 was never true]
374 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
376 self._validate_put_parameters(inMemoryDataset, ref)
378 self.datasets[ref.id] = inMemoryDataset
379 log.debug("Store %s in %s", ref, self.name)
381 # Store the time we received this content, to allow us to optionally
382 # expire it. Instead of storing a filename here, we include the
383 # ID of this datasetRef so we can find it from components.
384 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, parentID=ref.id)
386 # We have to register this content with registry.
387 # Currently this assumes we have a file so we need to use stub entries
388 # TODO: Add to ephemeral part of registry
389 self._register_datasets([(ref, itemInfo)])
391 if self._transaction is not None:
392 self._transaction.registerUndo("put", self.remove, ref)
394 def getURIs(
395 self, ref: DatasetRef, predict: bool = False
396 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
397 """Return URIs associated with dataset.
399 Parameters
400 ----------
401 ref : `DatasetRef`
402 Reference to the required dataset.
403 predict : `bool`, optional
404 If the datastore does not know about the dataset, should it
405 return a predicted URI or not?
407 Returns
408 -------
409 primary : `lsst.resources.ResourcePath`
410 The URI to the primary artifact associated with this dataset.
411 If the dataset was disassembled within the datastore this
412 may be `None`.
413 components : `dict`
414 URIs to any components associated with the dataset artifact.
415 Can be empty if there are no components.
417 Notes
418 -----
419 The URIs returned for in-memory datastores are not usable but
420 provide an indication of the associated dataset.
421 """
423 # Include the dataId as a URI query
424 query = urlencode(ref.dataId)
426 # if this has never been written then we have to guess
427 if not self.exists(ref):
428 if not predict:
429 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
430 name = f"{ref.datasetType.name}"
431 fragment = "#predicted"
432 else:
433 realID, _ = self._get_dataset_info(ref)
434 name = f"{id(self.datasets[realID])}?{query}"
435 fragment = ""
437 return ResourcePath(f"mem://{name}?{query}{fragment}"), {}
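# Editor's note (illustrative; the dataset type name and dataId values are
# made up): for a dataset that has not been written, ``predict=True`` yields a
# descriptive URI such as
#
#     mem://calexp?visit=42&instrument=HSC#predicted
#
# where the "#predicted" fragment flags the guess; the URI is informational
# and is not expected to be retrievable.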
439 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
440 """URI to the Dataset.
442 Always uses "mem://" URI prefix.
444 Parameters
445 ----------
446 ref : `DatasetRef`
447 Reference to the required Dataset.
448 predict : `bool`
449 If `True`, allow URIs to be returned of datasets that have not
450 been written.
452 Returns
453 -------
454 uri : `lsst.resources.ResourcePath`
455 URI pointing to the dataset within the datastore. If the
456 dataset does not exist in the datastore, and if ``predict`` is
457 `True`, the URI will be a prediction and will include a URI
458 fragment "#predicted".
459 If the datastore does not have entities that relate well
460 to the concept of a URI, the returned URI string will be
461 descriptive. The returned URI is not guaranteed to be obtainable.
463 Raises
464 ------
465 FileNotFoundError
466 A URI has been requested for a dataset that does not exist and
467 guessing is not allowed.
468 AssertionError
469 Raised if an internal error occurs.
470 """
471 primary, _ = self.getURIs(ref, predict)
472 if primary is None:  [472 ↛ 475: line 472 didn't jump to line 475, because the condition on line 472 was never true]
473 # This should be impossible since this datastore does
474 # not disassemble. This check also helps mypy.
475 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
476 return primary
478 def retrieveArtifacts(
479 self,
480 refs: Iterable[DatasetRef],
481 destination: ResourcePath,
482 transfer: str = "auto",
483 preserve_path: bool = True,
484 overwrite: Optional[bool] = False,
485 ) -> List[ResourcePath]:
486 """Retrieve the file artifacts associated with the supplied refs.
488 Notes
489 -----
490 Not implemented by this datastore.
491 """
492 # Could conceivably launch a FileDatastore to use formatters to write
493 # the data but this is fraught with problems.
494 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
496 def forget(self, refs: Iterable[DatasetRef]) -> None:
497 # Docstring inherited.
498 refs = list(refs)
499 self._bridge.forget(refs)
500 for ref in refs:
501 self.removeStoredItemInfo(ref)
503 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None:
504 """Indicate to the Datastore that a dataset can be removed.
506 Parameters
507 ----------
508 ref : `DatasetRef` or iterable thereof
509 Reference to the required Dataset(s).
510 ignore_errors : `bool`, optional
511 Indicate that errors should be ignored.
513 Raises
514 ------
515 FileNotFoundError
516 Attempt to remove a dataset that does not exist. Only relevant
517 if a single dataset ref is given.
519 Notes
520 -----
521 Concurrency should not normally be an issue for the in-memory datastore
522 since all internal changes are isolated to this process alone and
523 the registry only changes rows associated with this process.
524 """
525 if not isinstance(ref, DatasetRef):
526 log.debug("Bulk trashing of datasets in datastore %s", self.name)
527 self.bridge.moveToTrash(ref)
528 return
530 log.debug("Trash %s in datastore %s", ref, self.name)
532 # Check that this dataset is known to datastore
533 try:
534 self._get_dataset_info(ref)
536 # Move datasets to trash table
537 self.bridge.moveToTrash([ref])
538 except Exception as e:
539 if ignore_errors:  [539 ↛ 540: line 539 didn't jump to line 540, because the condition on line 539 was never true]
540 log.warning(
541 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
542 )
543 else:
544 raise
546 def emptyTrash(self, ignore_errors: bool = False) -> None:
547 """Remove all datasets from the trash.
549 Parameters
550 ----------
551 ignore_errors : `bool`, optional
552 Ignore errors.
554 Notes
555 -----
556 This method modifies the internal tracking of datasets, and
557 transaction handling is not supported if a problem occurs before
558 the datasets themselves are deleted.
560 Concurrency should not normally be an issue for the in-memory datastore
561 since all internal changes are isolated to this process alone and
562 the registry only changes rows associated with this process.
563 """
564 log.debug("Emptying trash in datastore %s", self.name)
565 with self._bridge.emptyTrash() as trash_data:
566 trashed, _ = trash_data
567 for ref, _ in trashed:
568 try:
569 realID, _ = self._get_dataset_info(ref)
570 except FileNotFoundError:  [570 ↛ 573: line 570 didn't jump to line 573]
571 # Dataset already removed so ignore it
572 continue
573 except Exception as e:
574 if ignore_errors:
575 log.warning(
576 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
577 self.name,
578 ref.id,
579 e,
580 )
581 continue
582 else:
583 raise
585 # Determine whether all references to this dataset have been
586 # removed and we can delete the dataset itself
587 allRefs = self.related[realID]
588 remainingRefs = allRefs - {ref.id}
589 if not remainingRefs:  [589 ↛ 594: line 589 didn't jump to line 594, because the condition on line 589 was never false]
590 log.debug("Removing artifact %s from datastore %s", realID, self.name)
591 del self.datasets[realID]
593 # Remove this entry
594 self.removeStoredItemInfo(ref)
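# Editor's sketch (illustrative; ``ref`` assumed stored earlier): removal is a
# two-step operation in which ``trash`` only moves the registry entry to the
# trash table and ``emptyTrash`` drops the cached object once no other refs
# share its parent:
#
#     datastore.trash(ref)
#     datastore.emptyTrash()
#     assert not datastore.exists(ref)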
596 def validateConfiguration(
597 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
598 ) -> None:
599 """Validate some of the configuration for this datastore.
601 Parameters
602 ----------
603 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
604 Entities to test against this configuration. Can be differing
605 types.
606 logFailures : `bool`, optional
607 If `True`, output a log message for every validation error
608 detected.
610 Raises
611 ------
612 DatastoreValidationError
613 Raised if there is a validation problem with a configuration.
614 All the problems are reported in a single exception.
616 Notes
617 -----
618 This method is a no-op.
619 """
620 return
622 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
623 # Docstring is inherited from base class
624 return transfer
626 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
627 # Docstring is inherited from base class
628 return
630 def getLookupKeys(self) -> Set[LookupKey]:
631 # Docstring is inherited from base class
632 return self.constraints.getLookupKeys()
634 def needs_expanded_data_ids(
635 self,
636 transfer: Optional[str],
637 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
638 ) -> bool:
639 # Docstring inherited.
640 return False