Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 86%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""In-memory datastore."""
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import logging
29import time
30from dataclasses import dataclass
31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
32from urllib.parse import urlencode
34from lsst.daf.butler import ButlerURI, DatasetId, DatasetRef, StorageClass, StoredDatastoreItemInfo
35from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
37from .genericDatastore import GenericBaseDatastore
39if TYPE_CHECKING: 39 ↛ 40 (condition on line 39 was never true)
40 from lsst.daf.butler import Config, DatasetType, LookupKey
41 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
43log = logging.getLogger(__name__)
46@dataclass(frozen=True)
47class StoredMemoryItemInfo(StoredDatastoreItemInfo):
48 """Internal InMemoryDatastore Metadata associated with a stored
49 DatasetRef.
50 """
52 __slots__ = {"timestamp", "storageClass", "parentID"}
54 timestamp: float
55 """Unix timestamp indicating the time the dataset was stored."""
57 storageClass: StorageClass
58 """StorageClass associated with the dataset."""
60 parentID: DatasetId
61 """ID of the parent `DatasetRef` if this entry is a concrete
62 composite. Not used if the dataset being stored is not a
63 virtual component of a composite.
64 """
67class InMemoryDatastore(GenericBaseDatastore):
68 """Basic Datastore for writing to an in memory cache.
70 This datastore is ephemeral in that the contents of the datastore
71 disappear when the Python process completes. This also means that
72 other processes can not access this datastore.
74 Parameters
75 ----------
76 config : `DatastoreConfig` or `str`
77 Configuration.
78 bridgeManager : `DatastoreRegistryBridgeManager`
79 Object that manages the interface between `Registry` and datastores.
80 butlerRoot : `str`, optional
81 Unused parameter.
83 Notes
84 -----
85 InMemoryDatastore does not support any file-based ingest.
86 """
88 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
89 """Path to configuration defaults. Accessed within the ``configs`` resource
90 or relative to a search path. Can be None if no defaults specified.
91 """
93 isEphemeral = True
94 """A new datastore is created every time and datasets disappear when
95 the process shuts down."""
97 datasets: Dict[DatasetId, Any]
98 """Internal storage of datasets indexed by dataset ID."""
100 records: Dict[DatasetId, StoredMemoryItemInfo]
101 """Internal records about stored datasets."""
103 def __init__(
104 self,
105 config: Union[Config, str],
106 bridgeManager: DatastoreRegistryBridgeManager,
107 butlerRoot: Optional[str] = None,
108 ):
109 super().__init__(config, bridgeManager)
111 # Name ourselves with the timestamp at which the datastore
112 # was created.
113 self.name = "{}@{}".format(type(self).__name__, time.time())
114 log.debug("Creating datastore %s", self.name)
116 # Storage of datasets, keyed by dataset_id
117 self.datasets: Dict[DatasetId, Any] = {}
119 # Records are kept distinct in order to track concrete composite
120 # components, where we register multiple components for a single dataset.
121 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}
123 # Related records that share the same parent
124 self.related: Dict[DatasetId, Set[DatasetId]] = {}
126 self._bridge = bridgeManager.register(self.name, ephemeral=True)
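# Taken together, the three mappings above form the datastore's bookkeeping,
# all keyed by dataset ID: ``datasets`` holds the stored Python objects,
# ``records`` holds one StoredMemoryItemInfo per registered ref (including
# component refs), and ``related`` maps a parent ID to the set of ref IDs
# that share the same stored object, so the object is only discarded when
# the last of those refs is removed.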
128 @classmethod
129 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
130 """Set any filesystem-dependent config options for this Datastore to
131 be appropriate for a new empty repository with the given root.
133 Does nothing in this implementation.
135 Parameters
136 ----------
137 root : `str`
138 Filesystem path to the root of the data repository.
139 config : `Config`
140 A `Config` to update. Only the subset understood by
141 this component will be updated. Will not expand
142 defaults.
143 full : `Config`
144 A complete config with all defaults expanded that can be
145 converted to a `DatastoreConfig`. Read-only and will not be
146 modified by this method.
147 Repository-specific options that should not be obtained
148 from defaults when Butler instances are constructed
149 should be copied from ``full`` to ``config``.
150 overwrite : `bool`, optional
151 If `False`, do not modify a value in ``config`` if the value
152 already exists. Default is always to overwrite with the provided
153 ``root``.
155 Notes
156 -----
157 If a keyword is explicitly defined in the supplied ``config`` it
158 will not be overridden by this method if ``overwrite`` is `False`.
159 This allows explicit values set in external configs to be retained.
160 """
161 return
163 @property
164 def bridge(self) -> DatastoreRegistryBridge:
165 # Docstring inherited from GenericBaseDatastore.
166 return self._bridge
168 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
169 # Docstring inherited from GenericBaseDatastore.
170 for ref, info in zip(refs, infos):
171 if ref.id is None: 171 ↛ 172 (condition on line 171 was never true)
172 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
173 self.records[ref.id] = info
174 self.related.setdefault(info.parentID, set()).add(ref.id)
176 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
177 # Docstring inherited from GenericBaseDatastore.
178 if ref.id is None: 178 ↛ 179 (condition on line 178 was never true)
179 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
180 return self.records[ref.id]
182 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
183 # Docstring inherited from GenericBaseDatastore.
184 return [self.getStoredItemInfo(ref)]
186 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
187 # Docstring inherited from GenericBaseDatastore.
188 # If a component has been removed previously then we can sometimes
189 # be asked to remove it again. Other datastores ignore this
190 # so also ignore here
191 if ref.id is None: 191 ↛ 192 (condition on line 191 was never true)
192 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
193 if ref.id not in self.records:
194 return
195 record = self.records[ref.id]
196 del self.records[ref.id]
197 self.related[record.parentID].remove(ref.id)
199 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
200 """Check that the dataset is present and return the real ID and
201 associated information.
203 Parameters
204 ----------
205 ref : `DatasetRef`
206 Target `DatasetRef`
208 Returns
209 -------
210 realID : `int`
211 The dataset ID associated with this ref that should be used. This
212 could either be the ID of the supplied `DatasetRef` or the parent.
213 storageInfo : `StoredMemoryItemInfo`
214 Associated storage information.
216 Raises
217 ------
218 FileNotFoundError
219 Raised if the dataset is not present in this datastore.
220 """
221 try:
222 storedItemInfo = self.getStoredItemInfo(ref)
223 except KeyError:
224 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
225 realID = ref.id
226 if storedItemInfo.parentID is not None: 226 ↛ 229 (condition on line 226 was never false)
227 realID = storedItemInfo.parentID
229 if realID not in self.datasets: 229 ↛ 230 (condition on line 229 was never true)
230 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
232 return realID, storedItemInfo
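# Illustrative sketch of the parent-ID resolution above, using hypothetical
# IDs and plain dicts in place of the real StoredMemoryItemInfo records: a
# component ref (ID 9) registered against a composite stored under ID 7
# resolves to the parent's entry in ``datasets``.
#
#     >>> records = {9: 7}                    # ref ID -> parentID
#     >>> datasets = {7: "stored composite"}  # parent ID -> Python object
#     >>> real_id = records[9] if records[9] is not None else 9
#     >>> datasets[real_id]
#     'stored composite'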
234 def knows(self, ref: DatasetRef) -> bool:
235 """Check if the dataset is known to the datastore.
237 This datastore does not distinguish dataset existence from knowledge
238 of a dataset.
240 Parameters
241 ----------
242 ref : `DatasetRef`
243 Reference to the required dataset.
245 Returns
246 -------
247 exists : `bool`
248 `True` if the dataset is known to the datastore.
249 """
250 return self.exists(ref)
252 def exists(self, ref: DatasetRef) -> bool:
253 """Check if the dataset exists in the datastore.
255 Parameters
256 ----------
257 ref : `DatasetRef`
258 Reference to the required dataset.
260 Returns
261 -------
262 exists : `bool`
263 `True` if the entity exists in the `Datastore`.
264 """
265 try:
266 self._get_dataset_info(ref)
267 except FileNotFoundError:
268 return False
269 return True
271 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
272 """Load an InMemoryDataset from the store.
274 Parameters
275 ----------
276 ref : `DatasetRef`
277 Reference to the required Dataset.
278 parameters : `dict`
279 `StorageClass`-specific parameters that specify, for example,
280 a slice of the dataset to be loaded.
282 Returns
283 -------
284 inMemoryDataset : `object`
285 Requested dataset or slice thereof as an InMemoryDataset.
287 Raises
288 ------
289 FileNotFoundError
290 Requested dataset can not be retrieved.
291 TypeError
292 Return value from formatter has unexpected type.
293 ValueError
294 Formatter failed to process the dataset.
295 """
297 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
299 realID, storedItemInfo = self._get_dataset_info(ref)
301 # We have a write storage class and a read storage class and they
302 # can be different for concrete composites.
303 readStorageClass = ref.datasetType.storageClass
304 writeStorageClass = storedItemInfo.storageClass
306 component = ref.datasetType.component()
308 # Check that the supplied parameters are suitable for the type read
309 # If this is a derived component we validate against the composite
310 isDerivedComponent = False
311 if component in writeStorageClass.derivedComponents:
312 writeStorageClass.validateParameters(parameters)
313 isDerivedComponent = True
314 else:
315 readStorageClass.validateParameters(parameters)
317 inMemoryDataset = self.datasets[realID]
319 # if this is a derived component we need to apply parameters
320 # before we retrieve the component. We assume that the parameters
321 # will affect the data globally, before the derived component
322 # is selected.
323 if isDerivedComponent:
324 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
325 # Then disable parameters for later
326 parameters = {}
328 # Different storage classes implies a component request
329 if readStorageClass != writeStorageClass:
331 if component is None: 331 ↛ 332 (condition on line 331 was never true)
332 raise ValueError(
333 "Storage class inconsistency ({} vs {}) but no"
334 " component requested".format(readStorageClass.name, writeStorageClass.name)
335 )
337 # Concrete composite written as a single object (we hope)
338 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
340 # Since there is no formatter to process parameters, they all must be
341 # passed to the assembler.
342 return self._post_process_get(
343 inMemoryDataset, readStorageClass, parameters, isComponent=component is not None
344 )
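# Illustrative sketch of the ordering above, using a toy dict in place of a
# real composite and a hypothetical "slice" parameter rather than the actual
# StorageClass delegate API: for a derived component the parameters are
# applied to the whole composite first, and the component is extracted from
# the result.
#
#     >>> composite = {"image": [1, 2, 3, 4], "mask": [0, 0, 1, 0]}
#     >>> parameters = {"slice": slice(0, 2)}
#     >>> trimmed = {k: v[parameters["slice"]] for k, v in composite.items()}
#     >>> trimmed["mask"]  # derived component selected afterwards
#     [0, 0]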
346 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
347 """Write a InMemoryDataset with a given `DatasetRef` to the store.
349 Parameters
350 ----------
351 inMemoryDataset : `object`
352 The dataset to store.
353 ref : `DatasetRef`
354 Reference to the associated Dataset.
356 Raises
357 ------
358 TypeError
359 Supplied object and storage class are inconsistent.
360 DatasetTypeNotSupportedError
361 The associated `DatasetType` is not handled by this datastore.
363 Notes
364 -----
365 If the datastore is configured to reject certain dataset types it
366 is possible that the put will fail and raise a
367 `DatasetTypeNotSupportedError`. The main use case for this is to
368 allow `ChainedDatastore` to put to multiple datastores without
369 requiring that every datastore accepts the dataset.
370 """
372 if ref.id is None: 372 ↛ 373 (condition on line 372 was never true)
373 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
375 self._validate_put_parameters(inMemoryDataset, ref)
377 self.datasets[ref.id] = inMemoryDataset
378 log.debug("Store %s in %s", ref, self.name)
380 # Store time we received this content, to allow us to optionally
381 # expire it. Instead of storing a filename here, we include the
382 # ID of this datasetRef so we can find it from components.
383 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass, parentID=ref.id)
385 # We have to register this content with registry.
386 # Currently this assumes we have a file so we need to use stub entries
387 # TODO: Add to ephemeral part of registry
388 self._register_datasets([(ref, itemInfo)])
390 if self._transaction is not None:
391 self._transaction.registerUndo("put", self.remove, ref)
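# Note on the undo hook above: if the enclosing transaction is rolled back,
# ``self.remove(ref)`` is called to take the object out of the datastore
# again, so an aborted put (for example one half of a ChainedDatastore put
# that fails elsewhere) does not leave a stray in-memory copy behind.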
393 def getURIs(
394 self, ref: DatasetRef, predict: bool = False
395 ) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
396 """Return URIs associated with dataset.
398 Parameters
399 ----------
400 ref : `DatasetRef`
401 Reference to the required dataset.
402 predict : `bool`, optional
403 If the datastore does not know about the dataset, should it
404 return a predicted URI or not?
406 Returns
407 -------
408 primary : `ButlerURI`
409 The URI to the primary artifact associated with this dataset.
410 If the dataset was disassembled within the datastore this
411 may be `None`.
412 components : `dict`
413 URIs to any components associated with the dataset artifact.
414 Can be empty if there are no components.
416 Notes
417 -----
418 The URIs returned for in-memory datastores are not usable but
419 provide an indication of the associated dataset.
420 """
422 # Include the dataID as a URI query
423 query = urlencode(ref.dataId)
425 # if this has never been written then we have to guess
426 if not self.exists(ref):
427 if not predict:
428 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
429 name = f"{ref.datasetType.name}"
430 fragment = "#predicted"
431 else:
432 realID, _ = self._get_dataset_info(ref)
433 name = f"{id(self.datasets[realID])}?{query}"
434 fragment = ""
436 return ButlerURI(f"mem://{name}?{query}{fragment}"), {}
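# Illustrative sketch of the URI built above, assuming a hypothetical dataId
# of {"instrument": "HSC", "visit": 42}: the dataId is carried as a URL
# query string and the scheme is always "mem://"; the URI identifies the
# dataset but cannot be dereferenced.
#
#     >>> from urllib.parse import urlencode
#     >>> urlencode({"instrument": "HSC", "visit": 42})
#     'instrument=HSC&visit=42'
#
# An existing dataset therefore gets a URI such as
# "mem://140012345678?instrument=HSC&visit=42", where the leading number is
# the id() of the stored object, while a predicted one uses the dataset type
# name and ends in "#predicted".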
438 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
439 """URI to the Dataset.
441 Always uses "mem://" URI prefix.
443 Parameters
444 ----------
445 ref : `DatasetRef`
446 Reference to the required Dataset.
447 predict : `bool`
448 If `True`, allow URIs to be returned for datasets that have not
449 been written.
451 Returns
452 -------
453 uri : `ButlerURI`
454 URI pointing to the dataset within the datastore. If the
455 dataset does not exist in the datastore, and if ``predict`` is
456 `True`, the URI will be a prediction and will include a URI
457 fragment "#predicted".
458 If the datastore does not have entities that relate well
459 to the concept of a URI the returned URI string will be
460 descriptive. The returned URI is not guaranteed to be obtainable.
462 Raises
463 ------
464 FileNotFoundError
465 A URI has been requested for a dataset that does not exist and
466 guessing is not allowed.
467 AssertionError
468 Raised if an internal error occurs.
469 """
470 primary, _ = self.getURIs(ref, predict)
471 if primary is None: 471 ↛ 474 (condition on line 471 was never true)
472 # This should be impossible since this datastore does
473 # not disassemble. This check also helps mypy.
474 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
475 return primary
477 def retrieveArtifacts(
478 self,
479 refs: Iterable[DatasetRef],
480 destination: ButlerURI,
481 transfer: str = "auto",
482 preserve_path: bool = True,
483 overwrite: Optional[bool] = False,
484 ) -> List[ButlerURI]:
485 """Retrieve the file artifacts associated with the supplied refs.
487 Notes
488 -----
489 Not implemented by this datastore.
490 """
491 # Could conceivably launch a FileDatastore to use formatters to write
492 # the data but this is fraught with problems.
493 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
495 def forget(self, refs: Iterable[DatasetRef]) -> None:
496 # Docstring inherited.
497 refs = list(refs)
498 self._bridge.forget(refs)
499 for ref in refs:
500 self.removeStoredItemInfo(ref)
502 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None:
503 """Indicate to the Datastore that a dataset can be removed.
505 Parameters
506 ----------
507 ref : `DatasetRef` or iterable thereof
508 Reference to the required Dataset(s).
509 ignore_errors : `bool`, optional
510 Indicate that errors should be ignored.
512 Raises
513 ------
514 FileNotFoundError
515 Attempt to remove a dataset that does not exist. Only relevant
516 if a single dataset ref is given.
518 Notes
519 -----
520 Concurrency should not normally be an issue for the in-memory datastore
521 since all internal changes are isolated to this process and
522 the registry only changes rows associated with this process.
523 """
524 if not isinstance(ref, DatasetRef):
525 log.debug("Bulk trashing of datasets in datastore %s", self.name)
526 self.bridge.moveToTrash(ref)
527 return
529 log.debug("Trash %s in datastore %s", ref, self.name)
531 # Check that this dataset is known to datastore
532 try:
533 self._get_dataset_info(ref)
535 # Move datasets to trash table
536 self.bridge.moveToTrash([ref])
537 except Exception as e:
538 if ignore_errors: 538 ↛ 539 (condition on line 538 was never true)
539 log.warning(
540 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
541 )
542 else:
543 raise
545 def emptyTrash(self, ignore_errors: bool = False) -> None:
546 """Remove all datasets from the trash.
548 Parameters
549 ----------
550 ignore_errors : `bool`, optional
551 Ignore errors.
553 Notes
554 -----
555 The internal tracking of datasets is affected by this method and
556 transaction handling is not supported if there is a problem before
557 the datasets themselves are deleted.
559 Concurrency should not normally be an issue for the in-memory datastore
560 since all internal changes are isolated to this process and
561 the registry only changes rows associated with this process.
562 """
563 log.debug("Emptying trash in datastore %s", self.name)
564 with self._bridge.emptyTrash() as trash_data:
565 trashed, _ = trash_data
566 for ref, _ in trashed:
567 try:
568 realID, _ = self._get_dataset_info(ref)
569 except FileNotFoundError: 569 ↛ 572 (line 569 didn't jump to line 572)
570 # Dataset already removed so ignore it
571 continue
572 except Exception as e:
573 if ignore_errors:
574 log.warning(
575 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
576 self.name,
577 ref.id,
578 e,
579 )
580 continue
581 else:
582 raise
584 # Determine whether all references to this dataset have been
585 # removed and we can delete the dataset itself
586 allRefs = self.related[realID]
587 remainingRefs = allRefs - {ref.id}
588 if not remainingRefs: 588 ↛ 593 (condition on line 588 was never false)
589 log.debug("Removing artifact %s from datastore %s", realID, self.name)
590 del self.datasets[realID]
592 # Remove this entry
593 self.removeStoredItemInfo(ref)
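# Illustrative sketch of the reference counting above, with hypothetical IDs:
# the stored object is deleted only once no other ref (for example a
# component ref) still points at the same parent ID.
#
#     >>> all_refs = {7, 9}             # refs sharing parent ID 7
#     >>> bool(all_refs - {9})          # trashing ref 9: keep datasets[7]
#     True
#     >>> bool(all_refs - {9} - {7})    # trashing ref 7 too: delete datasets[7]
#     False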
595 def validateConfiguration(
596 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
597 ) -> None:
598 """Validate some of the configuration for this datastore.
600 Parameters
601 ----------
602 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
603 Entities to test against this configuration. Can be differing
604 types.
605 logFailures : `bool`, optional
606 If `True`, output a log message for every validation error
607 detected.
609 Raises
610 ------
611 DatastoreValidationError
612 Raised if there is a validation problem with a configuration.
613 All the problems are reported in a single exception.
615 Notes
616 -----
617 This method is a no-op.
618 """
619 return
621 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
622 # Docstring is inherited from base class
623 return transfer
625 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
626 # Docstring is inherited from base class
627 return
629 def getLookupKeys(self) -> Set[LookupKey]:
630 # Docstring is inherited from base class
631 return self.constraints.getLookupKeys()
633 def needs_expanded_data_ids(
634 self,
635 transfer: Optional[str],
636 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
637 ) -> bool:
638 # Docstring inherited.
639 return False
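# A minimal sketch of how a repository might be pointed at this datastore,
# assuming the usual ``cls`` key in the datastore section of a butler
# configuration (the shipped defaults live in datastores/inMemoryDatastore.yaml):
#
#     datastore:
#       cls: lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore
#
# Because ``isEphemeral`` is `True`, anything put through such a repository
# lives only for the duration of the Python process.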