Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 87%
190 statements
coverage.py v6.4.1, created at 2022-06-17 02:07 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""In-memory datastore."""
26__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
28import logging
29import time
30from dataclasses import dataclass
31from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
32from urllib.parse import urlencode
34from lsst.daf.butler import DatasetId, DatasetRef, DatastoreRecordData, StorageClass, StoredDatastoreItemInfo
35from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
36from lsst.resources import ResourcePath
38from .genericDatastore import GenericBaseDatastore
40if TYPE_CHECKING: 40 ↛ 41 (line 40 didn't jump to line 41, because the condition on line 40 was never true)
41 from lsst.daf.butler import Config, DatasetType, LookupKey
42 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
44log = logging.getLogger(__name__)
47@dataclass(frozen=True)
48class StoredMemoryItemInfo(StoredDatastoreItemInfo):
49 """Internal InMemoryDatastore Metadata associated with a stored
50 DatasetRef.
51 """
53 __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}
55 timestamp: float
56 """Unix timestamp indicating the time the dataset was stored."""
58 storageClass: StorageClass
59 """StorageClass associated with the dataset."""
61 parentID: DatasetId
62 """ID of the parent `DatasetRef` if this entry is a concrete
63 composite. Not used if the dataset being stored is not a
64 virtual component of a composite.
65 """
67 dataset_id: DatasetId
68 """DatasetId associated with this record."""
71class InMemoryDatastore(GenericBaseDatastore):
72 """Basic Datastore for writing to an in memory cache.
74 This datastore is ephemeral in that the contents of the datastore
75 disappear when the Python process completes. This also means that
76 other processes can not access this datastore.
78 Parameters
79 ----------
80 config : `DatastoreConfig` or `str`
81 Configuration.
82 bridgeManager : `DatastoreRegistryBridgeManager`
83 Object that manages the interface between `Registry` and datastores.
84 butlerRoot : `str`, optional
85 Unused parameter.
87 Notes
88 -----
89 InMemoryDatastore does not support any file-based ingest.
90 """
92 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
93 """Path to configuration defaults. Accessed within the ``configs`` resource
94 or relative to a search path. Can be None if no defaults specified.
95 """
97 isEphemeral = True
98 """A new datastore is created every time and datasets disappear when
99 the process shuts down."""
101 datasets: Dict[DatasetId, Any]
102 """Internal storage of datasets indexed by dataset ID."""
104 records: Dict[DatasetId, StoredMemoryItemInfo]
105 """Internal records about stored datasets."""
107 def __init__(
108 self,
109 config: Union[Config, str],
110 bridgeManager: DatastoreRegistryBridgeManager,
111 butlerRoot: Optional[str] = None,
112 ):
113 super().__init__(config, bridgeManager)
115 # Name ourselves with the timestamp at which the datastore
116 # was created.
117 self.name = "{}@{}".format(type(self).__name__, time.time())
118 log.debug("Creating datastore %s", self.name)
120 # Storage of datasets, keyed by dataset_id
121 self.datasets: Dict[DatasetId, Any] = {}
123 # Records is distinct in order to track concrete composite components
124 # where we register multiple components for a single dataset.
125 self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}
127 # Related records that share the same parent
128 self.related: Dict[DatasetId, Set[DatasetId]] = {}
130 self._bridge = bridgeManager.register(self.name, ephemeral=True)
132 @classmethod
133 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
134 """Set any filesystem-dependent config options for this Datastore to
135 be appropriate for a new empty repository with the given root.
137 Does nothing in this implementation.
139 Parameters
140 ----------
141 root : `str`
142 Filesystem path to the root of the data repository.
143 config : `Config`
144 A `Config` to update. Only the subset understood by
145 this component will be updated. Will not expand
146 defaults.
147 full : `Config`
148 A complete config with all defaults expanded that can be
149 converted to a `DatastoreConfig`. Read-only and will not be
150 modified by this method.
151 Repository-specific options that should not be obtained
152 from defaults when Butler instances are constructed
153 should be copied from ``full`` to ``config``.
154 overwrite : `bool`, optional
155 If `False`, do not modify a value in ``config`` if the value
156 already exists. Default is always to overwrite with the provided
157 ``root``.
159 Notes
160 -----
161 If a keyword is explicitly defined in the supplied ``config`` it
162 will not be overridden by this method if ``overwrite`` is `False`.
163 This allows explicit values set in external configs to be retained.
164 """
165 return
167 @property
168 def bridge(self) -> DatastoreRegistryBridge:
169 # Docstring inherited from GenericBaseDatastore.
170 return self._bridge
172 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
173 # Docstring inherited from GenericBaseDatastore.
174 for ref, info in zip(refs, infos):
175 if ref.id is None: 175 ↛ 176 (line 175 didn't jump to line 176, because the condition on line 175 was never true)
176 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
177 self.records[ref.id] = info
178 self.related.setdefault(info.parentID, set()).add(ref.id)
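    # Sketch of the bookkeeping above (hypothetical IDs): components of a
    # concrete composite share the parent's ID via ``parentID``, so after
    # storing a composite and registering one of its components the maps
    # might look like
    #
    #     records = {composite_id: composite_info, component_id: component_info}
    #     related = {composite_id: {composite_id, component_id}}
    #
    # which is what lets ``emptyTrash`` decide when the last reference to the
    # underlying in-memory object has gone.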
180 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
181 # Docstring inherited from GenericBaseDatastore.
182 if ref.id is None: 182 ↛ 183 (line 182 didn't jump to line 183, because the condition on line 182 was never true)
183 raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
184 return self.records[ref.id]
186 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
187 # Docstring inherited from GenericBaseDatastore.
188 return [self.getStoredItemInfo(ref)]
190 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
191 # Docstring inherited from GenericBaseDatastore.
192 # If a component has been removed previously then we can sometimes
193 # be asked to remove it again. Other datastores ignore this
194 # so also ignore here
195 if ref.id is None: 195 ↛ 196 (line 195 didn't jump to line 196, because the condition on line 195 was never true)
196 raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
197 if ref.id not in self.records:
198 return
199 record = self.records[ref.id]
200 del self.records[ref.id]
201 self.related[record.parentID].remove(ref.id)
203 def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
204 """Check that the dataset is present and return the real ID and
205 associated information.
207 Parameters
208 ----------
209 ref : `DatasetRef`
210 Target `DatasetRef`.
212 Returns
213 -------
214 realID : `DatasetId`
215 The dataset ID associated with this ref that should be used. This
216 could either be the ID of the supplied `DatasetRef` or the parent.
217 storageInfo : `StoredMemoryItemInfo`
218 Associated storage information.
220 Raises
221 ------
222 FileNotFoundError
223 Raised if the dataset is not present in this datastore.
224 """
225 try:
226 storedItemInfo = self.getStoredItemInfo(ref)
227 except KeyError:
228 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
229 realID = ref.id
230 if storedItemInfo.parentID is not None: 230 ↛ 233 (line 230 didn't jump to line 233, because the condition on line 230 was never false)
231 realID = storedItemInfo.parentID
233 if realID not in self.datasets: 233 ↛ 234 (line 233 didn't jump to line 234, because the condition on line 233 was never true)
234 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
236 return realID, storedItemInfo
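    # Worked illustration (hypothetical IDs): for a component ref, the stored
    # record's ``parentID`` points at the composite that actually holds the
    # Python object, so the lookup resolves to the parent's entry:
    #
    #     realID = storedItemInfo.parentID      # ID of the composite
    #     inMemoryDataset = self.datasets[realID]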
238 def knows(self, ref: DatasetRef) -> bool:
239 """Check if the dataset is known to the datastore.
241 This datastore does not distinguish dataset existence from knowledge
242 of a dataset.
244 Parameters
245 ----------
246 ref : `DatasetRef`
247 Reference to the required dataset.
249 Returns
250 -------
251 exists : `bool`
252 `True` if the dataset is known to the datastore.
253 """
254 return self.exists(ref)
256 def exists(self, ref: DatasetRef) -> bool:
257 """Check if the dataset exists in the datastore.
259 Parameters
260 ----------
261 ref : `DatasetRef`
262 Reference to the required dataset.
264 Returns
265 -------
266 exists : `bool`
267 `True` if the entity exists in the `Datastore`.
268 """
269 try:
270 self._get_dataset_info(ref)
271 except FileNotFoundError:
272 return False
273 return True
275 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
276 """Load an InMemoryDataset from the store.
278 Parameters
279 ----------
280 ref : `DatasetRef`
281 Reference to the required Dataset.
282 parameters : `dict`
283 `StorageClass`-specific parameters that specify, for example,
284 a slice of the dataset to be loaded.
286 Returns
287 -------
288 inMemoryDataset : `object`
289 Requested dataset or slice thereof as an InMemoryDataset.
291 Raises
292 ------
293 FileNotFoundError
294 Requested dataset cannot be retrieved.
295 TypeError
296 Return value from formatter has unexpected type.
297 ValueError
298 Formatter failed to process the dataset.
299 """
301 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
303 realID, storedItemInfo = self._get_dataset_info(ref)
305 # We have a write storage class and a read storage class and they
306 # can be different for concrete composites.
307 readStorageClass = ref.datasetType.storageClass
308 writeStorageClass = storedItemInfo.storageClass
310 component = ref.datasetType.component()
312 # Check that the supplied parameters are suitable for the type read
313 # If this is a derived component we validate against the composite
314 isDerivedComponent = False
315 if component in writeStorageClass.derivedComponents:
316 writeStorageClass.validateParameters(parameters)
317 isDerivedComponent = True
318 else:
319 readStorageClass.validateParameters(parameters)
321 inMemoryDataset = self.datasets[realID]
323 # If this is a derived component we need to apply parameters
324 # before we retrieve the component. We assume that the parameters
325 # will affect the data globally, before the derived component
326 # is selected.
327 if isDerivedComponent:
328 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
329 # Then disable parameters for later
330 parameters = {}
332 # Different storage classes implies a component request
333 if readStorageClass != writeStorageClass:
335 if component is None: 335 ↛ 336 (line 335 didn't jump to line 336, because the condition on line 335 was never true)
336 raise ValueError(
337 "Storage class inconsistency ({} vs {}) but no"
338 " component requested".format(readStorageClass.name, writeStorageClass.name)
339 )
341 # Concrete composite written as a single object (we hope)
342 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
344 # Since there is no formatter to process parameters, they all must be
345 # passed to the assembler.
346 return self._post_process_get(
347 inMemoryDataset, readStorageClass, parameters, isComponent=component is not None
348 )
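    # Component sketch (names are hypothetical): when ``ref`` has a dataset
    # type that names a component, e.g. "exposure.wcs", its read storage class
    # differs from the storage class the composite was written with, so the
    # delegate is asked for just that component:
    #
    #     wcs = datastore.get(component_ref)   # returns only the component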
350 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
351 """Write a InMemoryDataset with a given `DatasetRef` to the store.
353 Parameters
354 ----------
355 inMemoryDataset : `object`
356 The dataset to store.
357 ref : `DatasetRef`
358 Reference to the associated Dataset.
360 Raises
361 ------
362 TypeError
363 Supplied object and storage class are inconsistent.
364 DatasetTypeNotSupportedError
365 The associated `DatasetType` is not handled by this datastore.
367 Notes
368 -----
369 If the datastore is configured to reject certain dataset types it
370 is possible that the put will fail and raise a
371 `DatasetTypeNotSupportedError`. The main use case for this is to
372 allow `ChainedDatastore` to put to multiple datastores without
373 requiring that every datastore accepts the dataset.
374 """
376 if ref.id is None: 376 ↛ 377 (line 376 didn't jump to line 377, because the condition on line 376 was never true)
377 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
379 # May need to coerce the in-memory dataset to the correct
380 # Python type, otherwise parameters may not work.
381 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
383 self._validate_put_parameters(inMemoryDataset, ref)
385 self.datasets[ref.id] = inMemoryDataset
386 log.debug("Store %s in %s", ref, self.name)
388 # Store time we received this content, to allow us to optionally
389 # expire it. Instead of storing a filename here, we include the
390 # ID of this datasetRef so we can find it from components.
391 itemInfo = StoredMemoryItemInfo(
392 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.getCheckedId()
393 )
395 # We have to register this content with registry.
396 # Currently this assumes we have a file so we need to use stub entries
397 # TODO: Add to ephemeral part of registry
398 self._register_datasets([(ref, itemInfo)])
400 if self._transaction is not None:
401 self._transaction.registerUndo("put", self.remove, ref)
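    # Minimal usage sketch, assuming ``datastore`` is an InMemoryDatastore
    # wired up via a registry bridge and ``ref`` is a resolved DatasetRef
    # whose storage class accepts a plain dict (all names hypothetical):
    #
    #     datastore.put({"value": 42}, ref)
    #     assert datastore.exists(ref)
    #     assert datastore.get(ref) == {"value": 42}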
403 def getURIs(
404 self, ref: DatasetRef, predict: bool = False
405 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
406 """Return URIs associated with dataset.
408 Parameters
409 ----------
410 ref : `DatasetRef`
411 Reference to the required dataset.
412 predict : `bool`, optional
413 If the datastore does not know about the dataset, should it
414 return a predicted URI or not?
416 Returns
417 -------
418 primary : `lsst.resources.ResourcePath`
419 The URI to the primary artifact associated with this dataset.
420 If the dataset was disassembled within the datastore this
421 may be `None`.
422 components : `dict`
423 URIs to any components associated with the dataset artifact.
424 Can be empty if there are no components.
426 Notes
427 -----
428 The URIs returned for in-memory datastores are not usable but
429 provide an indication of the associated dataset.
430 """
432 # Include the dataID as a URI query
433 query = urlencode(ref.dataId)
435 # if this has never been written then we have to guess
436 if not self.exists(ref):
437 if not predict:
438 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
439 name = f"{ref.datasetType.name}"
440 fragment = "#predicted"
441 else:
442 realID, _ = self._get_dataset_info(ref)
443 name = f"{id(self.datasets[realID])}"
444 fragment = ""
446 return ResourcePath(f"mem://{name}?{query}{fragment}"), {}
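    # Illustrative URI shapes (values are made up): an existing dataset yields
    # something like
    #
    #     mem://139872315?visit=42&detector=1
    #
    # while ``predict=True`` for a dataset that was never written yields
    #
    #     mem://someDatasetType?visit=42&detector=1#predicted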
448 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
449 """URI to the Dataset.
451 Always uses "mem://" URI prefix.
453 Parameters
454 ----------
455 ref : `DatasetRef`
456 Reference to the required Dataset.
457 predict : `bool`
458 If `True`, allow URIs to be returned of datasets that have not
459 been written.
461 Returns
462 -------
463 uri : `lsst.resources.ResourcePath`
464 URI pointing to the dataset within the datastore. If the
465 dataset does not exist in the datastore, and if ``predict`` is
466 `True`, the URI will be a prediction and will include a URI
467 fragment "#predicted".
468 If the datastore does not have entities that relate well
469 to the concept of a URI the returned URI string will be
470 descriptive. The returned URI is not guaranteed to be obtainable.
472 Raises
473 ------
474 FileNotFoundError
475 A URI has been requested for a dataset that does not exist and
476 guessing is not allowed.
477 AssertionError
478 Raised if an internal error occurs.
479 """
480 primary, _ = self.getURIs(ref, predict)
481 if primary is None: 481 ↛ 484 (line 481 didn't jump to line 484, because the condition on line 481 was never true)
482 # This should be impossible since this datastore does
483 # not disassemble. This check also helps mypy.
484 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
485 return primary
487 def retrieveArtifacts(
488 self,
489 refs: Iterable[DatasetRef],
490 destination: ResourcePath,
491 transfer: str = "auto",
492 preserve_path: bool = True,
493 overwrite: Optional[bool] = False,
494 ) -> List[ResourcePath]:
495 """Retrieve the file artifacts associated with the supplied refs.
497 Notes
498 -----
499 Not implemented by this datastore.
500 """
501 # Could conceivably launch a FileDatastore to use formatters to write
502 # the data but this is fraught with problems.
503 raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")
505 def forget(self, refs: Iterable[DatasetRef]) -> None:
506 # Docstring inherited.
507 refs = list(refs)
508 self._bridge.forget(refs)
509 for ref in refs:
510 self.removeStoredItemInfo(ref)
512 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None:
513 """Indicate to the Datastore that a dataset can be removed.
515 Parameters
516 ----------
517 ref : `DatasetRef` or iterable thereof
518 Reference to the required Dataset(s).
519 ignore_errors : `bool`, optional
520 Indicate that errors should be ignored.
522 Raises
523 ------
524 FileNotFoundError
525 Attempt to remove a dataset that does not exist. Only relevant
526 if a single dataset ref is given.
528 Notes
529 -----
530 Concurrency should not normally be an issue for the in-memory datastore
531 since all internal changes are isolated solely to this process and
532 the registry only changes rows associated with this process.
533 """
534 if not isinstance(ref, DatasetRef):
535 log.debug("Bulk trashing of datasets in datastore %s", self.name)
536 self.bridge.moveToTrash(ref)
537 return
539 log.debug("Trash %s in datastore %s", ref, self.name)
541 # Check that this dataset is known to datastore
542 try:
543 self._get_dataset_info(ref)
545 # Move datasets to trash table
546 self.bridge.moveToTrash([ref])
547 except Exception as e:
548 if ignore_errors: 548 ↛ 549 (line 548 didn't jump to line 549, because the condition on line 548 was never true)
549 log.warning(
550 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
551 )
552 else:
553 raise
555 def emptyTrash(self, ignore_errors: bool = False) -> None:
556 """Remove all datasets from the trash.
558 Parameters
559 ----------
560 ignore_errors : `bool`, optional
561 Ignore errors.
563 Notes
564 -----
565 This method modifies the internal tracking of datasets, and
566 transactional rollback is not supported if a problem occurs before
567 the datasets themselves are deleted.
569 Concurrency should not normally be an issue for the in-memory datastore
570 since all internal changes are isolated solely to this process and
571 the registry only changes rows associated with this process.
572 """
573 log.debug("Emptying trash in datastore %s", self.name)
574 with self._bridge.emptyTrash() as trash_data:
575 trashed, _ = trash_data
576 for ref, _ in trashed:
577 try:
578 realID, _ = self._get_dataset_info(ref)
579 except FileNotFoundError: 579 ↛ 582 (line 579 didn't jump to line 582)
580 # Dataset already removed so ignore it
581 continue
582 except Exception as e:
583 if ignore_errors:
584 log.warning(
585 "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
586 self.name,
587 ref.id,
588 e,
589 )
590 continue
591 else:
592 raise
594 # Determine whether all references to this dataset have been
595 # removed and we can delete the dataset itself
596 allRefs = self.related[realID]
597 remainingRefs = allRefs - {ref.id}
598 if not remainingRefs: 598 ↛ 603 (line 598 didn't jump to line 603, because the condition on line 598 was never false)
599 log.debug("Removing artifact %s from datastore %s", realID, self.name)
600 del self.datasets[realID]
602 # Remove this entry
603 self.removeStoredItemInfo(ref)
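    # Lifecycle sketch (hypothetical ref): trashing only records the removal
    # with the registry bridge; the Python object itself is dropped when the
    # trash is emptied and no related record still refers to it:
    #
    #     datastore.trash(ref)
    #     datastore.emptyTrash()
    #     assert not datastore.exists(ref)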
605 def validateConfiguration(
606 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
607 ) -> None:
608 """Validate some of the configuration for this datastore.
610 Parameters
611 ----------
612 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
613 Entities to test against this configuration. Can be differing
614 types.
615 logFailures : `bool`, optional
616 If `True`, output a log message for every validation error
617 detected.
619 Raises
620 ------
621 DatastoreValidationError
622 Raised if there is a validation problem with a configuration.
623 All the problems are reported in a single exception.
625 Notes
626 -----
627 This method is a no-op.
628 """
629 return
631 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
632 # Docstring is inherited from base class
633 return transfer
635 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
636 # Docstring is inherited from base class
637 return
639 def getLookupKeys(self) -> Set[LookupKey]:
640 # Docstring is inherited from base class
641 return self.constraints.getLookupKeys()
643 def needs_expanded_data_ids(
644 self,
645 transfer: Optional[str],
646 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
647 ) -> bool:
648 # Docstring inherited.
649 return False
651 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
652 # Docstring inherited from the base class.
653 return
655 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
656 # Docstring inherited from the base class.
658 # In-memory Datastore records cannot be exported or imported
659 return {}