Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 87% (190 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""In-memory datastore."""

__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")

import logging
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union
from urllib.parse import urlencode

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetRefURIs,
    DatastoreRecordData,
    StorageClass,
    StoredDatastoreItemInfo,
)
from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge
from lsst.resources import ResourcePath

from .genericDatastore import GenericBaseDatastore

if TYPE_CHECKING:
    from lsst.daf.butler import Config, DatasetType, LookupKey
    from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


@dataclass(frozen=True)
class StoredMemoryItemInfo(StoredDatastoreItemInfo):
    """Internal InMemoryDatastore metadata associated with a stored
    DatasetRef.
    """

    __slots__ = {"timestamp", "storageClass", "parentID", "dataset_id"}

    timestamp: float
    """Unix timestamp indicating the time the dataset was stored."""

    storageClass: StorageClass
    """StorageClass associated with the dataset."""

    parentID: DatasetId
    """ID of the parent `DatasetRef` if this entry is a concrete
    composite. Not used if the dataset being stored is not a
    virtual component of a composite.
    """

    dataset_id: DatasetId
    """DatasetId associated with this record."""


class InMemoryDatastore(GenericBaseDatastore):
    """Basic Datastore for writing to an in-memory cache.

    This datastore is ephemeral in that the contents of the datastore
    disappear when the Python process completes. This also means that
    other processes cannot access this datastore.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        Unused parameter.

    Notes
    -----
    InMemoryDatastore does not support any file-based ingest.
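
    Examples
    --------
    A minimal usage sketch (illustrative only). The names ``config``,
    ``bridgeManager``, ``in_memory_object`` and ``ref`` are assumed to have
    been created elsewhere, with ``ref`` a resolved `DatasetRef` whose
    storage class matches the stored object, so the doctest is skipped.

    >>> datastore = InMemoryDatastore(config, bridgeManager)  # doctest: +SKIP
    >>> datastore.put(in_memory_object, ref)  # doctest: +SKIP
    >>> datastore.exists(ref)  # doctest: +SKIP
    True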
97 """
99 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
100 """Path to configuration defaults. Accessed within the ``configs`` resource
101 or relative to a search path. Can be None if no defaults specified.
102 """
104 isEphemeral = True
105 """A new datastore is created every time and datasets disappear when
106 the process shuts down."""
108 datasets: Dict[DatasetId, Any]
109 """Internal storage of datasets indexed by dataset ID."""
111 records: Dict[DatasetId, StoredMemoryItemInfo]
112 """Internal records about stored datasets."""

    def __init__(
        self,
        config: Union[Config, str],
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: Optional[str] = None,
    ):
        super().__init__(config, bridgeManager)

        # Name ourselves with the timestamp the datastore
        # was created.
        self.name = "{}@{}".format(type(self).__name__, time.time())
        log.debug("Creating datastore %s", self.name)

        # Storage of datasets, keyed by dataset_id
        self.datasets: Dict[DatasetId, Any] = {}

        # Records is distinct in order to track concrete composite components
        # where we register multiple components for a single dataset.
        self.records: Dict[DatasetId, StoredMemoryItemInfo] = {}

        # Related records that share the same parent
        self.related: Dict[DatasetId, Set[DatasetId]] = {}

        self._bridge = bridgeManager.register(self.name, ephemeral=True)

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Does nothing in this implementation.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        return

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        # Docstring inherited from GenericBaseDatastore.
        return self._bridge

    def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredMemoryItemInfo]) -> None:
        # Docstring inherited from GenericBaseDatastore.
        for ref, info in zip(refs, infos):
            if ref.id is None:
                raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
            self.records[ref.id] = info
            self.related.setdefault(info.parentID, set()).add(ref.id)

    def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredMemoryItemInfo:
        # Docstring inherited from GenericBaseDatastore.
        if ref.id is None:
            raise RuntimeError(f"Can not retrieve unresolved DatasetRef {ref}")
        return self.records[ref.id]

    def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredMemoryItemInfo]:
        # Docstring inherited from GenericBaseDatastore.
        return [self.getStoredItemInfo(ref)]

    def removeStoredItemInfo(self, ref: DatasetIdRef) -> None:
        # Docstring inherited from GenericBaseDatastore.
        # If a component has been removed previously then we can sometimes
        # be asked to remove it again. Other datastores ignore this
        # so also ignore here
        if ref.id is None:
            raise RuntimeError(f"Can not remove unresolved DatasetRef {ref}")
        if ref.id not in self.records:
            return
        record = self.records[ref.id]
        del self.records[ref.id]
        self.related[record.parentID].remove(ref.id)

    def _get_dataset_info(self, ref: DatasetIdRef) -> Tuple[DatasetId, StoredMemoryItemInfo]:
        """Check that the dataset is present and return the real ID and
        associated information.

        Parameters
        ----------
        ref : `DatasetRef`
            Target `DatasetRef`.

        Returns
        -------
        realID : `int`
            The dataset ID associated with this ref that should be used. This
            could either be the ID of the supplied `DatasetRef` or the parent.
        storageInfo : `StoredMemoryItemInfo`
            Associated storage information.

        Raises
        ------
        FileNotFoundError
            Raised if the dataset is not present in this datastore.
        """
        try:
            storedItemInfo = self.getStoredItemInfo(ref)
        except KeyError:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
        realID = ref.id
        if storedItemInfo.parentID is not None:
            realID = storedItemInfo.parentID

        if realID not in self.datasets:
            raise FileNotFoundError(f"No such file dataset in memory: {ref}")

        return realID, storedItemInfo

    def knows(self, ref: DatasetRef) -> bool:
        """Check if the dataset is known to the datastore.

        This datastore does not distinguish dataset existence from knowledge
        of a dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the dataset is known to the datastore.
        """
        return self.exists(ref)

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        try:
            self._get_dataset_info(ref)
        except FileNotFoundError:
            return False
        return True

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
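
        Examples
        --------
        An illustrative sketch only; ``datastore`` and a resolved ``ref``
        stored by an earlier `put` are assumed to exist, and the ``"slice"``
        parameter name is hypothetical (valid parameters depend on the
        dataset's `StorageClass`), so the doctest is skipped.

        >>> dataset = datastore.get(ref)  # doctest: +SKIP
        >>> subset = datastore.get(ref, parameters={"slice": slice(2)})  # doctest: +SKIP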
306 """
308 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
310 realID, storedItemInfo = self._get_dataset_info(ref)
312 # We have a write storage class and a read storage class and they
313 # can be different for concrete composites.
314 readStorageClass = ref.datasetType.storageClass
315 writeStorageClass = storedItemInfo.storageClass
317 component = ref.datasetType.component()
319 # Check that the supplied parameters are suitable for the type read
320 # If this is a derived component we validate against the composite
321 isDerivedComponent = False
322 if component in writeStorageClass.derivedComponents:
323 writeStorageClass.validateParameters(parameters)
324 isDerivedComponent = True
325 else:
326 readStorageClass.validateParameters(parameters)
328 inMemoryDataset = self.datasets[realID]
330 # if this is a read only component we need to apply parameters
331 # before we retrieve the component. We assume that the parameters
332 # will affect the data globally, before the derived component
333 # is selected.
334 if isDerivedComponent:
335 inMemoryDataset = writeStorageClass.delegate().handleParameters(inMemoryDataset, parameters)
336 # Then disable parameters for later
337 parameters = {}
339 # Different storage classes implies a component request
340 if readStorageClass != writeStorageClass:
342 if component is None: 342 ↛ 343line 342 didn't jump to line 343, because the condition on line 342 was never true
343 raise ValueError(
344 "Storage class inconsistency ({} vs {}) but no"
345 " component requested".format(readStorageClass.name, writeStorageClass.name)
346 )
348 # Concrete composite written as a single object (we hope)
349 inMemoryDataset = writeStorageClass.delegate().getComponent(inMemoryDataset, component)
351 # Since there is no formatter to process parameters, they all must be
352 # passed to the assembler.
353 return self._post_process_get(
354 inMemoryDataset, readStorageClass, parameters, isComponent=component is not None
355 )

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
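
        Examples
        --------
        An illustrative sketch only; ``datastore``, an ``in_memory_object``
        and a resolved ``ref`` whose storage class matches the object are
        assumed to exist, so the doctest is skipped.

        >>> datastore.put(in_memory_object, ref)  # doctest: +SKIP
        >>> datastore.exists(ref)  # doctest: +SKIP
        True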
381 """
383 if ref.id is None: 383 ↛ 384line 383 didn't jump to line 384, because the condition on line 383 was never true
384 raise RuntimeError(f"Can not store unresolved DatasetRef {ref}")
386 # May need to coerce the in memory dataset to the correct
387 # python type, otherwise parameters may not work.
388 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset)
390 self._validate_put_parameters(inMemoryDataset, ref)
392 self.datasets[ref.id] = inMemoryDataset
393 log.debug("Store %s in %s", ref, self.name)
395 # Store time we received this content, to allow us to optionally
396 # expire it. Instead of storing a filename here, we include the
397 # ID of this datasetRef so we can find it from components.
398 itemInfo = StoredMemoryItemInfo(
399 time.time(), ref.datasetType.storageClass, parentID=ref.id, dataset_id=ref.getCheckedId()
400 )
402 # We have to register this content with registry.
403 # Currently this assumes we have a file so we need to use stub entries
404 # TODO: Add to ephemeral part of registry
405 self._register_datasets([(ref, itemInfo)])
407 if self._transaction is not None:
408 self._transaction.registerUndo("put", self.remove, ref)

    def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the
            dataset artifact (can be empty if there are no components).

        Notes
        -----
        The URIs returned for in-memory datastores are not usable but
        provide an indication of the associated dataset.
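
        Examples
        --------
        Illustrative only; ``datastore`` and a previously stored ``ref`` are
        assumed to exist and the exact URI is process-dependent, so the
        doctest is skipped.

        >>> primary, components = datastore.getURIs(ref)  # doctest: +SKIP
        >>> str(primary).startswith("mem://")  # doctest: +SKIP
        True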
433 """
435 # Include the dataID as a URI query
436 query = urlencode(ref.dataId)
438 # if this has never been written then we have to guess
439 if not self.exists(ref):
440 if not predict:
441 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
442 name = f"{ref.datasetType.name}"
443 fragment = "#predicted"
444 else:
445 realID, _ = self._get_dataset_info(ref)
446 name = f"{id(self.datasets[realID])}?{query}"
447 fragment = ""
449 return DatasetRefURIs(ResourcePath(f"mem://{name}?{query}{fragment}"), {})

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
        """URI to the Dataset.

        Always uses "mem://" URI prefix.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        AssertionError
            Raised if an internal error occurs.
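
        Examples
        --------
        Illustrative only; ``datastore`` is assumed to exist and
        ``unstored_ref`` to be a resolved `DatasetRef` that has not been
        written to this datastore, so the doctest is skipped.

        >>> uri = datastore.getURI(unstored_ref, predict=True)  # doctest: +SKIP
        >>> str(uri).endswith("#predicted")  # doctest: +SKIP
        True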
482 """
483 primary, _ = self.getURIs(ref, predict)
484 if primary is None: 484 ↛ 487line 484 didn't jump to line 487, because the condition on line 484 was never true
485 # This should be impossible since this datastore does
486 # not disassemble. This check also helps mypy.
487 raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}")
488 return primary

    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
        destination: ResourcePath,
        transfer: str = "auto",
        preserve_path: bool = True,
        overwrite: Optional[bool] = False,
    ) -> List[ResourcePath]:
        """Retrieve the file artifacts associated with the supplied refs.

        Notes
        -----
        Not implemented by this datastore.
        """
        # Could conceivably launch a FileDatastore to use formatters to write
        # the data but this is fraught with problems.
        raise NotImplementedError("Can not write artifacts to disk from in-memory datastore.")

    def forget(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited.
        refs = list(refs)
        self._bridge.forget(refs)
        for ref in refs:
            self.removeStoredItemInfo(ref)

    def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = False) -> None:
        """Indicate to the Datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef` or iterable thereof
            Reference to the required Dataset(s).
        ignore_errors : `bool`, optional
            Indicate that errors should be ignored.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Only relevant
            if a single dataset ref is given.

        Notes
        -----
        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated solely to this
        process and the registry only changes rows associated with this
        process.
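
        Examples
        --------
        Illustrative sketch; ``datastore`` and a previously stored ``ref``
        are assumed to exist, so the doctest is skipped.

        >>> datastore.trash(ref)  # doctest: +SKIP
        >>> datastore.emptyTrash()  # doctest: +SKIP
        >>> datastore.exists(ref)  # doctest: +SKIP
        False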
536 """
537 if not isinstance(ref, DatasetRef):
538 log.debug("Bulk trashing of datasets in datastore %s", self.name)
539 self.bridge.moveToTrash(ref)
540 return
542 log.debug("Trash %s in datastore %s", ref, self.name)
544 # Check that this dataset is known to datastore
545 try:
546 self._get_dataset_info(ref)
548 # Move datasets to trash table
549 self.bridge.moveToTrash([ref])
550 except Exception as e:
551 if ignore_errors: 551 ↛ 552line 551 didn't jump to line 552, because the condition on line 551 was never true
552 log.warning(
553 "Error encountered moving dataset %s to trash in datastore %s: %s", ref, self.name, e
554 )
555 else:
556 raise

    def emptyTrash(self, ignore_errors: bool = False) -> None:
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`, optional
            Ignore errors.

        Notes
        -----
        The internal tracking of datasets is affected by this method and
        transaction handling is not supported if there is a problem before
        the datasets themselves are deleted.

        Concurrency should not normally be an issue for the in-memory
        datastore since all internal changes are isolated solely to this
        process and the registry only changes rows associated with this
        process.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        with self._bridge.emptyTrash() as trash_data:
            trashed, _ = trash_data
            for ref, _ in trashed:
                try:
                    realID, _ = self._get_dataset_info(ref)
                except FileNotFoundError:
                    # Dataset already removed so ignore it
                    continue
                except Exception as e:
                    if ignore_errors:
                        log.warning(
                            "Emptying trash in datastore %s but encountered an error with dataset %s: %s",
                            self.name,
                            ref.id,
                            e,
                        )
                        continue
                    else:
                        raise

                # Determine whether all references to this dataset have been
                # removed and we can delete the dataset itself
                allRefs = self.related[realID]
                remainingRefs = allRefs - {ref.id}
                if not remainingRefs:
                    log.debug("Removing artifact %s from datastore %s", realID, self.name)
                    del self.datasets[realID]

                # Remove this entry
                self.removeStoredItemInfo(ref)

    def validateConfiguration(
        self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
    ) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method is a no-op.
        """
        return

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring is inherited from base class
        return transfer

    def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class
        return

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class
        return self.constraints.getLookupKeys()

    def needs_expanded_data_ids(
        self,
        transfer: Optional[str],
        entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
    ) -> bool:
        # Docstring inherited.
        return False

    def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
        # Docstring inherited from the base class.
        return

    def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
        # Docstring inherited from the base class.

        # In-memory Datastore records cannot be exported or imported
        return {}