Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py: 90%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""In-memory datastore."""
24__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
26import time
27import logging
28from dataclasses import dataclass
29from typing import Dict, Optional, Any
31from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
32from .genericDatastore import GenericBaseDatastore
34log = logging.getLogger(__name__)
37@dataclass(frozen=True)
38class StoredMemoryItemInfo(StoredDatastoreItemInfo):
39 """Internal InMemoryDatastore metadata associated with a stored
40 DatasetRef.
41 """
42 __slots__ = {"timestamp", "storageClass", "parentID"}
44 timestamp: float
45 """Unix timestamp indicating the time the dataset was stored."""
47 storageClass: StorageClass
48 """StorageClass associated with the dataset."""
50 parentID: Optional[int]
51 """ID of the parent `DatasetRef` if this entry is a concrete
52 composite. Not used if the dataset being stored is not a
53 virtual component of a composite.
54 """
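# Illustrative sketch (not part of the original source): ``put`` below builds one
# of these records for every stored dataset, pointing ``parentID`` back at the
# dataset's own ID when it is not a component, e.g.
#
#     info = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
#                                 parentID=ref.id)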
57class InMemoryDatastore(GenericBaseDatastore):
58 """Basic Datastore for writing to an in-memory cache.
60 This datastore is ephemeral in that the contents of the datastore
61 disappear when the Python process completes. This also means that
62 other processes cannot access this datastore.
64 Parameters
65 ----------
66 config : `DatastoreConfig` or `str`
67 Configuration.
68 registry : `Registry`, optional
69 Unused parameter.
70 butlerRoot : `str`, optional
71 Unused parameter.
73 Notes
74 -----
75 InMemoryDatastore does not support any file-based ingest.
76 """
78 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
79 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
80 absolute path. Can be None if no defaults specified.
81 """
83 isEphemeral = True
84 """A new datastore is created every time and datasets disappear when
85 the process shuts down."""
87 datasets: Dict[int, Any]
88 """Internal storage of datasets indexed by dataset ID."""
90 records: Dict[int, StoredMemoryItemInfo]
91 """Internal records about stored datasets."""
93 def __init__(self, config, registry=None, butlerRoot=None):
94 super().__init__(config, registry)
96 # Name ourselves with the timestamp the datastore
97 # was created.
98 self.name = "{}@{}".format(type(self).__name__, time.time())
99 log.debug("Creating datastore %s", self.name)
101 # Storage of datasets, keyed by dataset_id
102 self.datasets = {}
104 # Records is distinct in order to track concrete composite components
105 # where we register multiple components for a single dataset.
106 self.records = {}
108 # Related records that share the same parent
109 self.related = {}
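# Illustrative sketch (assumption, simplified IDs and names): after storing a
# composite dataset and registering a component ref that shares its parent ID,
# the three internal structures might look like
#
#     self.datasets == {1: composite_obj}
#     self.records  == {1: StoredMemoryItemInfo(t0, compositeSC, parentID=1),
#                       2: StoredMemoryItemInfo(t0, componentSC, parentID=1)}
#     self.related  == {1: {1, 2}}
#
# so the stored object is only deleted once every ref in ``self.related[1]``
# has been trashed.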
111 @classmethod
112 def setConfigRoot(cls, root, config, full, overwrite=True):
113 """Set any filesystem-dependent config options for this Datastore to
114 be appropriate for a new empty repository with the given root.
116 Does nothing in this implementation.
118 Parameters
119 ----------
120 root : `str`
121 Filesystem path to the root of the data repository.
122 config : `Config`
123 A `Config` to update. Only the subset understood by
124 this component will be updated. Will not expand
125 defaults.
126 full : `Config`
127 A complete config with all defaults expanded that can be
128 converted to a `DatastoreConfig`. Read-only and will not be
129 modified by this method.
130 Repository-specific options that should not be obtained
131 from defaults when Butler instances are constructed
132 should be copied from ``full`` to ``config``.
133 overwrite : `bool`, optional
134 If `False`, do not modify a value in ``config`` if the value
135 already exists. Default is always to overwrite with the provided
136 ``root``.
138 Notes
139 -----
140 If a keyword is explicitly defined in the supplied ``config`` it
141 will not be overridden by this method if ``overwrite`` is `False`.
142 This allows explicit values set in external configs to be retained.
143 """
144 return
146 def addStoredItemInfo(self, refs, infos):
147 # Docstring inherited from GenericBaseDatastore.
148 for ref, info in zip(refs, infos):
149 self.records[ref.id] = info
150 self.related.setdefault(info.parentID, set()).add(ref.id)
152 def getStoredItemInfo(self, ref):
153 # Docstring inherited from GenericBaseDatastore.
154 return self.records[ref.id]
156 def removeStoredItemInfo(self, ref):
157 # Docstring inherited from GenericBaseDatastore.
158 # If a component has been removed previously then we can sometimes
159 # be asked to remove it again. Other datastores ignore this,
160 # so we ignore it here too.
161 if ref.id not in self.records:    161 ↛ 162 (line 161 didn't jump to line 162, because the condition on line 161 was never true)
162 return
163 record = self.records[ref.id]
164 del self.records[ref.id]
165 self.related[record.parentID].remove(ref.id)
167 def _get_dataset_info(self, ref):
168 """Check that the dataset is present and return the real ID and
169 associated information.
171 Parameters
172 ----------
173 ref : `DatasetRef`
174 Target `DatasetRef`
176 Returns
177 -------
178 realID : `int`
179 The dataset ID associated with this ref that should be used. This
180 could either be the ID of the supplied `DatasetRef` or the parent.
181 storageInfo : `StoredMemoryItemInfo`
182 Associated storage information.
184 Raises
185 ------
186 FileNotFoundError
187 Raised if the dataset is not present in this datastore.
188 """
189 try:
190 storedItemInfo = self.getStoredItemInfo(ref)
191 except KeyError:
192 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
193 realID = ref.id
194 if storedItemInfo.parentID is not None:    194 ↛ 197 (line 194 didn't jump to line 197, because the condition on line 194 was never false)
195 realID = storedItemInfo.parentID
197 if realID not in self.datasets:    197 ↛ 198 (line 197 didn't jump to line 198, because the condition on line 197 was never true)
198 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
200 return realID, storedItemInfo
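# Illustrative sketch (assumption): for a component ref whose stored record
# carries a parent ID, the lookup above resolves to the composite object that
# was actually handed to ``put``, roughly
#
#     info = self.records[ref.id]
#     realID = info.parentID if info.parentID is not None else ref.id
#     obj = self.datasets[realID]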
202 def exists(self, ref):
203 """Check if the dataset exists in the datastore.
205 Parameters
206 ----------
207 ref : `DatasetRef`
208 Reference to the required dataset.
210 Returns
211 -------
212 exists : `bool`
213 `True` if the entity exists in the `Datastore`.
214 """
215 try:
216 self._get_dataset_info(ref)
217 except FileNotFoundError:
218 return False
219 return True
221 def get(self, ref, parameters=None):
222 """Load an InMemoryDataset from the store.
224 Parameters
225 ----------
226 ref : `DatasetRef`
227 Reference to the required Dataset.
228 parameters : `dict`
229 `StorageClass`-specific parameters that specify, for example,
230 a slice of the dataset to be loaded.
232 Returns
233 -------
234 inMemoryDataset : `object`
235 Requested dataset or slice thereof as an InMemoryDataset.
237 Raises
238 ------
239 FileNotFoundError
240 Requested dataset cannot be retrieved.
241 TypeError
242 Return value from formatter has unexpected type.
243 ValueError
244 Formatter failed to process the dataset.
245 """
247 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
249 realID, storedItemInfo = self._get_dataset_info(ref)
251 # We have a write storage class and a read storage class and they
252 # can be different for concrete composites.
253 readStorageClass = ref.datasetType.storageClass
254 writeStorageClass = storedItemInfo.storageClass
256 # Check that the supplied parameters are suitable for the type read
257 readStorageClass.validateParameters(parameters)
259 inMemoryDataset = self.datasets[realID]
261 # Different storage classes implies a component request
262 if readStorageClass != writeStorageClass:
264 component = ref.datasetType.component()
266 if component is None:    266 ↛ 267 (line 266 didn't jump to line 267, because the condition on line 266 was never true)
267 raise ValueError("Storage class inconsistency ({} vs {}) but no"
268 " component requested".format(readStorageClass.name,
269 writeStorageClass.name))
271 # Concrete composite written as a single object (we hope)
272 inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component)
274 # Since there is no formatter to process parameters, they all must be
275 # passed to the assembler.
276 return self._post_process_get(inMemoryDataset, readStorageClass, parameters)
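# Illustrative sketch (assumption, hypothetical component name "wcs"): when the
# read and write storage classes differ, the branch above extracts the piece
# from the stored composite, roughly
#
#     wcs = writeStorageClass.assembler().getComponent(composite, "wcs")
#
# and any parameters are applied afterwards by ``_post_process_get`` because
# there is no formatter in the in-memory case.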
278 def put(self, inMemoryDataset, ref):
279 """Write an InMemoryDataset with a given `DatasetRef` to the store.
281 Parameters
282 ----------
283 inMemoryDataset : `object`
284 The dataset to store.
285 ref : `DatasetRef`
286 Reference to the associated Dataset.
288 Raises
289 ------
290 TypeError
291 Supplied object and storage class are inconsistent.
292 DatasetTypeNotSupportedError
293 The associated `DatasetType` is not handled by this datastore.
295 Notes
296 -----
297 If the datastore is configured to reject certain dataset types, it
298 is possible that the put will fail and raise a
299 `DatasetTypeNotSupportedError`. The main use case for this is to
300 allow `ChainedDatastore` to put to multiple datastores without
301 requiring that every datastore accepts the dataset.
302 """
304 self._validate_put_parameters(inMemoryDataset, ref)
306 self.datasets[ref.id] = inMemoryDataset
307 log.debug("Store %s in %s", ref, self.name)
309 # Store time we received this content, to allow us to optionally
310 # expire it. Instead of storing a filename here, we include the
311 # ID of this datasetRef so we can find it from components.
312 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
313 parentID=ref.id)
315 # We have to register this content with registry.
316 # Currently this assumes we have a file so we need to use stub entries
317 # TODO: Add to ephemeral part of registry
318 self._register_datasets([(ref, itemInfo)])
320 if self._transaction is not None:
321 self._transaction.registerUndo("put", self.remove, ref)
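# Illustrative sketch (assumption, refs elided): because nothing is serialized,
# a put followed by a get hands back the stored Python object (possibly after
# parameter handling by the storage class assembler):
#
#     datastore.put(exposure, ref)
#     again = datastore.get(ref)      # same in-memory object, no I/O
#
# If the surrounding transaction is rolled back, the undo registered above
# calls ``self.remove(ref)`` to discard the entry again.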
323 def getUri(self, ref, predict=False):
324 """URI to the Dataset.
326 Always uses the "mem://" URI prefix.
328 Parameters
329 ----------
330 ref : `DatasetRef`
331 Reference to the required Dataset.
332 predict : `bool`
333 If `True`, allow URIs to be returned for datasets that have not
334 been written.
336 Returns
337 -------
338 uri : `str`
339 URI string pointing to the dataset within the datastore. If the
340 dataset does not exist in the datastore, and if ``predict`` is
341 `True`, the URI will be a prediction and will include a URI
342 fragment "#predicted".
343 If the datastore does not have entities that relate well
344 to the concept of a URI, the returned URI string will be
345 descriptive. The returned URI is not guaranteed to be obtainable.
347 Raises
348 ------
349 FileNotFoundError
350 A URI has been requested for a dataset that does not exist and
351 guessing is not allowed.
353 """
355 # if this has never been written then we have to guess
356 if not self.exists(ref):
357 if not predict:
358 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
359 name = "{}#predicted".format(ref.datasetType.name)
360 else:
361 name = '{}'.format(id(self.datasets[ref.id]))
363 return "mem://{}".format(name)
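# Illustrative sketch (assumption, hypothetical dataset type name): the URI is
# descriptive only and cannot be dereferenced, e.g.
#
#     datastore.getUri(ref)                    # -> "mem://140230148650832" (id() of the object)
#     datastore.getUri(missing, predict=True)  # -> "mem://calexp#predicted"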
365 def trash(self, ref, ignore_errors=False):
366 """Indicate to the Datastore that a dataset can be removed.
368 Parameters
369 ----------
370 ref : `DatasetRef`
371 Reference to the required Dataset.
372 ignore_errors : `bool`, optional
373 Indicate that errors should be ignored.
375 Raises
376 ------
377 FileNotFoundError
378 Attempt to remove a dataset that does not exist.
380 Notes
381 -----
382 Concurrency should not normally be an issue for the in-memory datastore
383 since all internal changes are isolated to this process alone, and
384 the registry only changes rows associated with this process.
385 """
387 log.debug("Trash %s in datastore %s", ref, self.name)
389 # Check that this dataset is known to datastore
390 try:
391 self._get_dataset_info(ref)
393 # Move datasets to trash table
394 self._move_to_trash_in_registry(ref)
395 except Exception as e:
396 if ignore_errors:
397 log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
398 ref, self.name, e)
399 else:
400 raise
402 def emptyTrash(self, ignore_errors=False):
403 """Remove all datasets from the trash.
405 Parameters
406 ----------
407 ignore_errors : `bool`, optional
408 Ignore errors.
410 Notes
411 -----
412 This method modifies the internal tracking of datasets, and
413 transaction handling is not supported if a problem occurs before
414 the datasets themselves are deleted.
416 Concurrency should not normally be an issue for the in-memory datastore
417 since all internal changes are isolated to this process alone, and
418 the registry only changes rows associated with this process.
419 """
420 log.debug("Emptying trash in datastore %s", self.name)
421 trashed = self.registry.getTrashedDatasets(self.name)
423 for ref in trashed:
424 try:
425 realID, _ = self._get_dataset_info(ref)
426 except Exception as e:
427 if ignore_errors:
428 log.warning("Emptying trash in datastore %s but encountered an error with dataset %s: %s",
429 self.name, ref.id, e)
430 continue
431 else:
432 raise
434 # Determine whether all references to this dataset have been
435 # removed and we can delete the dataset itself
436 allRefs = self.related[realID]
437 theseRefs = {r.id for r in ref.flatten([ref])}
438 remainingRefs = allRefs - theseRefs
439 if not remainingRefs:
440 log.debug("Removing artifact %s from datastore %s", realID, self.name)
441 del self.datasets[realID]
443 # Remove this entry
444 self.removeStoredItemInfo(ref)
446 # Inform registry that we have handled these items
447 # This should work even if another process is clearing out those rows
448 self.registry.emptyDatasetLocationsTrash(self.name, trashed)
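# Illustrative sketch (assumption): removal is two-phase, mirroring the
# file-based datastores:
#
#     datastore.trash(ref)       # registry marks the dataset as trashed
#     datastore.emptyTrash()     # drops the in-memory object once no other
#                                # refs sharing its parent ID remain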
450 def validateConfiguration(self, entities, logFailures=False):
451 """Validate some of the configuration for this datastore.
453 Parameters
454 ----------
455 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
456 Entities to test against this configuration. Can be differing
457 types.
458 logFailures : `bool`, optional
459 If `True`, output a log message for every validation error
460 detected.
462 Raises
463 ------
464 DatastoreValidationError
465 Raised if there is a validation problem with a configuration.
466 All the problems are reported in a single exception.
468 Notes
469 -----
470 This method is a no-op.
471 """
472 return
474 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
475 # Docstring is inherited from base class
476 return transfer
478 def validateKey(self, lookupKey, entity):
479 # Docstring is inherited from base class
480 return
482 def getLookupKeys(self):
483 # Docstring is inherited from base class
484 return self.constraints.getLookupKeys()