Coverage for python/lsst/daf/butler/datastores/inMemoryDatastore.py : 90%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""In-memory datastore."""
24__all__ = ("StoredMemoryItemInfo", "InMemoryDatastore")
26import time
27import logging
28from dataclasses import dataclass
29from typing import Dict, Optional, Any
31from lsst.daf.butler import StoredDatastoreItemInfo, StorageClass
32from .genericDatastore import GenericBaseDatastore
34log = logging.getLogger(__name__)
37@dataclass(frozen=True)
38class StoredMemoryItemInfo(StoredDatastoreItemInfo):
39 """Internal InMemoryDatastore metadata associated with a stored
40 DatasetRef.
41 """
42 __slots__ = {"timestamp", "storageClass", "parentID"}
44 timestamp: float
45 """Unix timestamp indicating the time the dataset was stored."""
47 storageClass: StorageClass
48 """StorageClass associated with the dataset."""
50 parentID: Optional[int]
51 """ID of the parent `DatasetRef` if this entry is a concrete
52 composite. Not used if the dataset being stored is not a
53 virtual component of a composite.
54 """
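# Illustration (not part of the original file): ``put`` below constructs one of
# these records per stored dataset, essentially
#
#     info = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
#                                 parentID=ref.id)
#
# where ``ref`` is the `DatasetRef` being stored. For a component of a concrete
# composite, ``parentID`` instead holds the ID of the parent composite.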
57class InMemoryDatastore(GenericBaseDatastore):
58 """Basic Datastore for writing to an in-memory cache.
60 This datastore is ephemeral in that the contents of the datastore
61 disappear when the Python process completes. This also means that
62 other processes cannot access this datastore.
64 Parameters
65 ----------
66 config : `DatastoreConfig` or `str`
67 Configuration.
68 registry : `Registry`, optional
69 Unused parameter.
70 butlerRoot : `str`, optional
71 Unused parameter.
73 Notes
74 -----
75 InMemoryDatastore does not support any file-based ingest.
76 """
78 defaultConfigFile = "datastores/inMemoryDatastore.yaml"
79 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
80 absolute path. Can be None if no defaults specified.
81 """
83 isEphemeral = True
84 """A new datastore is created every time and datasets disappear when
85 the process shuts down."""
87 datasets: Dict[int, Any]
88 """Internal storage of datasets indexed by dataset ID."""
90 records: Dict[int, StoredMemoryItemInfo]
91 """Internal records about stored datasets."""
93 def __init__(self, config, registry=None, butlerRoot=None):
94 super().__init__(config, registry)
96 # Name ourselves with the timestamp at which the datastore
97 # was created.
98 self.name = "{}@{}".format(type(self).__name__, time.time())
99 log.debug("Creating datastore %s", self.name)
101 # Storage of datasets, keyed by dataset_id
102 self.datasets = {}
104 # Records is distinct in order to track concrete composite components
105 # where we register multiple components for a single dataset.
106 self.records = {}
108 # Related records that share the same parent
109 self.related = {}
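    # Illustration (not part of the original file): after a composite dataset
    # with hypothetical ID 5 is put and its component is registered with
    # hypothetical ID 7, the internal state looks roughly like
    #
    #     self.datasets == {5: <the composite object>}
    #     self.records  == {5: StoredMemoryItemInfo(..., parentID=5),
    #                       7: StoredMemoryItemInfo(..., parentID=5)}
    #     self.related  == {5: {5, 7}}
    #
    # ``related`` is what lets emptyTrash() decide when the last reference to
    # a stored object is gone and its entry in ``datasets`` can be deleted.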
111 @classmethod
112 def setConfigRoot(cls, root, config, full, overwrite=True):
113 """Set any filesystem-dependent config options for this Datastore to
114 be appropriate for a new empty repository with the given root.
116 Does nothing in this implementation.
118 Parameters
119 ----------
120 root : `str`
121 Filesystem path to the root of the data repository.
122 config : `Config`
123 A `Config` to update. Only the subset understood by
124 this component will be updated. Will not expand
125 defaults.
126 full : `Config`
127 A complete config with all defaults expanded that can be
128 converted to a `DatastoreConfig`. Read-only and will not be
129 modified by this method.
130 Repository-specific options that should not be obtained
131 from defaults when Butler instances are constructed
132 should be copied from ``full`` to ``config``.
133 overwrite : `bool`, optional
134 If `False`, do not modify a value in ``config`` if the value
135 already exists. Default is always to overwrite with the provided
136 ``root``.
138 Notes
139 -----
140 If a keyword is explicitly defined in the supplied ``config`` it
141 will not be overridden by this method if ``overwrite`` is `False`.
142 This allows explicit values set in external configs to be retained.
143 """
144 return
146 def addStoredItemInfo(self, refs, infos):
147 # Docstring inherited from GenericBaseDatastore.
148 for ref, info in zip(refs, infos):
149 self.records[ref.id] = info
150 self.related.setdefault(info.parentID, set()).add(ref.id)
152 def getStoredItemInfo(self, ref):
153 # Docstring inherited from GenericBaseDatastore.
154 return self.records[ref.id]
156 def getStoredItemsInfo(self, ref):
157 # Docstring inherited from GenericBaseDatastore.
158 return [self.getStoredItemInfo(ref)]
160 def removeStoredItemInfo(self, ref):
161 # Docstring inherited from GenericBaseDatastore.
162 # If a component has been removed previously then we can sometimes
163 # be asked to remove it again. Other datastores ignore this
164 # so we also ignore it here.
165 if ref.id not in self.records:    [branch 165 ↛ 166 not taken: the condition on line 165 was never true]
166 return
167 record = self.records[ref.id]
168 del self.records[ref.id]
169 self.related[record.parentID].remove(ref.id)
171 def _get_dataset_info(self, ref):
172 """Check that the dataset is present and return the real ID and
173 associated information.
175 Parameters
176 ----------
177 ref : `DatasetRef`
178 Target `DatasetRef`
180 Returns
181 -------
182 realID : `int`
183 The dataset ID associated with this ref that should be used. This
184 could either be the ID of the supplied `DatasetRef` or the parent.
185 storageInfo : `StoredMemoryItemInfo`
186 Associated storage information.
188 Raises
189 ------
190 FileNotFoundError
191 Raised if the dataset is not present in this datastore.
192 """
193 try:
194 storedItemInfo = self.getStoredItemInfo(ref)
195 except KeyError:
196 raise FileNotFoundError(f"No such file dataset in memory: {ref}") from None
197 realID = ref.id
198 if storedItemInfo.parentID is not None:    [branch 198 ↛ 201 not taken: the condition on line 198 was never false]
199 realID = storedItemInfo.parentID
201 if realID not in self.datasets:    [branch 201 ↛ 202 not taken: the condition on line 201 was never true]
202 raise FileNotFoundError(f"No such file dataset in memory: {ref}")
204 return realID, storedItemInfo
206 def exists(self, ref):
207 """Check if the dataset exists in the datastore.
209 Parameters
210 ----------
211 ref : `DatasetRef`
212 Reference to the required dataset.
214 Returns
215 -------
216 exists : `bool`
217 `True` if the entity exists in the `Datastore`.
218 """
219 try:
220 self._get_dataset_info(ref)
221 except FileNotFoundError:
222 return False
223 return True
225 def get(self, ref, parameters=None):
226 """Load an InMemoryDataset from the store.
228 Parameters
229 ----------
230 ref : `DatasetRef`
231 Reference to the required Dataset.
232 parameters : `dict`
233 `StorageClass`-specific parameters that specify, for example,
234 a slice of the dataset to be loaded.
236 Returns
237 -------
238 inMemoryDataset : `object`
239 Requested dataset or slice thereof as an InMemoryDataset.
241 Raises
242 ------
243 FileNotFoundError
244 Requested dataset cannot be retrieved.
245 TypeError
246 Return value from formatter has unexpected type.
247 ValueError
248 Formatter failed to process the dataset.
249 """
251 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
253 realID, storedItemInfo = self._get_dataset_info(ref)
255 # We have a write storage class and a read storage class and they
256 # can be different for concrete composites.
257 readStorageClass = ref.datasetType.storageClass
258 writeStorageClass = storedItemInfo.storageClass
260 # Check that the supplied parameters are suitable for the type read
261 readStorageClass.validateParameters(parameters)
263 inMemoryDataset = self.datasets[realID]
265 component = ref.datasetType.component()
267 # Different storage classes imply a component request
268 if readStorageClass != writeStorageClass:
270 if component is None:    [branch 270 ↛ 271 not taken: the condition on line 270 was never true]
271 raise ValueError("Storage class inconsistency ({} vs {}) but no"
272 " component requested".format(readStorageClass.name,
273 writeStorageClass.name))
275 # Concrete composite written as a single object (we hope)
276 inMemoryDataset = writeStorageClass.assembler().getComponent(inMemoryDataset, component)
278 # Since there is no formatter to process parameters, they all must be
279 # passed to the assembler.
280 return self._post_process_get(inMemoryDataset, readStorageClass, parameters,
281 isComponent=component is not None)
283 def put(self, inMemoryDataset, ref):
284 """Write an InMemoryDataset with a given `DatasetRef` to the store.
286 Parameters
287 ----------
288 inMemoryDataset : `object`
289 The dataset to store.
290 ref : `DatasetRef`
291 Reference to the associated Dataset.
293 Raises
294 ------
295 TypeError
296 Supplied object and storage class are inconsistent.
297 DatasetTypeNotSupportedError
298 The associated `DatasetType` is not handled by this datastore.
300 Notes
301 -----
302 If the datastore is configured to reject certain dataset types it
303 is possible that the put will fail and raise a
304 `DatasetTypeNotSupportedError`. The main use case for this is to
305 allow `ChainedDatastore` to put to multiple datastores without
306 requiring that every datastore accepts the dataset.
307 """
309 self._validate_put_parameters(inMemoryDataset, ref)
311 self.datasets[ref.id] = inMemoryDataset
312 log.debug("Store %s in %s", ref, self.name)
314 # Store the time we received this content, to allow us to optionally
315 # expire it. Instead of storing a filename here, we include the
316 # ID of this datasetRef so we can find it from components.
317 itemInfo = StoredMemoryItemInfo(time.time(), ref.datasetType.storageClass,
318 parentID=ref.id)
320 # We have to register this content with the registry.
321 # Currently this assumes we have a file so we need to use stub entries
322 # TODO: Add to ephemeral part of registry
323 self._register_datasets([(ref, itemInfo)])
325 if self._transaction is not None:
326 self._transaction.registerUndo("put", self.remove, ref)
328 def getUri(self, ref, predict=False):
329 """URI to the Dataset.
331 Always uses the "mem://" URI prefix.
333 Parameters
334 ----------
335 ref : `DatasetRef`
336 Reference to the required Dataset.
337 predict : `bool`
338 If `True`, allow URIs to be returned of datasets that have not
339 been written.
341 Returns
342 -------
343 uri : `str`
344 URI string pointing to the dataset within the datastore. If the
345 dataset does not exist in the datastore, and if ``predict`` is
346 `True`, the URI will be a prediction and will include a URI
347 fragment "#predicted".
348 If the datastore does not have entities that relate well
349 to the concept of a URI the returned URI string will be
350 descriptive. The returned URI is not guaranteed to be obtainable.
352 Raises
353 ------
354 FileNotFoundError
355 A URI has been requested for a dataset that does not exist and
356 guessing is not allowed.
358 """
360 # if this has never been written then we have to guess
361 if not self.exists(ref):
362 if not predict:
363 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
364 name = "{}#predicted".format(ref.datasetType.name)
365 else:
366 realID, _ = self._get_dataset_info(ref)
367 name = '{}'.format(id(self.datasets[realID]))
369 return "mem://{}".format(name)
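    # Illustration (not part of the original file): for a stored dataset the
    # returned URI looks like "mem://140234567890416", using the Python ``id``
    # of the stored object (value hypothetical); with ``predict=True`` for a
    # dataset that has not been written it looks like
    # "mem://<datasetType name>#predicted".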
371 def trash(self, ref, ignore_errors=False):
372 """Indicate to the Datastore that a dataset can be removed.
374 Parameters
375 ----------
376 ref : `DatasetRef`
377 Reference to the required Dataset.
378 ignore_errors : `bool`, optional
379 Indicate that errors should be ignored.
381 Raises
382 ------
383 FileNotFoundError
384 Attempt to remove a dataset that does not exist.
386 Notes
387 -----
388 Concurrency should not normally be an issue for the in-memory datastore
389 since all internal changes are isolated to this process alone and
390 the registry only changes rows associated with this process.
391 """
393 log.debug("Trash %s in datastore %s", ref, self.name)
395 # Check that this dataset is known to the datastore
396 try:
397 self._get_dataset_info(ref)
399 # Move datasets to trash table
400 self._move_to_trash_in_registry(ref)
401 except Exception as e:
402 if ignore_errors:
403 log.warning("Error encountered moving dataset %s to trash in datastore %s: %s",
404 ref, self.name, e)
405 else:
406 raise
408 def emptyTrash(self, ignore_errors=False):
409 """Remove all datasets from the trash.
411 Parameters
412 ----------
413 ignore_errors : `bool`, optional
414 Ignore errors.
416 Notes
417 -----
418 The internal tracking of datasets is affected by this method and
419 transaction handling is not supported if there is a problem before
420 the datasets themselves are deleted.
422 Concurrency should not normally be an issue for the in-memory datastore
423 since all internal changes are isolated to this process alone and
424 the registry only changes rows associated with this process.
425 """
426 log.debug("Emptying trash in datastore %s", self.name)
427 trashed = self.registry.getTrashedDatasets(self.name)
429 for ref in trashed:
430 try:
431 realID, _ = self._get_dataset_info(ref)
432 except Exception as e:
433 if ignore_errors:
434 log.warning("Emptying trash in datastore %s but encountered an error with dataset %s: %s",
435 self.name, ref.id, e)
436 continue
437 else:
438 raise
440 # Determine whether all references to this dataset have been
441 # removed and we can delete the dataset itself
442 allRefs = self.related[realID]
443 theseRefs = {r.id for r in ref.flatten([ref])}
444 remainingRefs = allRefs - theseRefs
445 if not remainingRefs:
446 log.debug("Removing artifact %s from datastore %s", realID, self.name)
447 del self.datasets[realID]
449 # Remove this entry
450 self.removeStoredItemInfo(ref)
452 # Inform registry that we have handled these items
453 # This should work even if another process is clearing out those rows
454 self.registry.emptyDatasetLocationsTrash(self.name, trashed)
456 def validateConfiguration(self, entities, logFailures=False):
457 """Validate some of the configuration for this datastore.
459 Parameters
460 ----------
461 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
462 Entities to test against this configuration. Can be differing
463 types.
464 logFailures : `bool`, optional
465 If `True`, output a log message for every validation error
466 detected.
468 Raises
469 ------
470 DatastoreValidationError
471 Raised if there is a validation problem with a configuration.
472 All the problems are reported in a single exception.
474 Notes
475 -----
476 This method is a no-op.
477 """
478 return
480 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
481 # Docstring is inherited from base class
482 return transfer
484 def validateKey(self, lookupKey, entity):
485 # Docstring is inherited from base class
486 return
488 def getLookupKeys(self):
489 # Docstring is inherited from base class
490 return self.constraints.getLookupKeys()
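
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file). ``config``, ``ref`` and
# ``my_object`` are assumed to exist already: a suitable `DatastoreConfig`
# (defaults come from "datastores/inMemoryDatastore.yaml"), a resolved
# `DatasetRef` with an ID, and the Python object to store. Only the
# InMemoryDatastore calls mirror the API defined above.
#
#     store = InMemoryDatastore(config)   # ephemeral, process-local store
#     store.put(my_object, ref)           # keep the object in memory
#     assert store.exists(ref)            # visible only to this process
#     round_tripped = store.get(ref)      # the stored object (or a component)
#     print(store.getUri(ref))            # e.g. "mem://<id of the object>"
#     store.trash(ref)                    # mark for removal via the registry
#     store.emptyTrash()                  # actually delete the stored object
# ---------------------------------------------------------------------------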