Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py : 92%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Chained datastore."""

__all__ = ("ChainedDatastore",)

import time
import logging
import warnings
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    List,
    Iterable,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
)

from lsst.utils import doImport
from lsst.daf.butler import Datastore, DatastoreConfig, DatasetTypeNotSupportedError, \
    DatastoreValidationError, Constraints, FileDataset

if TYPE_CHECKING:
    from lsst.daf.butler import Config, DatasetRef, DatasetType, LookupKey, StorageClass
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)

class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for ChainedDatastore ingest implementation.

    Parameters
    ----------
    children : `list` of `tuple`
        Pairs of `Datastore`, `IngestPrepData` for all child datastores.
    """
    def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
        super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
        self.children = children

class ChainedDatastore(Datastore):
    """Chained datastores to allow reads and writes from multiple datastores.

    A ChainedDatastore is configured with multiple datastore configurations.
    A ``put()`` is always sent to each datastore.  A ``get()``
    operation is sent to each datastore in turn and the first datastore
    to return a valid dataset is used.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.  This configuration must include a ``datastores`` field
        as a sequence of datastore configurations.  The order in this sequence
        indicates the order to use for read operations.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.  This
        root is sent to each child datastore.

    Notes
    -----
    ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
    mode.  It supports `"copy"`, `"symlink"`, `"relsymlink"`
    and `"hardlink"` if and only if all its child datastores do.
    """

    defaultConfigFile = "datastores/chainedDatastore.yaml"
    """Path to configuration defaults.  Relative to $DAF_BUTLER_DIR/config or
    absolute path.  Can be None if no defaults specified.
    """

    containerKey = "datastores"
    """Key to specify where child datastores are configured."""

    datastores: List[Datastore]
    """All the child datastores known to this datastore."""

    datastoreConstraints: Sequence[Optional[Constraints]]
    """Constraints to be applied to each of the child datastores."""

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for child Datastores to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update.  Only the subset understood by
            this component will be updated.  Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`.  Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists.  Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """

        # Extract the part of the config we care about updating
        datastoreConfig = DatastoreConfig(config, mergeDefaults=False)

        # And the subset of the full config that we can use for reference.
        # Do not bother with defaults because we are told this already has
        # them.
        fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)

        # Loop over each datastore config and pass the subsets to the
        # child datastores to process.

        containerKey = cls.containerKey
        for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey],
                                                     fullDatastoreConfig[containerKey])):
            childConfig = DatastoreConfig(child, mergeDefaults=False)
            fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
            datastoreClass = doImport(fullChildConfig["cls"])
            newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
            datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)

            # Reattach to parent
            datastoreConfig[containerKey, idx] = childConfig

        # Reattach modified datastore config to parent
        # If this has a datastore key we attach there, otherwise we assume
        # this information goes at the top of the config hierarchy.
        if DatastoreConfig.component in config:
            config[DatastoreConfig.component] = datastoreConfig
        else:
            config.update(datastoreConfig)

        return

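    # For example (class names illustrative), with root="/repo" and two child
    # datastores whose classes are FileDatastore and InMemoryDatastore,
    # setConfigRoot() above passes "/repo/FileDatastore_0" and
    # "/repo/InMemoryDatastore_1" down to the children, following the
    # "{root}/{class qualname}_{index}" pattern.
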
    def __init__(self, config: Union[Config, str], bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: str = None):
        super().__init__(config, bridgeManager)

        # Scan for child datastores and instantiate them with the same registry
        self.datastores = []
        for c in self.config["datastores"]:
            c = DatastoreConfig(c)
            datastoreType = doImport(c["cls"])
            datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
            log.debug("Creating child datastore %s", datastore.name)
            self.datastores.append(datastore)

        # Name ourself based on our children
        if self.datastores:
            # We must set the names explicitly
            self._names = [d.name for d in self.datastores]
            childNames = ",".join(self.names)
        else:
            childNames = "(empty@{})".format(time.time())
            self._names = [childNames]
        self.name = "{}[{}]".format(type(self).__qualname__, childNames)

        # We declare we are ephemeral if all our child datastores declare
        # they are ephemeral
        isEphemeral = True
        for d in self.datastores:
            if not d.isEphemeral:
                isEphemeral = False
                break
        self.isEphemeral = isEphemeral

        # per-datastore override constraints
        if "datastore_constraints" in self.config:
            overrides = self.config["datastore_constraints"]

            if len(overrides) != len(self.datastores):
                raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})"
                                               " differs from number of constraints overrides"
                                               f" {len(overrides)}")

            self.datastoreConstraints = [Constraints(c.get("constraints"), universe=bridgeManager.universe)
                                         for c in overrides]

        else:
            self.datastoreConstraints = (None,) * len(self.datastores)

        log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))

    @property
    def names(self) -> Tuple[str, ...]:
        return tuple(self._names)

    def __str__(self) -> str:
        chainName = ", ".join(str(ds) for ds in self.datastores)
        return chainName

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in one of the datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in one of the child datastores.
        """
        for datastore in self.datastores:
            if datastore.exists(ref):
                log.debug("Found %s in datastore %s", ref, datastore.name)
                return True
        return False

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        The dataset is returned from the first datastore that has
        the dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        for datastore in self.datastores:
            try:
                inMemoryObject = datastore.get(ref, parameters)
                log.debug("Found dataset %s in datastore %s", ref, datastore.name)
                return inMemoryObject
            except FileNotFoundError:
                pass

        raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to each
        datastore.

        The put() to child datastores can fail with
        `DatasetTypeNotSupportedError`.  The put() for this datastore will be
        deemed to have succeeded so long as at least one child datastore
        accepted the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            All datastores reported `DatasetTypeNotSupportedError`.
        """
        log.debug("Put %s", ref)

        # Confirm that we can accept this dataset
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via"
                                               " configuration.")

        isPermanent = False
        nsuccess = 0
        npermanent = 0
        nephemeral = 0
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if constraints is not None and not constraints.isAcceptable(ref):
                log.debug("Datastore %s skipping put via configuration for ref %s",
                          datastore.name, ref)
                continue

            if datastore.isEphemeral:
                nephemeral += 1
            else:
                npermanent += 1
            try:
                datastore.put(inMemoryDataset, ref)
                nsuccess += 1
                if not datastore.isEphemeral:
                    isPermanent = True
            except DatasetTypeNotSupportedError:
                pass

        if nsuccess == 0:
            raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")

        if not isPermanent and npermanent > 0:
            warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

        if self._transaction is not None:
            self._transaction.registerUndo('put', self.remove, ref)

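    # Behaviour sketch for put() above, assuming a chain with one permanent
    # and one ephemeral child datastore: if both accept the ref, both store
    # it and no warning is emitted; if the permanent child is attempted but
    # raises DatasetTypeNotSupportedError while the ephemeral child succeeds,
    # the put succeeds with a warning; if every child rejects or skips the
    # ref, DatasetTypeNotSupportedError is raised.
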
    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        # Ask each datastore what they think auto means
        transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}

        # Remove any untranslated "auto" values
        transfers.discard(transfer)

        if len(transfers) == 1:
            return transfers.pop()
        if not transfers:
            # Everything reported "auto"
            return transfer

        raise RuntimeError("Chained datastore does not yet support different transfer modes"
                           f" from 'auto' in each child datastore (wanted {transfers})")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        if transfer is None or transfer == "move":
            raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")

        def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
            acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
            if not acceptable:
                log.debug("Datastore %s skipping ingest via configuration for refs %s",
                          name, ", ".join(str(ref) for ref in dataset.refs))
                return False
            else:
                return True

        # Filter down to just datasets the chained datastore's own
        # configuration accepts.
        okForParent: List[FileDataset] = [dataset for dataset in datasets
                                          if isDatasetAcceptable(dataset, name=self.name,
                                                                 constraints=self.constraints)]

        # Iterate over nested datastores and call _prepIngest on each.
        # Save the results to a list:
        children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
        # ...and remember whether all of the failures are due to
        # NotImplementedError being raised.
        allFailuresAreNotImplementedError = True
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            okForChild: List[FileDataset]
            if constraints is not None:
                okForChild = [dataset for dataset in okForParent
                              if isDatasetAcceptable(dataset, name=datastore.name,
                                                     constraints=constraints)]
            else:
                okForChild = okForParent
            try:
                prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
            except NotImplementedError:
                log.debug("Skipping ingest for datastore %s because transfer "
                          "mode %s is not supported.", datastore.name, transfer)
                continue
            allFailuresAreNotImplementedError = False
            children.append((datastore, prepDataForChild))
        if allFailuresAreNotImplementedError:
            raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
        return _IngestPrepData(children=children)

    def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None) -> None:
        # Docstring inherited from Datastore._finishIngest.
        for datastore, prepDataForChild in prepData.children:
            datastore._finishIngest(prepDataForChild, transfer=transfer)

    def getUri(self, ref: DatasetRef, predict: bool = False) -> str:
        """URI to the Dataset.

        The returned URI is from the first datastore in the list that has
        the dataset, with preference given to the first dataset coming from
        a permanent datastore.  If no datastores have the dataset and
        prediction is allowed, the predicted URI for the first datastore in
        the list will be returned.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore.  If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".

        Notes
        -----
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive.  The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """
        log.debug("Requesting URI for %s", ref)
        predictedUri: Optional[str] = None
        predictedEphemeralUri: Optional[str] = None
        firstEphemeralUri: Optional[str] = None
        for datastore in self.datastores:
            if datastore.exists(ref):
                if not datastore.isEphemeral:
                    uri = datastore.getUri(ref)
                    log.debug("Retrieved non-ephemeral URI: %s", uri)
                    return uri
                elif firstEphemeralUri is None:
                    firstEphemeralUri = datastore.getUri(ref)
            elif predict:
                if predictedUri is None and not datastore.isEphemeral:
                    predictedUri = datastore.getUri(ref, predict)
                elif predictedEphemeralUri is None and datastore.isEphemeral:
                    predictedEphemeralUri = datastore.getUri(ref, predict)

        if firstEphemeralUri is not None:
            log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
            return firstEphemeralUri

        if predictedUri is not None:
            log.debug("Retrieved predicted URI: %s", predictedUri)
            return predictedUri

        if predictedEphemeralUri is not None:
            log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
            return predictedEphemeralUri

        raise FileNotFoundError("Dataset {} not in any datastore".format(ref))

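    # Resolution order used by getUri() above: an existing copy in a
    # permanent child datastore wins, then an existing copy in an ephemeral
    # child, then (when predict=True) a predicted URI from a permanent child,
    # and finally a predicted URI from an ephemeral child.
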
    def remove(self, ref: DatasetRef) -> None:
        """Indicate to the datastore that a dataset can be removed.

        The dataset will be removed from each datastore.  The dataset is
        not required to exist in every child datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.  Raised if none
            of the child datastores removed the dataset.
        """
        log.debug("Removing %s", ref)
        self.trash(ref, ignore_errors=False)
        self.emptyTrash(ignore_errors=False)

    def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
        log.debug("Trashing %s", ref)

        counter = 0
        for datastore in self.datastores:
            try:
                datastore.trash(ref, ignore_errors=ignore_errors)
                counter += 1
            except FileNotFoundError:
                pass

        if counter == 0:
            err_msg = f"Could not mark for removal from any child datastore: {ref}"
            if ignore_errors:
                log.warning(err_msg)
            else:
                raise FileNotFoundError(err_msg)

    def emptyTrash(self, ignore_errors: bool = True) -> None:
        for datastore in self.datastores:
            datastore.emptyTrash(ignore_errors=ignore_errors)

    def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
        """Retrieve a dataset from an input `Datastore`,
        and store the result in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        ref : `DatasetRef`
            Reference to the required dataset in the input data store.
        """
        assert inputDatastore is not self  # unless we want it for renames?
        inMemoryDataset = inputDatastore.get(ref)
        self.put(inMemoryDataset, ref)

    def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
                              logFailures: bool = False) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration.  Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks each datastore in turn.
        """

        # Need to catch each of the datastore outputs and ensure that
        # all are tested.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateConfiguration(entities, logFailures=logFailures)
            except DatastoreValidationError as e:
                if logFailures:
                    log.fatal("Datastore %s failed validation", datastore.name)
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def validateKey(self, lookupKey: LookupKey,
                    entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateKey(lookupKey, entity)
            except DatastoreValidationError as e:
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class
        keys = set()
        for datastore in self.datastores:
            keys.update(datastore.getLookupKeys())

        keys.update(self.constraints.getLookupKeys())
        for p in self.datastoreConstraints:
            if p is not None:
                keys.update(p.getLookupKeys())

        return keys