Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py : 91%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Chained datastore."""

from __future__ import annotations

__all__ = ("ChainedDatastore",)

import time
import logging
import warnings
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    List,
    Iterable,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
)

from lsst.utils import doImport
from lsst.daf.butler import (
    ButlerURI,
    Datastore,
    DatastoreConfig,
    DatasetTypeNotSupportedError,
    DatastoreValidationError,
    Constraints,
    FileDataset,
)

if TYPE_CHECKING:
    from lsst.daf.butler import Config, DatasetRef, DatasetType, LookupKey, StorageClass
    from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridgeManager

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for ChainedDatastore ingest implementation.

    Parameters
    ----------
    children : `list` of `tuple`
        Pairs of `Datastore`, `IngestPrepData` for all child datastores.
    """
    def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
        super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
        self.children = children


class ChainedDatastore(Datastore):
    """Chained datastores to allow reads and writes from multiple datastores.

    A ChainedDatastore is configured with multiple datastore configurations.
    A ``put()`` is always sent to each datastore. A ``get()``
    operation is sent to each datastore in turn and the first datastore
    to return a valid dataset is used.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. This configuration must include a ``datastores`` field
        as a sequence of datastore configurations. The order in this sequence
        indicates the order to use for read operations.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value. This
        root is sent to each child datastore.

    Notes
    -----
    ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
    mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
    and `"hardlink"` if and only if all its child datastores do.
    """
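
    # Illustrative sketch only, not the shipped defaults: ``__init__`` below
    # reads ``self.config["datastores"]`` as a sequence of child
    # configurations, each providing a ``cls`` entry naming the child
    # datastore class.  A hand-written configuration could therefore look
    # roughly like this (the exact nesting inside a full Butler config file
    # may differ):
    #
    #     datastores:
    #       - cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
    #         root: <butlerRoot>
    #       - cls: lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore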

    defaultConfigFile = "datastores/chainedDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    containerKey = "datastores"
    """Key to specify where child datastores are configured."""

    datastores: List[Datastore]
    """All the child datastores known to this datastore."""

    datastoreConstraints: Sequence[Optional[Constraints]]
    """Constraints to be applied to each of the child datastores."""

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for child Datastores to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """

        # Extract the part of the config we care about updating
        datastoreConfig = DatastoreConfig(config, mergeDefaults=False)

        # And the subset of the full config that we can use for reference.
        # Do not bother with defaults because we are told this already has
        # them.
        fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)

        # Loop over each datastore config and pass the subsets to the
        # child datastores to process.

        containerKey = cls.containerKey
        for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey],
                                                     fullDatastoreConfig[containerKey])):
            childConfig = DatastoreConfig(child, mergeDefaults=False)
            fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
            datastoreClass = doImport(fullChildConfig["cls"])
            newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
            datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)

            # Reattach to parent
            datastoreConfig[containerKey, idx] = childConfig

        # Reattach modified datastore config to parent
        # If this has a datastore key we attach there, otherwise we assume
        # this information goes at the top of the config hierarchy.
        if DatastoreConfig.component in config:
            config[DatastoreConfig.component] = datastoreConfig
        else:
            config.update(datastoreConfig)

        return
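
    # For illustration only (hypothetical paths): with ``root="/repo"`` and two
    # children of classes ``FileDatastore`` and ``InMemoryDatastore``, the
    # ``newroot`` values passed to the children by ``setConfigRoot`` above
    # would be "/repo/FileDatastore_0" and "/repo/InMemoryDatastore_1".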

    def __init__(self, config: Union[Config, str], bridgeManager: DatastoreRegistryBridgeManager,
                 butlerRoot: Optional[str] = None):
        super().__init__(config, bridgeManager)

        # Scan for child datastores and instantiate them with the same registry
        self.datastores = []
        for c in self.config["datastores"]:
            c = DatastoreConfig(c)
            datastoreType = doImport(c["cls"])
            datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
            log.debug("Creating child datastore %s", datastore.name)
            self.datastores.append(datastore)

        # Name ourself based on our children
        if self.datastores:
            # We must set the names explicitly
            self._names = [d.name for d in self.datastores]
            childNames = ",".join(self.names)
        else:
            childNames = "(empty@{})".format(time.time())
            self._names = [childNames]
        self.name = "{}[{}]".format(type(self).__qualname__, childNames)

        # We declare we are ephemeral if all our child datastores declare
        # they are ephemeral
        isEphemeral = True
        for d in self.datastores:
            if not d.isEphemeral:
                isEphemeral = False
                break
        self.isEphemeral = isEphemeral

        # Per-datastore override constraints
        if "datastore_constraints" in self.config:
            overrides = self.config["datastore_constraints"]

            if len(overrides) != len(self.datastores):
                raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})"
                                               " differs from number of constraints overrides"
                                               f" ({len(overrides)})")

            self.datastoreConstraints = [Constraints(c.get("constraints"), universe=bridgeManager.universe)
                                         for c in overrides]

        else:
            self.datastoreConstraints = (None,) * len(self.datastores)

        log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
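
    # Illustrative sketch only: the optional ``datastore_constraints`` section
    # read in ``__init__`` is expected to be a sequence with one entry per
    # child datastore, each entry optionally carrying a ``constraints`` key
    # (an entry without one yields no per-datastore override).  Roughly:
    #
    #     datastore_constraints:
    #       - constraints:
    #           reject:
    #             - all
    #       - {}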

    @property
    def names(self) -> Tuple[str, ...]:
        return tuple(self._names)

    def __str__(self) -> str:
        chainName = ", ".join(str(ds) for ds in self.datastores)
        return chainName

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in one of the datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in one of the child datastores.
        """
        for datastore in self.datastores:
            if datastore.exists(ref):
                log.debug("Found %s in datastore %s", ref, datastore.name)
                return True
        return False

    def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Load an InMemoryDataset from the store.

        The dataset is returned from the first datastore that has
        the dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """

        for datastore in self.datastores:
            try:
                inMemoryObject = datastore.get(ref, parameters)
                log.debug("Found dataset %s in datastore %s", ref, datastore.name)
                return inMemoryObject
            except FileNotFoundError:
                pass

        raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to each
        datastore.

        The put() to child datastores can fail with
        `DatasetTypeNotSupportedError`. The put() for this datastore will be
        deemed to have succeeded so long as at least one child datastore
        accepted the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            All datastores reported `DatasetTypeNotSupportedError`.
        """
        log.debug("Put %s", ref)

        # Confirm that we can accept this dataset
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via"
                                               " configuration.")

        isPermanent = False
        nsuccess = 0
        npermanent = 0
        nephemeral = 0
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if constraints is not None and not constraints.isAcceptable(ref):
                log.debug("Datastore %s skipping put via configuration for ref %s",
                          datastore.name, ref)
                continue

            if datastore.isEphemeral:
                nephemeral += 1
            else:
                npermanent += 1
            try:
                datastore.put(inMemoryDataset, ref)
                nsuccess += 1
                if not datastore.isEphemeral:
                    isPermanent = True
            except DatasetTypeNotSupportedError:
                pass

        if nsuccess == 0:
            raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")

        if not isPermanent and npermanent > 0:
            warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

        if self._transaction is not None:
            self._transaction.registerUndo('put', self.remove, ref)
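
    # Hedged usage sketch (assumes ``chained`` is an instance of this class,
    # ``exposure`` is an in-memory dataset, and ``ref`` is a resolved
    # DatasetRef): ``put`` fans out to every child that accepts the ref and
    # only fails if no child accepted it.
    #
    #     try:
    #         chained.put(exposure, ref)      # stored in >= 1 child datastore
    #     except DatasetTypeNotSupportedError:
    #         ...  # every child (or the chain itself) rejected this dataset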

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        # Ask each datastore what they think auto means
        transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}

        # Remove any untranslated "auto" values
        transfers.discard(transfer)

        if len(transfers) == 1:
            return transfers.pop()
        if not transfers:
            # Everything reported "auto"
            return transfer

        raise RuntimeError("Chained datastore does not yet support different transfer modes"
                           f" from 'auto' in each child datastore (wanted {transfers})")
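
    # Worked example of the logic above: if every child translates "auto" to
    # "copy" the chain returns "copy"; if every child reports "auto" unchanged
    # the chain keeps "auto"; if the children disagree (e.g. {"copy",
    # "symlink"}) a RuntimeError is raised because mixed per-child transfer
    # modes are not yet supported.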

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        if transfer is None or transfer == "move":
            raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")

        def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
            acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
            if not acceptable:
                log.debug("Datastore %s skipping ingest via configuration for refs %s",
                          name, ", ".join(str(ref) for ref in dataset.refs))
                return False
            else:
                return True

        # Filter down to just datasets the chained datastore's own
        # configuration accepts.
        okForParent: List[FileDataset] = [dataset for dataset in datasets
                                          if isDatasetAcceptable(dataset, name=self.name,
                                                                 constraints=self.constraints)]

        # Iterate over nested datastores and call _prepIngest on each.
        # Save the results to a list:
        children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
        # ...and remember whether all of the failures are due to
        # NotImplementedError being raised.
        allFailuresAreNotImplementedError = True
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            okForChild: List[FileDataset]
            if constraints is not None:
                okForChild = [dataset for dataset in okForParent
                              if isDatasetAcceptable(dataset, name=datastore.name,
                                                     constraints=constraints)]
            else:
                okForChild = okForParent
            try:
                prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
            except NotImplementedError:
                log.debug("Skipping ingest for datastore %s because transfer "
                          "mode %s is not supported.", datastore.name, transfer)
                continue
            allFailuresAreNotImplementedError = False
            children.append((datastore, prepDataForChild))
        if allFailuresAreNotImplementedError:
            raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
        return _IngestPrepData(children=children)

    def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None) -> None:
        # Docstring inherited from Datastore._finishIngest.
        for datastore, prepDataForChild in prepData.children:
            datastore._finishIngest(prepDataForChild, transfer=transfer)
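
    # Sketch of the two-phase ingest flow, assuming the usual base-class
    # behaviour in which ``Datastore.ingest`` calls ``_prepIngest`` followed by
    # ``_finishIngest``: ``_prepIngest`` filters each FileDataset through the
    # chain-level and per-child constraints and collects per-child prep data,
    # and ``_finishIngest`` then replays that prep data into every child that
    # supported the requested transfer mode.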

    def getURIs(self, ref: DatasetRef,
                predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.

        Notes
        -----
        The returned URI is from the first datastore in the list that has
        the dataset, with preference given to the first dataset coming from
        a permanent datastore. If no datastores have the dataset and prediction
        is allowed, the predicted URI for the first datastore in the list will
        be returned.
        """
        DatastoreURIs = Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]
        log.debug("Requesting URIs for %s", ref)
        predictedUri: Optional[DatastoreURIs] = None
        predictedEphemeralUri: Optional[DatastoreURIs] = None
        firstEphemeralUri: Optional[DatastoreURIs] = None
        for datastore in self.datastores:
            if datastore.exists(ref):
                if not datastore.isEphemeral:
                    uri = datastore.getURIs(ref)
                    log.debug("Retrieved non-ephemeral URI: %s", uri)
                    return uri
                elif not firstEphemeralUri:
                    firstEphemeralUri = datastore.getURIs(ref)
            elif predict:
                if not predictedUri and not datastore.isEphemeral:
                    predictedUri = datastore.getURIs(ref, predict)
                elif not predictedEphemeralUri and datastore.isEphemeral:
                    predictedEphemeralUri = datastore.getURIs(ref, predict)

        if firstEphemeralUri:
            log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
            return firstEphemeralUri

        if predictedUri:
            log.debug("Retrieved predicted URI: %s", predictedUri)
            return predictedUri

        if predictedEphemeralUri:
            log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
            return predictedEphemeralUri

        raise FileNotFoundError("Dataset {} not in any datastore".format(ref))

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI:
        """URI to the Dataset.

        The returned URI is from the first datastore in the list that has
        the dataset, with preference given to the first dataset coming from
        a permanent datastore. If no datastores have the dataset and prediction
        is allowed, the predicted URI for the first datastore in the list will
        be returned.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".

        Notes
        -----
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        RuntimeError
            Raised if a request is made for a single URI but multiple URIs
            are associated with this dataset.
        """
        log.debug("Requesting URI for %s", ref)
        primary, components = self.getURIs(ref, predict)
        if primary is None or components:
            raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. "
                               "Use Datastore.getURIs() instead.")
        return primary

    def remove(self, ref: DatasetRef) -> None:
        """Indicate to the datastore that a dataset can be removed.

        The dataset will be removed from each datastore. The dataset is
        not required to exist in every child datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Raised if none
            of the child datastores removed the dataset.
        """
        log.debug("Removing %s", ref)
        self.trash(ref, ignore_errors=False)
        self.emptyTrash(ignore_errors=False)

    def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None:
        log.debug("Trashing %s", ref)

        counter = 0
        for datastore in self.datastores:
            try:
                datastore.trash(ref, ignore_errors=ignore_errors)
                counter += 1
            except FileNotFoundError:
                pass

        if counter == 0:
            err_msg = f"Could not mark for removal from any child datastore: {ref}"
            if ignore_errors:
                log.warning(err_msg)
            else:
                raise FileNotFoundError(err_msg)

    def emptyTrash(self, ignore_errors: bool = True) -> None:
        for datastore in self.datastores:
            datastore.emptyTrash(ignore_errors=ignore_errors)

    def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
        """Retrieve a dataset from an input `Datastore`,
        and store the result in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        ref : `DatasetRef`
            Reference to the required dataset in the input data store.
        """
        assert inputDatastore is not self  # unless we want it for renames?
        inMemoryDataset = inputDatastore.get(ref)
        self.put(inMemoryDataset, ref)

    def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]],
                              logFailures: bool = False) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks each datastore in turn.
        """

        # Need to catch each of the datastore outputs and ensure that
        # all are tested.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateConfiguration(entities, logFailures=logFailures)
            except DatastoreValidationError as e:
                if logFailures:
                    log.critical("Datastore %s failed validation", datastore.name)
                failures.append(f"Datastore {datastore.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def validateKey(self, lookupKey: LookupKey,
                    entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateKey(lookupKey, entity)
            except DatastoreValidationError as e:
                failures.append(f"Datastore {datastore.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class
        keys = set()
        for datastore in self.datastores:
            keys.update(datastore.getLookupKeys())

        keys.update(self.constraints.getLookupKeys())
        for p in self.datastoreConstraints:
            if p is not None:
                keys.update(p.getLookupKeys())

        return keys