Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 91% (319 statements)
coverage.py v6.4.4, created at 2022-09-30 02:18 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""Chained datastore."""
26__all__ = ("ChainedDatastore",)
28import itertools
29import logging
30import time
31import warnings
32from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union
34from lsst.daf.butler import (
35 Constraints,
36 DatasetRef,
37 DatasetRefURIs,
38 DatasetTypeNotSupportedError,
39 Datastore,
40 DatastoreConfig,
41 DatastoreRecordData,
42 DatastoreValidationError,
43 FileDataset,
44)
45from lsst.resources import ResourcePath
46from lsst.utils import doImportType
48if TYPE_CHECKING:  # coverage: branch 48 ↛ 49 never taken (condition never true)
49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
52log = logging.getLogger(__name__)
55class _IngestPrepData(Datastore.IngestPrepData):
56 """Helper class for ChainedDatastore ingest implementation.
58 Parameters
59 ----------
60 children : `list` of `tuple`
61 Pairs of `Datastore`, `IngestPrepData` for all child datastores.
62 """
64 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
65 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
66 self.children = children
69class ChainedDatastore(Datastore):
70 """Chained Datastores to allow read and writes from multiple datastores.
72 A ChainedDatastore is configured with multiple datastore configurations.
73 A ``put()`` is sent to every child datastore whose constraints accept
74 the dataset. A ``get()`` operation is sent to each datastore in turn and
75 the first datastore to return a valid dataset is used.
77 Parameters
78 ----------
79 config : `DatastoreConfig` or `str`
80 Configuration. This configuration must include a ``datastores`` field
81 as a sequence of datastore configurations. The order in this sequence
82 indicates the order to use for read operations.
83 bridgeManager : `DatastoreRegistryBridgeManager`
84 Object that manages the interface between `Registry` and datastores.
85 butlerRoot : `str`, optional
86 New datastore root to use to override the configuration value. This
87 root is sent to each child datastore.
89 Notes
90 -----
91 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
92 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"` and `"hardlink"`
93 as long as at least one child datastore does; children that do not are skipped.
94 """
96 defaultConfigFile = "datastores/chainedDatastore.yaml"
97 """Path to configuration defaults. Accessed within the ``configs`` resource
98 or relative to a search path. Can be None if no defaults specified.
99 """
101 containerKey = "datastores"
102 """Key to specify where child datastores are configured."""
104 datastores: List[Datastore]
105 """All the child datastores known to this datastore."""
107 datastoreConstraints: Sequence[Optional[Constraints]]
108 """Constraints to be applied to each of the child datastores."""
110 @classmethod
111 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
112 """Set any filesystem-dependent config options for child Datastores to
113 be appropriate for a new empty repository with the given root.
115 Parameters
116 ----------
117 root : `str`
118 Filesystem path to the root of the data repository.
119 config : `Config`
120 A `Config` to update. Only the subset understood by
121 this component will be updated. Will not expand
122 defaults.
123 full : `Config`
124 A complete config with all defaults expanded that can be
125 converted to a `DatastoreConfig`. Read-only and will not be
126 modified by this method.
127 Repository-specific options that should not be obtained
128 from defaults when Butler instances are constructed
129 should be copied from ``full`` to ``config``.
130 overwrite : `bool`, optional
131 If `False`, do not modify a value in ``config`` if the value
132 already exists. Default is always to overwrite with the provided
133 ``root``.
135 Notes
136 -----
137 If a keyword is explicitly defined in the supplied ``config`` it
138 will not be overridden by this method if ``overwrite`` is `False`.
139 This allows explicit values set in external configs to be retained.
140 """
142 # Extract the part of the config we care about updating
143 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
145 # And the subset of the full config that we can use for reference.
146 # Do not bother with defaults because we are told this already has
147 # them.
148 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
150 # Loop over each datastore config and pass the subsets to the
151 # child datastores to process.
153 containerKey = cls.containerKey
154 for idx, (child, fullChild) in enumerate(
155 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey])
156 ):
157 childConfig = DatastoreConfig(child, mergeDefaults=False)
158 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
159 datastoreClass = doImportType(fullChildConfig["cls"])
160 if not issubclass(datastoreClass, Datastore):  # coverage: branch 160 ↛ 161 never taken (condition never true)
161 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
162 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
163 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
165 # Reattach to parent
166 datastoreConfig[containerKey, idx] = childConfig
168 # Reattach modified datastore config to parent
169 # If this has a datastore key we attach there, otherwise we assume
170 # this information goes at the top of the config hierarchy.
171 if DatastoreConfig.component in config:
172 config[DatastoreConfig.component] = datastoreConfig
173 else:
174 config.update(datastoreConfig)
176 return
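As a small sketch of the root handling above (the values are hypothetical), each child receives its own root derived from the parent root, the child class name and its index in the chain:

# Hypothetical values illustrating the newroot computation above.
root = "/repo/main"
for idx, child_cls_name in enumerate(["FileDatastore", "InMemoryDatastore"]):
    print("{}/{}_{}".format(root, child_cls_name, idx))
# prints:
#   /repo/main/FileDatastore_0
#   /repo/main/InMemoryDatastore_1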
178 def __init__(
179 self,
180 config: Union[Config, str],
181 bridgeManager: DatastoreRegistryBridgeManager,
182 butlerRoot: Optional[str] = None,
183 ):
184 super().__init__(config, bridgeManager)
186 # Scan for child datastores and instantiate them with the same registry
187 self.datastores = []
188 for c in self.config["datastores"]:
189 c = DatastoreConfig(c)
190 datastoreType = doImportType(c["cls"])
191 if not issubclass(datastoreType, Datastore):  # coverage: branch 191 ↛ 192 never taken (condition never true)
192 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
193 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
194 log.debug("Creating child datastore %s", datastore.name)
195 self.datastores.append(datastore)
197 # Name ourself based on our children
198 if self.datastores:  # coverage: branch 198 ↛ 203 never taken (condition never false)
199 # We must set the names explicitly
200 self._names = [d.name for d in self.datastores]
201 childNames = ",".join(self.names)
202 else:
203 childNames = "(empty@{})".format(time.time())
204 self._names = [childNames]
205 self.name = "{}[{}]".format(type(self).__qualname__, childNames)
207 # We declare we are ephemeral if all our child datastores declare
208 # they are ephemeral
209 isEphemeral = True
210 for d in self.datastores:
211 if not d.isEphemeral:
212 isEphemeral = False
213 break
214 self.isEphemeral = isEphemeral
216 # per-datastore override constraints
217 if "datastore_constraints" in self.config:
218 overrides = self.config["datastore_constraints"]
220 if len(overrides) != len(self.datastores):  # coverage: branch 220 ↛ 221 never taken (condition never true)
221 raise DatastoreValidationError(
222 f"Number of registered datastores ({len(self.datastores)})"
223 " differs from number of constraints overrides"
224 f" {len(overrides)}"
225 )
227 self.datastoreConstraints = [
228 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
229 ]
231 else:
232 self.datastoreConstraints = (None,) * len(self.datastores)
234 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
236 @property
237 def names(self) -> Tuple[str, ...]:
238 return tuple(self._names)
240 def __str__(self) -> str:
241 chainName = ", ".join(str(ds) for ds in self.datastores)
242 return chainName
244 def knows(self, ref: DatasetRef) -> bool:
245 """Check if the dataset is known to any of the datastores.
247 Does not check for existence of any artifact.
249 Parameters
250 ----------
251 ref : `DatasetRef`
252 Reference to the required dataset.
254 Returns
255 -------
256 exists : `bool`
257 `True` if the dataset is known to the datastore.
258 """
259 for datastore in self.datastores:
260 if datastore.knows(ref):
261 log.debug("%s known to datastore %s", ref, datastore.name)
262 return True
263 return False
265 def mexists(
266 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
267 ) -> Dict[DatasetRef, bool]:
268 """Check the existence of multiple datasets at once.
270 Parameters
271 ----------
272 refs : iterable of `DatasetRef`
273 The datasets to be checked.
274 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
275 Optional mapping of datastore artifact to existence. Updated by
276 this method with details of all artifacts tested. Can be `None`
277 if the caller is not interested.
279 Returns
280 -------
281 existence : `dict` [`DatasetRef`, `bool`]
282 Mapping from dataset to boolean indicating existence in any
283 of the child datastores.
284 """
285 dataset_existence: Dict[DatasetRef, bool] = {}
286 for datastore in self.datastores:
287 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
289 # For next datastore no point asking about ones we know
290 # exist already. No special exemption for ephemeral datastores.
291 refs = [ref for ref, exists in dataset_existence.items() if not exists]
293 return dataset_existence
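For example (a sketch; ``chain`` and ``refs`` are assumed to exist already), bulk existence checks can share an artifact-existence cache between calls:

# Sketch only: "chain" is a configured ChainedDatastore and "refs" an
# iterable of resolved DatasetRef objects.
artifact_cache = {}  # updated in place with every artifact tested
existence = chain.mexists(refs, artifact_existence=artifact_cache)
missing = [ref for ref, exists in existence.items() if not exists]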
295 def exists(self, ref: DatasetRef) -> bool:
296 """Check if the dataset exists in one of the datastores.
298 Parameters
299 ----------
300 ref : `DatasetRef`
301 Reference to the required dataset.
303 Returns
304 -------
305 exists : `bool`
306 `True` if the entity exists in one of the child datastores.
307 """
308 for datastore in self.datastores:
309 if datastore.exists(ref):
310 log.debug("Found %s in datastore %s", ref, datastore.name)
311 return True
312 return False
314 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any:
315 """Load an InMemoryDataset from the store.
317 The dataset is returned from the first datastore that has
318 the dataset.
320 Parameters
321 ----------
322 ref : `DatasetRef`
323 Reference to the required Dataset.
324 parameters : `dict`
325 `StorageClass`-specific parameters that specify, for example,
326 a slice of the dataset to be loaded.
328 Returns
329 -------
330 inMemoryDataset : `object`
331 Requested dataset or slice thereof as an InMemoryDataset.
333 Raises
334 ------
335 FileNotFoundError
336 Requested dataset can not be retrieved.
337 TypeError
338 Return value from formatter has unexpected type.
339 ValueError
340 Formatter failed to process the dataset.
341 """
343 for datastore in self.datastores:
344 try:
345 inMemoryObject = datastore.get(ref, parameters)
346 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
347 return inMemoryObject
348 except FileNotFoundError:
349 pass
351 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))
353 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
354 """Write a InMemoryDataset with a given `DatasetRef` to each
355 datastore.
357 The put() to child datastores can fail with
358 `DatasetTypeNotSupportedError`. The put() for this datastore will be
359 deemed to have succeeded so long as at least one child datastore
360 accepted the inMemoryDataset.
362 Parameters
363 ----------
364 inMemoryDataset : `object`
365 The dataset to store.
366 ref : `DatasetRef`
367 Reference to the associated Dataset.
369 Raises
370 ------
371 TypeError
372 Supplied object and storage class are inconsistent.
373 DatasetTypeNotSupportedError
374 All datastores reported `DatasetTypeNotSupportedError`.
375 """
376 log.debug("Put %s", ref)
378 # Confirm that we can accept this dataset
379 if not self.constraints.isAcceptable(ref):
380 # Raise rather than use boolean return value.
381 raise DatasetTypeNotSupportedError(
382 f"Dataset {ref} has been rejected by this datastore via configuration."
383 )
385 isPermanent = False
386 nsuccess = 0
387 npermanent = 0
388 nephemeral = 0
389 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
390 if constraints is not None and not constraints.isAcceptable(ref):
391 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
392 continue
394 if datastore.isEphemeral:
395 nephemeral += 1
396 else:
397 npermanent += 1
398 try:
399 datastore.put(inMemoryDataset, ref)
400 nsuccess += 1
401 if not datastore.isEphemeral:
402 isPermanent = True
403 except DatasetTypeNotSupportedError:
404 pass
406 if nsuccess == 0:
407 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
409 if not isPermanent and npermanent > 0:  # coverage: branch 409 ↛ 410 never taken (condition never true)
410 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
412 if self._transaction is not None:
413 self._transaction.registerUndo("put", self.remove, ref)
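A usage sketch of the fan-out described above (``chain``, ``dataset`` and ``ref`` are assumed): the put succeeds if at least one child accepts the dataset, and a later get is served by the first child holding it:

# Sketch only; requires a configured ChainedDatastore, an in-memory
# dataset and a resolved DatasetRef.
chain.put(dataset, ref)      # written to every child that accepts it
assert chain.exists(ref)
roundtrip = chain.get(ref)   # returned by the first child that has it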
415 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
416 # Docstring inherited from base class.
417 if transfer != "auto":
418 return transfer
419 # Ask each datastore what they think auto means
420 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
422 # Remove any untranslated "auto" values
423 transfers.discard(transfer)
425 if len(transfers) == 1:  # coverage: branch 425 ↛ 426 never taken (condition never true)
426 return transfers.pop()
427 if not transfers:  # coverage: branch 427 ↛ 431 never taken (condition never false)
428 # Everything reported "auto"
429 return transfer
431 raise RuntimeError(
432 "Chained datastore does not yet support different transfer modes"
433 f" from 'auto' in each child datastore (wanted {transfers})"
434 )
436 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
437 # Docstring inherited from Datastore._prepIngest.
438 if transfer is None or transfer == "move":
439 raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")
441 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
442 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
443 if not acceptable:
444 log.debug(
445 "Datastore %s skipping ingest via configuration for refs %s",
446 name,
447 ", ".join(str(ref) for ref in dataset.refs),
448 )
449 return False
450 else:
451 return True
453 # Filter down to just datasets the chained datastore's own
454 # configuration accepts.
455 okForParent: List[FileDataset] = [
456 dataset
457 for dataset in datasets
458 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
459 ]
461 # Iterate over nested datastores and call _prepIngest on each.
462 # Save the results to a list:
463 children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
464 # ...and remember whether all of the failures are due to
465 # NotImplementedError being raised.
466 allFailuresAreNotImplementedError = True
467 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
468 okForChild: List[FileDataset]
469 if constraints is not None:
470 okForChild = [
471 dataset
472 for dataset in okForParent
473 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
474 ]
475 else:
476 okForChild = okForParent
477 try:
478 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
479 except NotImplementedError:
480 log.debug(
481 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
482 datastore.name,
483 transfer,
484 )
485 continue
486 allFailuresAreNotImplementedError = False
487 children.append((datastore, prepDataForChild))
488 if allFailuresAreNotImplementedError:
489 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
490 return _IngestPrepData(children=children)
492 def _finishIngest(
493 self,
494 prepData: _IngestPrepData,
495 *,
496 transfer: Optional[str] = None,
497 record_validation_info: bool = True,
498 ) -> None:
499 # Docstring inherited from Datastore._finishIngest.
500 for datastore, prepDataForChild in prepData.children:
501 datastore._finishIngest(
502 prepDataForChild, transfer=transfer, record_validation_info=record_validation_info
503 )
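These two hooks are driven by the inherited ``Datastore.ingest()`` entry point; a hedged usage sketch follows (the file path is a placeholder and ``chain`` and ``ref`` are assumed):

from lsst.daf.butler import FileDataset

# Sketch only: ingest an external file into every child that supports
# the requested transfer mode; transfer=None and "move" are rejected.
chain.ingest(
    FileDataset(path="/data/external/file.fits", refs=[ref]),  # placeholder path
    transfer="copy",
)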
505 def getManyURIs(
506 self,
507 refs: Iterable[DatasetRef],
508 predict: bool = False,
509 allow_missing: bool = False,
510 ) -> Dict[DatasetRef, DatasetRefURIs]:
511 # Docstring inherited
513 uris: Dict[DatasetRef, DatasetRefURIs] = {}
514 missing_refs = set(refs)
516 # If predict is True we don't want to predict a dataset in the first
517 # datastore if it actually exists in a later datastore, so in that
518 # case check all datastores with predict=False first, and then try
519 # again with predict=True.
520 for p in (False, True) if predict else (False,):
521 if not missing_refs:
522 break
523 for datastore in self.datastores:
524 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
525 missing_refs -= got_uris.keys()
526 uris.update(got_uris)
527 if not missing_refs:
528 break
530 if missing_refs and not allow_missing:
531 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
533 return uris
535 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
536 """Return URIs associated with dataset.
538 Parameters
539 ----------
540 ref : `DatasetRef`
541 Reference to the required dataset.
542 predict : `bool`, optional
543 If the datastore does not know about the dataset, should it
544 return a predicted URI or not?
546 Returns
547 -------
548 uris : `DatasetRefURIs`
549 The URI to the primary artifact associated with this dataset (if
550 the dataset was disassembled within the datastore this may be
551 `None`), and the URIs to any components associated with the dataset
552 artifact (this can be empty if there are no components).
554 Notes
555 -----
556 The returned URI is from the first datastore in the list that has
557 the dataset with preference given to the first dataset coming from
558 a permanent datastore. If no datastores have the dataset and prediction
559 is allowed, the predicted URI for the first datastore in the list will
560 be returned.
561 """
562 log.debug("Requesting URIs for %s", ref)
563 predictedUri: Optional[DatasetRefURIs] = None
564 predictedEphemeralUri: Optional[DatasetRefURIs] = None
565 firstEphemeralUri: Optional[DatasetRefURIs] = None
566 for datastore in self.datastores:
567 if datastore.exists(ref):
568 if not datastore.isEphemeral:
569 uri = datastore.getURIs(ref)
570 log.debug("Retrieved non-ephemeral URI: %s", uri)
571 return uri
572 elif not firstEphemeralUri:
573 firstEphemeralUri = datastore.getURIs(ref)
574 elif predict:
575 if not predictedUri and not datastore.isEphemeral:
576 predictedUri = datastore.getURIs(ref, predict)
577 elif not predictedEphemeralUri and datastore.isEphemeral:
578 predictedEphemeralUri = datastore.getURIs(ref, predict)
580 if firstEphemeralUri:
581 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
582 return firstEphemeralUri
584 if predictedUri:
585 log.debug("Retrieved predicted URI: %s", predictedUri)
586 return predictedUri
588 if predictedEphemeralUri:
589 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
590 return predictedEphemeralUri
592 raise FileNotFoundError("Dataset {} not in any datastore".format(ref))
594 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
595 """URI to the Dataset.
597 The returned URI is from the first datastore in the list that has
598 the dataset with preference given to the first dataset coming from
599 a permanent datastore. If no datastores have the dataset and prediction
600 is allowed, the predicted URI for the first datastore in the list will
601 be returned.
603 Parameters
604 ----------
605 ref : `DatasetRef`
606 Reference to the required Dataset.
607 predict : `bool`
608 If `True`, allow URIs to be returned of datasets that have not
609 been written.
611 Returns
612 -------
613 uri : `lsst.resources.ResourcePath`
614 URI pointing to the dataset within the datastore. If the
615 dataset does not exist in the datastore, and if ``predict`` is
616 `True`, the URI will be a prediction and will include a URI
617 fragment "#predicted".
619 Notes
620 -----
621 If the datastore does not have entities that relate well
622 to the concept of a URI the returned URI string will be
623 descriptive. The returned URI is not guaranteed to be obtainable.
625 Raises
626 ------
627 FileNotFoundError
628 A URI has been requested for a dataset that does not exist and
629 guessing is not allowed.
630 RuntimeError
631 Raised if a request is made for a single URI but multiple URIs
632 are associated with this dataset.
633 """
634 log.debug("Requesting URI for %s", ref)
635 primary, components = self.getURIs(ref, predict)
636 if primary is None or components:  # coverage: branch 636 ↛ 637 never taken (condition never true)
637 raise RuntimeError(
638 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
639 )
640 return primary
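For example (a sketch; ``chain`` and ``ref`` are assumed), a URI can be requested with prediction enabled for a dataset that may not have been written yet:

# Sketch only.  With predict=True a URI is returned even if the dataset
# has not been written; predicted URIs carry a "#predicted" fragment.
uri = chain.getURI(ref, predict=True)
print(uri)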
642 def retrieveArtifacts(
643 self,
644 refs: Iterable[DatasetRef],
645 destination: ResourcePath,
646 transfer: str = "auto",
647 preserve_path: bool = True,
648 overwrite: bool = False,
649 ) -> List[ResourcePath]:
650 """Retrieve the file artifacts associated with the supplied refs.
652 Parameters
653 ----------
654 refs : iterable of `DatasetRef`
655 The datasets for which file artifacts are to be retrieved.
656 A single ref can result in multiple files. The refs must
657 be resolved.
658 destination : `lsst.resources.ResourcePath`
659 Location to write the file artifacts.
660 transfer : `str`, optional
661 Method to use to transfer the artifacts. Must be one of the options
662 supported by `lsst.resources.ResourcePath.transfer_from()`.
663 "move" is not allowed.
664 preserve_path : `bool`, optional
665 If `True` the full path of the file artifact within the datastore
666 is preserved. If `False` the final file component of the path
667 is used.
668 overwrite : `bool`, optional
669 If `True` allow transfers to overwrite existing files at the
670 destination.
672 Returns
673 -------
674 targets : `list` of `lsst.resources.ResourcePath`
675 URIs of file artifacts in destination location. Order is not
676 preserved.
677 """
678 if not destination.isdir():  # coverage: branch 678 ↛ 679 never taken (condition never true)
679 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
681 # Using getURIs is not feasible since it becomes difficult to
682 # determine the path within the datastore later on. For now
683 # follow getURIs implementation approach.
685 pending = set(refs)
687 # There is a question as to whether an exception should be raised
688 # early if some of the refs are missing, or whether files should be
689 # transferred until a problem is hit. Prefer to complain up front.
690 # Use the datastore integer as primary key.
691 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {}
693 for number, datastore in enumerate(self.datastores):
694 if datastore.isEphemeral:
695 # In the future we will want to distinguish in-memory from
696 # caching datastore since using an on-disk local
697 # cache is exactly what we should be doing.
698 continue
699 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
701 if datastore_refs:
702 grouped_by_datastore[number] = datastore_refs
704 # Remove these from the pending list so that we do not bother
705 # looking for them any more.
706 pending = pending - datastore_refs
708 if pending:  # coverage: branch 708 ↛ 709 never taken (condition never true)
709 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
711 # Now do the transfer.
712 targets: List[ResourcePath] = []
713 for number, datastore_refs in grouped_by_datastore.items():
714 targets.extend(
715 self.datastores[number].retrieveArtifacts(
716 datastore_refs,
717 destination,
718 transfer=transfer,
719 preserve_path=preserve_path,
720 overwrite=overwrite,
721 )
722 )
724 return targets
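A usage sketch (the destination path is a placeholder; ``chain`` and ``refs`` are assumed): artifacts are copied out of the first non-ephemeral child that holds each dataset:

from lsst.resources import ResourcePath

# Sketch only: copy artifacts for "refs" into a local directory,
# preserving their datastore-relative paths.
destination = ResourcePath("/tmp/exported_artifacts/", forceDirectory=True)
targets = chain.retrieveArtifacts(refs, destination, transfer="copy", preserve_path=True)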
726 def remove(self, ref: DatasetRef) -> None:
727 """Indicate to the datastore that a dataset can be removed.
729 The dataset will be removed from each datastore. The dataset is
730 not required to exist in every child datastore.
732 Parameters
733 ----------
734 ref : `DatasetRef`
735 Reference to the required dataset.
737 Raises
738 ------
739 FileNotFoundError
740 Attempt to remove a dataset that does not exist. Raised if none
741 of the child datastores removed the dataset.
742 """
743 log.debug("Removing %s", ref)
744 self.trash(ref, ignore_errors=False)
745 self.emptyTrash(ignore_errors=False)
747 def forget(self, refs: Iterable[DatasetRef]) -> None:
748 for datastore in tuple(self.datastores):
749 datastore.forget(refs)
751 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
752 if isinstance(ref, DatasetRef):
753 ref_label = str(ref)
754 else:
755 ref_label = "bulk datasets"
757 log.debug("Trashing %s", ref_label)
759 counter = 0
760 for datastore in self.datastores:
761 try:
762 datastore.trash(ref, ignore_errors=ignore_errors)
763 counter += 1
764 except FileNotFoundError:
765 pass
767 if counter == 0:
768 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
769 if ignore_errors:  # coverage: branch 769 ↛ 770 never taken (condition never true)
770 log.warning(err_msg)
771 else:
772 raise FileNotFoundError(err_msg)
774 def emptyTrash(self, ignore_errors: bool = True) -> None:
775 for datastore in self.datastores:
776 datastore.emptyTrash(ignore_errors=ignore_errors)
778 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
779 """Retrieve a dataset from an input `Datastore`,
780 and store the result in this `Datastore`.
782 Parameters
783 ----------
784 inputDatastore : `Datastore`
785 The external `Datastore` from which to retrieve the Dataset.
786 ref : `DatasetRef`
787 Reference to the required dataset in the input data store.
794 """
795 assert inputDatastore is not self # unless we want it for renames?
796 inMemoryDataset = inputDatastore.get(ref)
797 self.put(inMemoryDataset, ref)
799 def validateConfiguration(
800 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
801 ) -> None:
802 """Validate some of the configuration for this datastore.
804 Parameters
805 ----------
806 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
807 Entities to test against this configuration. Can be differing
808 types.
809 logFailures : `bool`, optional
810 If `True`, output a log message for every validation error
811 detected.
813 Raises
814 ------
815 DatastoreValidationError
816 Raised if there is a validation problem with a configuration.
817 All the problems are reported in a single exception.
819 Notes
820 -----
821 This method checks each datastore in turn.
822 """
824 # Need to catch each of the datastore outputs and ensure that
825 # all are tested.
826 failures = []
827 for datastore in self.datastores:
828 try:
829 datastore.validateConfiguration(entities, logFailures=logFailures)
830 except DatastoreValidationError as e:
831 if logFailures:  # coverage: branch 831 ↛ 833 never taken (condition never false)
832 log.critical("Datastore %s failed validation", datastore.name)
833 failures.append(f"Datastore {self.name}: {e}")
835 if failures:
836 msg = ";\n".join(failures)
837 raise DatastoreValidationError(msg)
839 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
840 # Docstring is inherited from base class
841 failures = []
842 for datastore in self.datastores:
843 try:
844 datastore.validateKey(lookupKey, entity)
845 except DatastoreValidationError as e:
846 failures.append(f"Datastore {self.name}: {e}")
848 if failures:
849 msg = ";\n".join(failures)
850 raise DatastoreValidationError(msg)
852 def getLookupKeys(self) -> Set[LookupKey]:
853 # Docstring is inherited from base class
854 keys = set()
855 for datastore in self.datastores:
856 keys.update(datastore.getLookupKeys())
858 keys.update(self.constraints.getLookupKeys())
859 for p in self.datastoreConstraints:
860 if p is not None:  # coverage: branch 860 ↛ 861 never taken (condition never true)
861 keys.update(p.getLookupKeys())
863 return keys
865 def needs_expanded_data_ids(
866 self,
867 transfer: Optional[str],
868 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
869 ) -> bool:
870 # Docstring inherited.
871 # We can't safely use `self.datastoreConstraints` with `entity` to
872 # check whether a child datastore would even want to ingest this
873 # dataset, because we don't want to filter out datastores that might
874 # need an expanded data ID based in incomplete information (e.g. we
875 # pass a StorageClass, but the constraint dispatches on DatasetType).
876 # So we pessimistically check if any datastore would need an expanded
877 # data ID for this transfer mode.
878 return any(datastore.needs_expanded_data_ids(transfer) for datastore in self.datastores)  # coverage: generator expression never ran to completion
880 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
881 # Docstring inherited from the base class.
883 for datastore in self.datastores:
884 datastore.import_records(data)
886 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
887 # Docstring inherited from the base class.
889 all_records: Dict[str, DatastoreRecordData] = {}
891 # Merge all sub-datastore records into one structure
892 for datastore in self.datastores:
893 sub_records = datastore.export_records(refs)
894 for name, record_data in sub_records.items():
895 # All datastore names must be unique in a chain.
896 if name in all_records:  # coverage: branch 896 ↛ 897 never taken (condition never true)
897 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
898 all_records[name] = record_data
900 return all_records
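Finally, a round-trip sketch of the record export/import pair above (``source_chain``, ``target_chain`` and ``refs`` are assumed); child datastore names must be unique within a chain for the merge to succeed:

# Sketch only: export per-datastore records for some refs and import
# them into another chained datastore with matching child datastores.
records = source_chain.export_records(refs)
target_chain.import_records(records)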