Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 91%
319 statements
coverage.py v6.5.0, created at 2022-10-07 02:46 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""Chained datastore."""
26__all__ = ("ChainedDatastore",)
28import itertools
29import logging
30import time
31import warnings
32from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union
34from lsst.daf.butler import (
35 Constraints,
36 DatasetRef,
37 DatasetRefURIs,
38 DatasetTypeNotSupportedError,
39 Datastore,
40 DatastoreConfig,
41 DatastoreRecordData,
42 DatastoreValidationError,
43 FileDataset,
44)
45from lsst.resources import ResourcePath
46from lsst.utils import doImportType
48if TYPE_CHECKING: 48 ↛ 49 (line 48 didn't jump to line 49, because the condition on line 48 was never true)
49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
52log = logging.getLogger(__name__)
55class _IngestPrepData(Datastore.IngestPrepData):
56 """Helper class for ChainedDatastore ingest implementation.
58 Parameters
59 ----------
60 children : `list` of `tuple`
61 Pairs of `Datastore`, `IngestPrepData` for all child datastores.
62 """
64 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
65 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
66 self.children = children
69class ChainedDatastore(Datastore):
70 """Chained Datastores to allow read and writes from multiple datastores.
72 A ChainedDatastore is configured with multiple datastore configurations.
73 A ``put()`` is always sent to each datastore. A ``get()``
74 operation is sent to each datastore in turn and the first datastore
75 to return a valid dataset is used.
77 Parameters
78 ----------
79 config : `DatastoreConfig` or `str`
80 Configuration. This configuration must include a ``datastores`` field
81 as a sequence of datastore configurations. The order in this sequence
82 indicates the order to use for read operations.
83 bridgeManager : `DatastoreRegistryBridgeManager`
84 Object that manages the interface between `Registry` and datastores.
85 butlerRoot : `str`, optional
86 New datastore root to use to override the configuration value. This
87 root is sent to each child datastore.
89 Notes
90 -----
91 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
92 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
93 and `"hardlink"` if and only if all its child datastores do.
94 """
96 defaultConfigFile = "datastores/chainedDatastore.yaml"
97 """Path to configuration defaults. Accessed within the ``configs`` resource
98 or relative to a search path. Can be None if no defaults specified.
99 """
101 containerKey = "datastores"
102 """Key to specify where child datastores are configured."""
104 datastores: List[Datastore]
105 """All the child datastores known to this datastore."""
107 datastoreConstraints: Sequence[Optional[Constraints]]
108 """Constraints to be applied to each of the child datastores."""
110 @classmethod
111 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
112 """Set any filesystem-dependent config options for child Datastores to
113 be appropriate for a new empty repository with the given root.
115 Parameters
116 ----------
117 root : `str`
118 Filesystem path to the root of the data repository.
119 config : `Config`
120 A `Config` to update. Only the subset understood by
121 this component will be updated. Will not expand
122 defaults.
123 full : `Config`
124 A complete config with all defaults expanded that can be
125 converted to a `DatastoreConfig`. Read-only and will not be
126 modified by this method.
127 Repository-specific options that should not be obtained
128 from defaults when Butler instances are constructed
129 should be copied from ``full`` to ``config``.
130 overwrite : `bool`, optional
131 If `False`, do not modify a value in ``config`` if the value
132 already exists. Default is always to overwrite with the provided
133 ``root``.
135 Notes
136 -----
137 If a keyword is explicitly defined in the supplied ``config`` it
138 will not be overridden by this method if ``overwrite`` is `False`.
139 This allows explicit values set in external configs to be retained.
140 """
142 # Extract the part of the config we care about updating
143 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
145 # And the subset of the full config that we can use for reference.
146 # Do not bother with defaults because we are told this already has
147 # them.
148 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
150 # Loop over each datastore config and pass the subsets to the
151 # child datastores to process.
153 containerKey = cls.containerKey
154 for idx, (child, fullChild) in enumerate(
155 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey])
156 ):
157 childConfig = DatastoreConfig(child, mergeDefaults=False)
158 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
159 datastoreClass = doImportType(fullChildConfig["cls"])
160 if not issubclass(datastoreClass, Datastore): 160 ↛ 161 (line 160 didn't jump to line 161, because the condition on line 160 was never true)
161 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
162 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
163 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
165 # Reattach to parent
166 datastoreConfig[containerKey, idx] = childConfig
168 # Reattach modified datastore config to parent
169 # If this has a datastore key we attach there, otherwise we assume
170 # this information goes at the top of the config hierarchy.
171 if DatastoreConfig.component in config:
172 config[DatastoreConfig.component] = datastoreConfig
173 else:
174 config.update(datastoreConfig)
176 return
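# For illustration only (not in the original source): given root="/repo" and
# two file-backed child configurations, the loop above derives per-child roots
# from the "{root}/{ClassName}_{index}" pattern, e.g. "/repo/FileDatastore_0"
# and "/repo/FileDatastore_1", before handing each subset config back to the
# corresponding child class's setConfigRoot().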
178 def __init__(
179 self,
180 config: Union[Config, str],
181 bridgeManager: DatastoreRegistryBridgeManager,
182 butlerRoot: str = None,
183 ):
184 super().__init__(config, bridgeManager)
186 # Scan for child datastores and instantiate them with the same registry
187 self.datastores = []
188 for c in self.config["datastores"]:
189 c = DatastoreConfig(c)
190 datastoreType = doImportType(c["cls"])
191 if not issubclass(datastoreType, Datastore): 191 ↛ 192 (line 191 didn't jump to line 192, because the condition on line 191 was never true)
192 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
193 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
194 log.debug("Creating child datastore %s", datastore.name)
195 self.datastores.append(datastore)
197 # Name ourself based on our children
198 if self.datastores: 198 ↛ 203 (line 198 didn't jump to line 203, because the condition on line 198 was never false)
199 # We must set the names explicitly
200 self._names = [d.name for d in self.datastores]
201 childNames = ",".join(self.names)
202 else:
203 childNames = "(empty@{})".format(time.time())
204 self._names = [childNames]
205 self.name = "{}[{}]".format(type(self).__qualname__, childNames)
207 # We declare we are ephemeral if all our child datastores declare
208 # they are ephemeral
209 isEphemeral = True
210 for d in self.datastores:
211 if not d.isEphemeral:
212 isEphemeral = False
213 break
214 self.isEphemeral = isEphemeral
216 # per-datastore override constraints
217 if "datastore_constraints" in self.config:
218 overrides = self.config["datastore_constraints"]
220 if len(overrides) != len(self.datastores): 220 ↛ 221 (line 220 didn't jump to line 221, because the condition on line 220 was never true)
221 raise DatastoreValidationError(
222 f"Number of registered datastores ({len(self.datastores)})"
223 " differs from number of constraints overrides"
224 f" {len(overrides)}"
225 )
227 self.datastoreConstraints = [
228 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
229 ]
231 else:
232 self.datastoreConstraints = (None,) * len(self.datastores)
234 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
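# Hedged sketch, not part of the original source: the optional
# "datastore_constraints" section handled above must contain exactly one entry
# per child datastore, and each entry may carry a "constraints" sub-section
# that is passed to Constraints. The accept/reject keys below are illustrative
# assumptions about the Constraints configuration format:
#
#   datastore_constraints:
#     - constraints:
#         reject:
#           - all
#     - constraints:
#         accept:
#           - all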
236 @property
237 def names(self) -> Tuple[str, ...]:
238 return tuple(self._names)
240 def __str__(self) -> str:
241 chainName = ", ".join(str(ds) for ds in self.datastores)
242 return chainName
244 def knows(self, ref: DatasetRef) -> bool:
245 """Check if the dataset is known to any of the datastores.
247 Does not check for existence of any artifact.
249 Parameters
250 ----------
251 ref : `DatasetRef`
252 Reference to the required dataset.
254 Returns
255 -------
256 exists : `bool`
257 `True` if the dataset is known to the datastore.
258 """
259 for datastore in self.datastores:
260 if datastore.knows(ref):
261 log.debug("%s known to datastore %s", ref, datastore.name)
262 return True
263 return False
265 def mexists(
266 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
267 ) -> Dict[DatasetRef, bool]:
268 """Check the existence of multiple datasets at once.
270 Parameters
271 ----------
272 refs : iterable of `DatasetRef`
273 The datasets to be checked.
274 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
275 Optional mapping of datastore artifact to existence. Updated by
276 this method with details of all artifacts tested. Can be `None`
277 if the caller is not interested.
279 Returns
280 -------
281 existence : `dict` [`DatasetRef`, `bool`]
282 Mapping from dataset to boolean indicating existence in any
283 of the child datastores.
284 """
285 dataset_existence: Dict[DatasetRef, bool] = {}
286 for datastore in self.datastores:
287 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
289 # For the next datastore there is no point asking about ones we
290 # already know exist. No special exemption for ephemeral datastores.
291 refs = [ref for ref, exists in dataset_existence.items() if not exists]
293 return dataset_existence
295 def exists(self, ref: DatasetRef) -> bool:
296 """Check if the dataset exists in one of the datastores.
298 Parameters
299 ----------
300 ref : `DatasetRef`
301 Reference to the required dataset.
303 Returns
304 -------
305 exists : `bool`
306 `True` if the entity exists in one of the child datastores.
307 """
308 for datastore in self.datastores:
309 if datastore.exists(ref):
310 log.debug("Found %s in datastore %s", ref, datastore.name)
311 return True
312 return False
314 def get(
315 self,
316 ref: DatasetRef,
317 parameters: Optional[Mapping[str, Any]] = None,
318 storageClass: Optional[Union[StorageClass, str]] = None,
319 ) -> Any:
320 """Load an InMemoryDataset from the store.
322 The dataset is returned from the first datastore that has
323 the dataset.
325 Parameters
326 ----------
327 ref : `DatasetRef`
328 Reference to the required Dataset.
329 parameters : `dict`
330 `StorageClass`-specific parameters that specify, for example,
331 a slice of the dataset to be loaded.
332 storageClass : `StorageClass` or `str`, optional
333 The storage class to be used to override the Python type
334 returned by this method. By default the returned type matches
335 the dataset type definition for this dataset. Specifying a
336 read `StorageClass` can force a different type to be returned.
337 This type must be compatible with the original type.
339 Returns
340 -------
341 inMemoryDataset : `object`
342 Requested dataset or slice thereof as an InMemoryDataset.
344 Raises
345 ------
346 FileNotFoundError
347 Requested dataset can not be retrieved.
348 TypeError
349 Return value from formatter has unexpected type.
350 ValueError
351 Formatter failed to process the dataset.
352 """
354 for datastore in self.datastores:
355 try:
356 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
357 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
358 return inMemoryObject
359 except FileNotFoundError:
360 pass
362 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))
364 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
365 """Write an InMemoryDataset with a given `DatasetRef` to each
366 datastore.
368 The put() to child datastores can fail with
369 `DatasetTypeNotSupportedError`. The put() for this datastore will be
370 deemed to have succeeded so long as at least one child datastore
371 accepted the inMemoryDataset.
373 Parameters
374 ----------
375 inMemoryDataset : `object`
376 The dataset to store.
377 ref : `DatasetRef`
378 Reference to the associated Dataset.
380 Raises
381 ------
382 TypeError
383 Supplied object and storage class are inconsistent.
384 DatasetTypeNotSupportedError
385 All datastores reported `DatasetTypeNotSupportedError`.
386 """
387 log.debug("Put %s", ref)
389 # Confirm that we can accept this dataset
390 if not self.constraints.isAcceptable(ref):
391 # Raise rather than use boolean return value.
392 raise DatasetTypeNotSupportedError(
393 f"Dataset {ref} has been rejected by this datastore via configuration."
394 )
396 isPermanent = False
397 nsuccess = 0
398 npermanent = 0
399 nephemeral = 0
400 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
401 if constraints is not None and not constraints.isAcceptable(ref):
402 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
403 continue
405 if datastore.isEphemeral:
406 nephemeral += 1
407 else:
408 npermanent += 1
409 try:
410 datastore.put(inMemoryDataset, ref)
411 nsuccess += 1
412 if not datastore.isEphemeral:
413 isPermanent = True
414 except DatasetTypeNotSupportedError:
415 pass
417 if nsuccess == 0:
418 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
420 if not isPermanent and npermanent > 0: 420 ↛ 421 (line 420 didn't jump to line 421, because the condition on line 420 was never true)
421 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
423 if self._transaction is not None:
424 self._transaction.registerUndo("put", self.remove, ref)
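# Hedged illustration (not in the original source): with a two-child chain
# like the configuration sketched near the top of this class, a single
#
#   datastore.put(inMemoryDataset, ref)
#
# attempts the write on every child whose constraints accept the ref, succeeds
# if at least one child accepts it, and raises DatasetTypeNotSupportedError
# only if every child rejects it.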
426 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
427 # Docstring inherited from base class.
428 if transfer != "auto":
429 return transfer
430 # Ask each datastore what they think auto means
431 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
433 # Remove any untranslated "auto" values
434 transfers.discard(transfer)
436 if len(transfers) == 1: 436 ↛ 437 (line 436 didn't jump to line 437, because the condition on line 436 was never true)
437 return transfers.pop()
438 if not transfers: 438 ↛ 442 (line 438 didn't jump to line 442, because the condition on line 438 was never false)
439 # Everything reported "auto"
440 return transfer
442 raise RuntimeError(
443 "Chained datastore does not yet support different transfer modes"
444 f" from 'auto' in each child datastore (wanted {transfers})"
445 )
447 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
448 # Docstring inherited from Datastore._prepIngest.
449 if transfer is None or transfer == "move":
450 raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")
452 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
453 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
454 if not acceptable:
455 log.debug(
456 "Datastore %s skipping ingest via configuration for refs %s",
457 name,
458 ", ".join(str(ref) for ref in dataset.refs),
459 )
460 return False
461 else:
462 return True
464 # Filter down to just datasets the chained datastore's own
465 # configuration accepts.
466 okForParent: List[FileDataset] = [
467 dataset
468 for dataset in datasets
469 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
470 ]
472 # Iterate over nested datastores and call _prepIngest on each.
473 # Save the results to a list:
474 children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
475 # ...and remember whether all of the failures are due to
476 # NotImplementedError being raised.
477 allFailuresAreNotImplementedError = True
478 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
479 okForChild: List[FileDataset]
480 if constraints is not None:
481 okForChild = [
482 dataset
483 for dataset in okForParent
484 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
485 ]
486 else:
487 okForChild = okForParent
488 try:
489 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
490 except NotImplementedError:
491 log.debug(
492 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
493 datastore.name,
494 transfer,
495 )
496 continue
497 allFailuresAreNotImplementedError = False
498 children.append((datastore, prepDataForChild))
499 if allFailuresAreNotImplementedError:
500 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
501 return _IngestPrepData(children=children)
503 def _finishIngest(
504 self,
505 prepData: _IngestPrepData,
506 *,
507 transfer: Optional[str] = None,
508 record_validation_info: bool = True,
509 ) -> None:
510 # Docstring inherited from Datastore._finishIngest.
511 for datastore, prepDataForChild in prepData.children:
512 datastore._finishIngest(
513 prepDataForChild, transfer=transfer, record_validation_info=record_validation_info
514 )
516 def getManyURIs(
517 self,
518 refs: Iterable[DatasetRef],
519 predict: bool = False,
520 allow_missing: bool = False,
521 ) -> Dict[DatasetRef, DatasetRefURIs]:
522 # Docstring inherited
524 uris: Dict[DatasetRef, DatasetRefURIs] = {}
525 missing_refs = set(refs)
527 # If predict is True we don't want to predict a dataset in the first
528 # datastore if it actually exists in a later datastore, so in that
529 # case check all datastores with predict=False first, and then try
530 # again with predict=True.
531 for p in (False, True) if predict else (False,):
532 if not missing_refs:
533 break
534 for datastore in self.datastores:
535 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
536 missing_refs -= got_uris.keys()
537 uris.update(got_uris)
538 if not missing_refs:
539 break
541 if missing_refs and not allow_missing:
542 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
544 return uris
546 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
547 """Return URIs associated with dataset.
549 Parameters
550 ----------
551 ref : `DatasetRef`
552 Reference to the required dataset.
553 predict : `bool`, optional
554 If the datastore does not know about the dataset, should it
555 return a predicted URI or not?
557 Returns
558 -------
559 uris : `DatasetRefURIs`
560 The URI to the primary artifact associated with this dataset (if
561 the dataset was disassembled within the datastore this may be
562 `None`), and the URIs to any components associated with the dataset
563 artifact (can be empty if there are no components).
565 Notes
566 -----
567 The returned URI is from the first datastore in the list that has
568 the dataset with preference given to the first dataset coming from
569 a permanent datastore. If no datastores have the dataset and prediction
570 is allowed, the predicted URI for the first datastore in the list will
571 be returned.
572 """
573 log.debug("Requesting URIs for %s", ref)
574 predictedUri: Optional[DatasetRefURIs] = None
575 predictedEphemeralUri: Optional[DatasetRefURIs] = None
576 firstEphemeralUri: Optional[DatasetRefURIs] = None
577 for datastore in self.datastores:
578 if datastore.exists(ref):
579 if not datastore.isEphemeral:
580 uri = datastore.getURIs(ref)
581 log.debug("Retrieved non-ephemeral URI: %s", uri)
582 return uri
583 elif not firstEphemeralUri:
584 firstEphemeralUri = datastore.getURIs(ref)
585 elif predict:
586 if not predictedUri and not datastore.isEphemeral:
587 predictedUri = datastore.getURIs(ref, predict)
588 elif not predictedEphemeralUri and datastore.isEphemeral:
589 predictedEphemeralUri = datastore.getURIs(ref, predict)
591 if firstEphemeralUri:
592 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
593 return firstEphemeralUri
595 if predictedUri:
596 log.debug("Retrieved predicted URI: %s", predictedUri)
597 return predictedUri
599 if predictedEphemeralUri:
600 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
601 return predictedEphemeralUri
603 raise FileNotFoundError("Dataset {} not in any datastore".format(ref))
605 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
606 """URI to the Dataset.
608 The returned URI is from the first datastore in the list that has
609 the dataset with preference given to the first dataset coming from
610 a permanent datastore. If no datastores have the dataset and prediction
611 is allowed, the predicted URI for the first datastore in the list will
612 be returned.
614 Parameters
615 ----------
616 ref : `DatasetRef`
617 Reference to the required Dataset.
618 predict : `bool`
619 If `True`, allow URIs to be returned of datasets that have not
620 been written.
622 Returns
623 -------
624 uri : `lsst.resources.ResourcePath`
625 URI pointing to the dataset within the datastore. If the
626 dataset does not exist in the datastore, and if ``predict`` is
627 `True`, the URI will be a prediction and will include a URI
628 fragment "#predicted".
630 Notes
631 -----
632 If the datastore does not have entities that relate well
633 to the concept of a URI the returned URI string will be
634 descriptive. The returned URI is not guaranteed to be obtainable.
636 Raises
637 ------
638 FileNotFoundError
639 A URI has been requested for a dataset that does not exist and
640 guessing is not allowed.
641 RuntimeError
642 Raised if a request is made for a single URI but multiple URIs
643 are associated with this dataset.
644 """
645 log.debug("Requesting URI for %s", ref)
646 primary, components = self.getURIs(ref, predict)
647 if primary is None or components: 647 ↛ 648 (line 647 didn't jump to line 648, because the condition on line 647 was never true)
648 raise RuntimeError(
649 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
650 )
651 return primary
653 def retrieveArtifacts(
654 self,
655 refs: Iterable[DatasetRef],
656 destination: ResourcePath,
657 transfer: str = "auto",
658 preserve_path: bool = True,
659 overwrite: bool = False,
660 ) -> List[ResourcePath]:
661 """Retrieve the file artifacts associated with the supplied refs.
663 Parameters
664 ----------
665 refs : iterable of `DatasetRef`
666 The datasets for which file artifacts are to be retrieved.
667 A single ref can result in multiple files. The refs must
668 be resolved.
669 destination : `lsst.resources.ResourcePath`
670 Location to write the file artifacts.
671 transfer : `str`, optional
672 Method to use to transfer the artifacts. Must be one of the options
673 supported by `lsst.resources.ResourcePath.transfer_from()`.
674 "move" is not allowed.
675 preserve_path : `bool`, optional
676 If `True` the full path of the file artifact within the datastore
677 is preserved. If `False` the final file component of the path
678 is used.
679 overwrite : `bool`, optional
680 If `True` allow transfers to overwrite existing files at the
681 destination.
683 Returns
684 -------
685 targets : `list` of `lsst.resources.ResourcePath`
686 URIs of file artifacts in destination location. Order is not
687 preserved.
688 """
689 if not destination.isdir(): 689 ↛ 690 (line 689 didn't jump to line 690, because the condition on line 689 was never true)
690 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
692 # Using getURIs is not feasible since it becomes difficult to
693 # determine the path within the datastore later on. For now
694 # follow getURIs implementation approach.
696 pending = set(refs)
698 # There is a question as to whether an exception should be raised
699 # early if some of the refs are missing, or whether files should be
700 # transferred until a problem is hit. Prefer to complain up front.
701 # Use the datastore integer as primary key.
702 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {}
704 for number, datastore in enumerate(self.datastores):
705 if datastore.isEphemeral:
706 # In the future we will want to distinguish in-memory from
707 # caching datastore since using an on-disk local
708 # cache is exactly what we should be doing.
709 continue
710 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
712 if datastore_refs:
713 grouped_by_datastore[number] = datastore_refs
715 # Remove these from the pending list so that we do not bother
716 # looking for them any more.
717 pending = pending - datastore_refs
719 if pending: 719 ↛ 720 (line 719 didn't jump to line 720, because the condition on line 719 was never true)
720 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
722 # Now do the transfer.
723 targets: List[ResourcePath] = []
724 for number, datastore_refs in grouped_by_datastore.items():
725 targets.extend(
726 self.datastores[number].retrieveArtifacts(
727 datastore_refs,
728 destination,
729 transfer=transfer,
730 preserve_path=preserve_path,
731 overwrite=overwrite,
732 )
733 )
735 return targets
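# Hedged usage sketch (not part of the original source); it assumes "refs" is
# an iterable of resolved DatasetRef objects and that lsst.resources.ResourcePath
# accepts the forceDirectory hint shown:
#
#   from lsst.resources import ResourcePath
#
#   destination = ResourcePath("/tmp/export/", forceDirectory=True)
#   paths = datastore.retrieveArtifacts(refs, destination, transfer="copy")
#
# This copies each artifact out of the first non-ephemeral child datastore
# that holds it, preserving the datastore-relative paths by default.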
737 def remove(self, ref: DatasetRef) -> None:
738 """Indicate to the datastore that a dataset can be removed.
740 The dataset will be removed from each datastore. The dataset is
741 not required to exist in every child datastore.
743 Parameters
744 ----------
745 ref : `DatasetRef`
746 Reference to the required dataset.
748 Raises
749 ------
750 FileNotFoundError
751 Attempt to remove a dataset that does not exist. Raised if none
752 of the child datastores removed the dataset.
753 """
754 log.debug("Removing %s", ref)
755 self.trash(ref, ignore_errors=False)
756 self.emptyTrash(ignore_errors=False)
758 def forget(self, refs: Iterable[DatasetRef]) -> None:
759 for datastore in tuple(self.datastores):
760 datastore.forget(refs)
762 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
763 if isinstance(ref, DatasetRef):
764 ref_label = str(ref)
765 else:
766 ref_label = "bulk datasets"
768 log.debug("Trashing %s", ref_label)
770 counter = 0
771 for datastore in self.datastores:
772 try:
773 datastore.trash(ref, ignore_errors=ignore_errors)
774 counter += 1
775 except FileNotFoundError:
776 pass
778 if counter == 0:
779 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
780 if ignore_errors: 780 ↛ 781 (line 780 didn't jump to line 781, because the condition on line 780 was never true)
781 log.warning(err_msg)
782 else:
783 raise FileNotFoundError(err_msg)
785 def emptyTrash(self, ignore_errors: bool = True) -> None:
786 for datastore in self.datastores:
787 datastore.emptyTrash(ignore_errors=ignore_errors)
789 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
790 """Retrieve a dataset from an input `Datastore`,
791 and store the result in this `Datastore`.
793 Parameters
794 ----------
795 inputDatastore : `Datastore`
796 The external `Datastore` from which to retrieve the Dataset.
797 ref : `DatasetRef`
798 Reference to the required dataset in the input data store.
805 """
806 assert inputDatastore is not self # unless we want it for renames?
807 inMemoryDataset = inputDatastore.get(ref)
808 self.put(inMemoryDataset, ref)
810 def validateConfiguration(
811 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
812 ) -> None:
813 """Validate some of the configuration for this datastore.
815 Parameters
816 ----------
817 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
818 Entities to test against this configuration. Can be differing
819 types.
820 logFailures : `bool`, optional
821 If `True`, output a log message for every validation error
822 detected.
824 Raises
825 ------
826 DatastoreValidationError
827 Raised if there is a validation problem with a configuration.
828 All the problems are reported in a single exception.
830 Notes
831 -----
832 This method checks each datastore in turn.
833 """
835 # Need to catch each of the datastore outputs and ensure that
836 # all are tested.
837 failures = []
838 for datastore in self.datastores:
839 try:
840 datastore.validateConfiguration(entities, logFailures=logFailures)
841 except DatastoreValidationError as e:
842 if logFailures: 842 ↛ 844 (line 842 didn't jump to line 844, because the condition on line 842 was never false)
843 log.critical("Datastore %s failed validation", datastore.name)
844 failures.append(f"Datastore {self.name}: {e}")
846 if failures:
847 msg = ";\n".join(failures)
848 raise DatastoreValidationError(msg)
850 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
851 # Docstring is inherited from base class
852 failures = []
853 for datastore in self.datastores:
854 try:
855 datastore.validateKey(lookupKey, entity)
856 except DatastoreValidationError as e:
857 failures.append(f"Datastore {self.name}: {e}")
859 if failures:
860 msg = ";\n".join(failures)
861 raise DatastoreValidationError(msg)
863 def getLookupKeys(self) -> Set[LookupKey]:
864 # Docstring is inherited from base class
865 keys = set()
866 for datastore in self.datastores:
867 keys.update(datastore.getLookupKeys())
869 keys.update(self.constraints.getLookupKeys())
870 for p in self.datastoreConstraints:
871 if p is not None: 871 ↛ 872 (line 871 didn't jump to line 872, because the condition on line 871 was never true)
872 keys.update(p.getLookupKeys())
874 return keys
876 def needs_expanded_data_ids(
877 self,
878 transfer: Optional[str],
879 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
880 ) -> bool:
881 # Docstring inherited.
882 # We can't safely use `self.datastoreConstraints` with `entity` to
883 # check whether a child datastore would even want to ingest this
884 # dataset, because we don't want to filter out datastores that might
885 need an expanded data ID based on incomplete information (e.g. we
886 # pass a StorageClass, but the constraint dispatches on DatasetType).
887 # So we pessimistically check if any datastore would need an expanded
888 # data ID for this transfer mode.
889 return any(datastore.needs_expanded_data_ids(transfer) for datastore in self.datastores) 889 ↛ exit (line 889 didn't finish the generator expression on line 889)
891 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
892 # Docstring inherited from the base class.
894 for datastore in self.datastores:
895 datastore.import_records(data)
897 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
898 # Docstring inherited from the base class.
900 all_records: Dict[str, DatastoreRecordData] = {}
902 # Merge all sub-datastore records into one structure
903 for datastore in self.datastores:
904 sub_records = datastore.export_records(refs)
905 for name, record_data in sub_records.items():
906 # All datastore names must be unique in a chain.
907 if name in all_records: 907 ↛ 908 (line 907 didn't jump to line 908, because the condition on line 907 was never true)
908 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
909 all_records[name] = record_data
911 return all_records