Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%
414 statements
coverage.py v7.2.5, created at 2023-05-17 09:32 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

"""Chained datastore."""

__all__ = ("ChainedDatastore",)

import itertools
import logging
import time
import warnings
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union

from lsst.daf.butler import (
    Constraints,
    DatasetRef,
    DatasetRefURIs,
    DatasetTypeNotSupportedError,
    Datastore,
    DatastoreConfig,
    DatastoreRecordData,
    DatastoreValidationError,
    FileDataset,
)
from lsst.resources import ResourcePath
from lsst.utils import doImportType

if TYPE_CHECKING:
    from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
    from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
    from lsst.resources import ResourcePathExpression

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for ChainedDatastore ingest implementation.

    Parameters
    ----------
    children : `list` of `tuple`
        Triples of `Datastore`, `IngestPrepData`, and the set of source
        `ResourcePath` objects, one triple per child datastore.
    """

    def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
        super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
        self.children = children


class ChainedDatastore(Datastore):
    """Chained datastores to allow reads and writes from multiple datastores.

    A ChainedDatastore is configured with multiple datastore configurations.
    A ``put()`` is sent to each datastore whose constraints accept the
    dataset. A ``get()`` operation is sent to each datastore in turn and the
    first datastore to return a valid dataset is used.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration. This configuration must include a ``datastores`` field
        as a sequence of datastore configurations. The order in this sequence
        indicates the order to use for read operations.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value. This
        root is sent to each child datastore.

    Notes
    -----
    ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
    mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
    and `"hardlink"` if and only if all its child datastores do.
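
    Examples
    --------
    A minimal configuration sketch. The child datastore classes and the
    exact nesting shown here are illustrative assumptions rather than a
    definitive layout; see ``datastores/chainedDatastore.yaml`` for the
    shipped defaults::

        datastore:
          cls: lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore
          datastores:
            - datastore:
                cls: lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore
            - datastore:
                cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
                root: <butlerRoot>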
    """

    defaultConfigFile = "datastores/chainedDatastore.yaml"
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    containerKey = "datastores"
    """Key to specify where child datastores are configured."""

    datastores: List[Datastore]
    """All the child datastores known to this datastore."""

    datastoreConstraints: Sequence[Optional[Constraints]]
    """Constraints to be applied to each of the child datastores."""

    @classmethod
    def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
        """Set any filesystem-dependent config options for child Datastores to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """

        # Extract the part of the config we care about updating
        datastoreConfig = DatastoreConfig(config, mergeDefaults=False)

        # And the subset of the full config that we can use for reference.
        # Do not bother with defaults because we are told this already has
        # them.
        fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)

        # Loop over each datastore config and pass the subsets to the
        # child datastores to process.

        containerKey = cls.containerKey
        for idx, (child, fullChild) in enumerate(
            zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey])
        ):
            childConfig = DatastoreConfig(child, mergeDefaults=False)
            fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
            datastoreClass = doImportType(fullChildConfig["cls"])
            if not issubclass(datastoreClass, Datastore):
                raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
            newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
            datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)

            # Reattach to parent
            datastoreConfig[containerKey, idx] = childConfig

        # Reattach modified datastore config to parent
        # If this has a datastore key we attach there, otherwise we assume
        # this information goes at the top of the config hierarchy.
        if DatastoreConfig.component in config:
            config[DatastoreConfig.component] = datastoreConfig
        else:
            config.update(datastoreConfig)

        return

    def __init__(
        self,
        config: Union[Config, str],
        bridgeManager: DatastoreRegistryBridgeManager,
        butlerRoot: str | None = None,
    ):
        super().__init__(config, bridgeManager)

        # Scan for child datastores and instantiate them with the same registry
        self.datastores = []
        for c in self.config["datastores"]:
            c = DatastoreConfig(c)
            datastoreType = doImportType(c["cls"])
            if not issubclass(datastoreType, Datastore):
                raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
            datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
            log.debug("Creating child datastore %s", datastore.name)
            self.datastores.append(datastore)

        # Name ourself based on our children
        if self.datastores:
            # We must set the names explicitly
            self._names = [d.name for d in self.datastores]
            childNames = ",".join(self.names)
        else:
            childNames = "(empty@{})".format(time.time())
            self._names = [childNames]
        self.name = "{}[{}]".format(type(self).__qualname__, childNames)

        # We declare we are ephemeral if all our child datastores declare
        # they are ephemeral
        isEphemeral = True
        for d in self.datastores:
            if not d.isEphemeral:
                isEphemeral = False
                break
        self.isEphemeral = isEphemeral

        # per-datastore override constraints
        if "datastore_constraints" in self.config:
            overrides = self.config["datastore_constraints"]

            if len(overrides) != len(self.datastores):
                raise DatastoreValidationError(
                    f"Number of registered datastores ({len(self.datastores)})"
                    " differs from number of constraints overrides"
                    f" {len(overrides)}"
                )

            self.datastoreConstraints = [
                Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
            ]

        else:
            self.datastoreConstraints = (None,) * len(self.datastores)

        log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))

    @property
    def names(self) -> Tuple[str, ...]:
        return tuple(self._names)

    def __str__(self) -> str:
        chainName = ", ".join(str(ds) for ds in self.datastores)
        return chainName

    def knows(self, ref: DatasetRef) -> bool:
        """Check if the dataset is known to any of the datastores.

        Does not check for existence of any artifact.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the dataset is known to the datastore.
        """
        for datastore in self.datastores:
            if datastore.knows(ref):
                log.debug("%s known to datastore %s", ref, datastore.name)
                return True
        return False

    def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
        # Docstring inherited from the base class.
        refs_known: dict[DatasetRef, bool] = {}
        for datastore in self.datastores:
            refs_known.update(datastore.knows_these(refs))

            # No need to check in next datastore for refs that are known.
            # We only update entries that were initially False.
            refs = [ref for ref, known in refs_known.items() if not known]

        return refs_known

    def mexists(
        self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
    ) -> Dict[DatasetRef, bool]:
        """Check the existence of multiple datasets at once.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets to be checked.
        artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
            Optional mapping of datastore artifact to existence. Updated by
            this method with details of all artifacts tested. Can be `None`
            if the caller is not interested.

        Returns
        -------
        existence : `dict` of [`DatasetRef`, `bool`]
            Mapping from dataset to boolean indicating existence in any
            of the child datastores.
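
        Examples
        --------
        Illustrative sketch only; ``datastore`` is assumed to be a configured
        `ChainedDatastore` and ``refs`` resolved `DatasetRef` instances:

        >>> existence = datastore.mexists(refs)
        >>> missing = [ref for ref, exists in existence.items() if not exists]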
        """
        dataset_existence: Dict[DatasetRef, bool] = {}
        for datastore in self.datastores:
            dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))

            # For next datastore no point asking about ones we know
            # exist already. No special exemption for ephemeral datastores.
            refs = [ref for ref, exists in dataset_existence.items() if not exists]

        return dataset_existence

    def exists(self, ref: DatasetRef) -> bool:
        """Check if the dataset exists in one of the datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in one of the child datastores.
        """
        for datastore in self.datastores:
            if datastore.exists(ref):
                log.debug("Found %s in datastore %s", ref, datastore.name)
                return True
        return False

    def get(
        self,
        ref: DatasetRef,
        parameters: Optional[Mapping[str, Any]] = None,
        storageClass: Optional[Union[StorageClass, str]] = None,
    ) -> Any:
        """Load an InMemoryDataset from the store.

        The dataset is returned from the first datastore that has
        the dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
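
        Examples
        --------
        Illustrative sketch only; ``datastore`` and ``ref`` are assumed to
        exist, and the ``bbox`` parameter is hypothetical for the storage
        class in question:

        >>> inMemoryDataset = datastore.get(ref)
        >>> subset = datastore.get(ref, parameters={"bbox": bbox})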
        """

        for datastore in self.datastores:
            try:
                inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
                log.debug("Found dataset %s in datastore %s", ref, datastore.name)
                return inMemoryObject
            except FileNotFoundError:
                pass

        raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))

    def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
        """Write an InMemoryDataset with a given `DatasetRef` to each
        datastore.

        The put() to child datastores can fail with
        `DatasetTypeNotSupportedError`. The put() for this datastore will be
        deemed to have succeeded so long as at least one child datastore
        accepted the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            All datastores reported `DatasetTypeNotSupportedError`.
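
        Examples
        --------
        Illustrative sketch only; the dataset is written to every child
        datastore whose constraints accept it:

        >>> datastore.put(inMemoryDataset, ref)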
        """
        log.debug("Put %s", ref)

        # Confirm that we can accept this dataset
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(
                f"Dataset {ref} has been rejected by this datastore via configuration."
            )

        isPermanent = False
        nsuccess = 0
        npermanent = 0
        nephemeral = 0
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if (
                constraints is not None and not constraints.isAcceptable(ref)
            ) or not datastore.constraints.isAcceptable(ref):
                log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
                continue

            if datastore.isEphemeral:
                nephemeral += 1
            else:
                npermanent += 1
            try:
                datastore.put(inMemoryDataset, ref)
                nsuccess += 1
                if not datastore.isEphemeral:
                    isPermanent = True
            except DatasetTypeNotSupportedError:
                pass

        if nsuccess == 0:
            raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")

        if not isPermanent and npermanent > 0:
            warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

        if self._transaction is not None:
            self._transaction.registerUndo("put", self.remove, ref)

    def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
        # Docstring inherited from base class.
        if transfer != "auto":
            return transfer
        # Ask each datastore what they think auto means
        transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}

        # Remove any untranslated "auto" values
        transfers.discard(transfer)

        if len(transfers) == 1:
            return transfers.pop()
        if not transfers:
            # Everything reported "auto"
            return transfer

        raise RuntimeError(
            "Chained datastore does not yet support different transfer modes"
            f" from 'auto' in each child datastore (wanted {transfers})"
        )

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        if transfer is None:
            raise NotImplementedError("ChainedDatastore does not support transfer=None.")

        def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
            acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
            if not acceptable:
                log.debug(
                    "Datastore %s skipping ingest via configuration for refs %s",
                    name,
                    ", ".join(str(ref) for ref in dataset.refs),
                )
                return False
            else:
                return True

        # Filter down to just datasets the chained datastore's own
        # configuration accepts.
        okForParent: List[FileDataset] = [
            dataset
            for dataset in datasets
            if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
        ]

        # Iterate over nested datastores and call _prepIngest on each.
        # Save the results to a list:
        children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
        # ...and remember whether all of the failures are due to
        # NotImplementedError being raised.
        allFailuresAreNotImplementedError = True
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            okForChild: List[FileDataset]
            if constraints is not None:
                okForChild = [
                    dataset
                    for dataset in okForParent
                    if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
                ]
            else:
                okForChild = okForParent
            try:
                prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
            except NotImplementedError:
                log.debug(
                    "Skipping ingest for datastore %s because transfer mode %s is not supported.",
                    datastore.name,
                    transfer,
                )
                continue
            allFailuresAreNotImplementedError = False
            if okForChild:
                # Do not store for later if a datastore has rejected
                # everything.
                # Include the source paths if this is a "move". It's clearer
                # to find the paths now rather than try to infer how
                # each datastore has stored them in the internal prep class.
                paths = (
                    {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set()
                )
                children.append((datastore, prepDataForChild, paths))
        if allFailuresAreNotImplementedError:
            raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
        return _IngestPrepData(children=children)

    def _finishIngest(
        self,
        prepData: _IngestPrepData,
        *,
        transfer: Optional[str] = None,
        record_validation_info: bool = True,
    ) -> None:
        # Docstring inherited from Datastore._finishIngest.
        # For "move" we must use "copy" and then delete the input
        # data at the end. This has no rollback option if the ingest
        # subsequently fails. If there is only one active datastore
        # accepting any files we can leave it as "move"
        actual_transfer: str | None
        if transfer == "move" and len(prepData.children) > 1:
            actual_transfer = "copy"
        else:
            actual_transfer = transfer
        to_be_deleted: set[ResourcePath] = set()
        for datastore, prepDataForChild, paths in prepData.children:
            datastore._finishIngest(
                prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
            )
            to_be_deleted.update(paths)
        if actual_transfer != transfer:
            # These datasets were copied but now need to be deleted.
            # This can not be rolled back.
            for uri in to_be_deleted:
                uri.remove()

    def getManyURIs(
        self,
        refs: Iterable[DatasetRef],
        predict: bool = False,
        allow_missing: bool = False,
    ) -> Dict[DatasetRef, DatasetRefURIs]:
        # Docstring inherited

        uris: Dict[DatasetRef, DatasetRefURIs] = {}
        missing_refs = set(refs)

        # If predict is True we don't want to predict a dataset in the first
        # datastore if it actually exists in a later datastore, so in that
        # case check all datastores with predict=False first, and then try
        # again with predict=True.
        for p in (False, True) if predict else (False,):
            if not missing_refs:
                break
            for datastore in self.datastores:
                try:
                    got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
                except NotImplementedError:
                    # some datastores may not implement generating URIs
                    continue
                missing_refs -= got_uris.keys()
                uris.update(got_uris)
                if not missing_refs:
                    break

        if missing_refs and not allow_missing:
            raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")

        return uris

    def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
        """Return URIs associated with dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.
        predict : `bool`, optional
            If the datastore does not know about the dataset, should it
            return a predicted URI or not?

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the
            dataset artifact (can be empty if there are no components).

        Notes
        -----
        The returned URI is from the first datastore in the list that has
        the dataset with preference given to the first dataset coming from
        a permanent datastore. If no datastores have the dataset and prediction
        is allowed, the predicted URI for the first datastore in the list will
        be returned.
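
        Examples
        --------
        Illustrative sketch only; `DatasetRefURIs` unpacks like a two-item
        tuple of the primary URI and the component URIs:

        >>> primary, components = datastore.getURIs(ref)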
        """
        log.debug("Requesting URIs for %s", ref)
        predictedUri: Optional[DatasetRefURIs] = None
        predictedEphemeralUri: Optional[DatasetRefURIs] = None
        firstEphemeralUri: Optional[DatasetRefURIs] = None
        for datastore in self.datastores:
            if datastore.exists(ref):
                if not datastore.isEphemeral:
                    uri = datastore.getURIs(ref)
                    log.debug("Retrieved non-ephemeral URI: %s", uri)
                    return uri
                elif not firstEphemeralUri:
                    firstEphemeralUri = datastore.getURIs(ref)
            elif predict:
                if not predictedUri and not datastore.isEphemeral:
                    predictedUri = datastore.getURIs(ref, predict)
                elif not predictedEphemeralUri and datastore.isEphemeral:
                    predictedEphemeralUri = datastore.getURIs(ref, predict)

        if firstEphemeralUri:
            log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
            return firstEphemeralUri

        if predictedUri:
            log.debug("Retrieved predicted URI: %s", predictedUri)
            return predictedUri

        if predictedEphemeralUri:
            log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
            return predictedEphemeralUri

        raise FileNotFoundError("Dataset {} not in any datastore".format(ref))

    def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
        """URI to the Dataset.

        The returned URI is from the first datastore in the list that has
        the dataset with preference given to the first dataset coming from
        a permanent datastore. If no datastores have the dataset and prediction
        is allowed, the predicted URI for the first datastore in the list will
        be returned.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".

        Notes
        -----
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        RuntimeError
            Raised if a request is made for a single URI but multiple URIs
            are associated with this dataset.
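
        Examples
        --------
        Illustrative sketch only; as noted above, a predicted URI carries a
        ``#predicted`` fragment:

        >>> uri = datastore.getURI(ref, predict=True)  # URI ends in "#predicted"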
        """
        log.debug("Requesting URI for %s", ref)
        primary, components = self.getURIs(ref, predict)
        if primary is None or components:
            raise RuntimeError(
                f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
            )
        return primary

    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
        destination: ResourcePath,
        transfer: str = "auto",
        preserve_path: bool = True,
        overwrite: bool = False,
    ) -> List[ResourcePath]:
        """Retrieve the file artifacts associated with the supplied refs.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets for which file artifacts are to be retrieved.
            A single ref can result in multiple files. The refs must
            be resolved.
        destination : `lsst.resources.ResourcePath`
            Location to write the file artifacts.
        transfer : `str`, optional
            Method to use to transfer the artifacts. Must be one of the options
            supported by `lsst.resources.ResourcePath.transfer_from()`.
            "move" is not allowed.
        preserve_path : `bool`, optional
            If `True` the full path of the file artifact within the datastore
            is preserved. If `False` the final file component of the path
            is used.
        overwrite : `bool`, optional
            If `True` allow transfers to overwrite existing files at the
            destination.

        Returns
        -------
        targets : `list` of `lsst.resources.ResourcePath`
            URIs of file artifacts in destination location. Order is not
            preserved.
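
        Examples
        --------
        Illustrative sketch only; the destination directory is hypothetical:

        >>> from lsst.resources import ResourcePath
        >>> dest = ResourcePath("retrieved/", forceDirectory=True)
        >>> targets = datastore.retrieveArtifacts(refs, dest, transfer="copy")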
        """
        if not destination.isdir():
            raise ValueError(f"Destination location must refer to a directory. Given {destination}")

        # Using getURIs is not feasible since it becomes difficult to
        # determine the path within the datastore later on. For now
        # follow getURIs implementation approach.

        pending = set(refs)

        # There is a question as to whether an exception should be raised
        # early if some of the refs are missing, or whether files should be
        # transferred until a problem is hit. Prefer to complain up front.
        # Use the datastore integer as primary key.
        grouped_by_datastore: Dict[int, Set[DatasetRef]] = {}

        for number, datastore in enumerate(self.datastores):
            if datastore.isEphemeral:
                # In the future we will want to distinguish in-memory from
                # caching datastore since using an on-disk local
                # cache is exactly what we should be doing.
                continue
            try:
                datastore_refs = {ref for ref in pending if datastore.exists(ref)}
            except NotImplementedError:
                # Some datastores may not support retrieving artifacts
                continue

            if datastore_refs:
                grouped_by_datastore[number] = datastore_refs

                # Remove these from the pending list so that we do not bother
                # looking for them any more.
                pending = pending - datastore_refs

        if pending:
            raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")

        # Now do the transfer.
        targets: List[ResourcePath] = []
        for number, datastore_refs in grouped_by_datastore.items():
            targets.extend(
                self.datastores[number].retrieveArtifacts(
                    datastore_refs,
                    destination,
                    transfer=transfer,
                    preserve_path=preserve_path,
                    overwrite=overwrite,
                )
            )

        return targets

    def remove(self, ref: DatasetRef) -> None:
        """Indicate to the datastore that a dataset can be removed.

        The dataset will be removed from each datastore. The dataset is
        not required to exist in every child datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Raised if none
            of the child datastores removed the dataset.
        """
        log.debug("Removing %s", ref)
        self.trash(ref, ignore_errors=False)
        self.emptyTrash(ignore_errors=False)

    def forget(self, refs: Iterable[DatasetRef]) -> None:
        for datastore in tuple(self.datastores):
            datastore.forget(refs)

    def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
        if isinstance(ref, DatasetRef):
            ref_label = str(ref)
        else:
            ref_label = "bulk datasets"

        log.debug("Trashing %s", ref_label)

        counter = 0
        for datastore in self.datastores:
            try:
                datastore.trash(ref, ignore_errors=ignore_errors)
                counter += 1
            except FileNotFoundError:
                pass

        if counter == 0:
            err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
            if ignore_errors:
                log.warning(err_msg)
            else:
                raise FileNotFoundError(err_msg)

    def emptyTrash(self, ignore_errors: bool = True) -> None:
        for datastore in self.datastores:
            datastore.emptyTrash(ignore_errors=ignore_errors)

    def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
        """Retrieve a dataset from an input `Datastore`,
        and store the result in this `Datastore`.

        The dataset is fetched from the input datastore and then written to
        this datastore with `put`; nothing is returned.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        ref : `DatasetRef`
            Reference to the required dataset in the input data store.
        """
        assert inputDatastore is not self  # unless we want it for renames?
        inMemoryDataset = inputDatastore.get(ref)
        self.put(inMemoryDataset, ref)

    def validateConfiguration(
        self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
    ) -> None:
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks each datastore in turn.
        """

        # Need to catch each of the datastore outputs and ensure that
        # all are tested.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateConfiguration(entities, logFailures=logFailures)
            except DatastoreValidationError as e:
                if logFailures:
                    log.critical("Datastore %s failed validation", datastore.name)
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
        # Docstring is inherited from base class
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateKey(lookupKey, entity)
            except DatastoreValidationError as e:
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self) -> Set[LookupKey]:
        # Docstring is inherited from base class
        keys = set()
        for datastore in self.datastores:
            keys.update(datastore.getLookupKeys())

        keys.update(self.constraints.getLookupKeys())
        for p in self.datastoreConstraints:
            if p is not None:
                keys.update(p.getLookupKeys())

        return keys

    def needs_expanded_data_ids(
        self,
        transfer: Optional[str],
        entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
    ) -> bool:
        # Docstring inherited.
        # We can't safely use `self.datastoreConstraints` with `entity` to
        # check whether a child datastore would even want to ingest this
        # dataset, because we don't want to filter out datastores that might
        # need an expanded data ID based on incomplete information (e.g. we
        # pass a StorageClass, but the constraint dispatches on DatasetType).
        # So we pessimistically check if any datastore would need an expanded
        # data ID for this transfer mode.
        return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores)

    def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
        # Docstring inherited from the base class.

        for datastore in self.datastores:
            datastore.import_records(data)

    def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
        # Docstring inherited from the base class.

        all_records: Dict[str, DatastoreRecordData] = {}

        # Merge all sub-datastore records into one structure
        for datastore in self.datastores:
            sub_records = datastore.export_records(refs)
            for name, record_data in sub_records.items():
                # All datastore names must be unique in a chain.
                if name in all_records:
                    raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
                all_records[name] = record_data

        return all_records

    def export(
        self,
        refs: Iterable[DatasetRef],
        *,
        directory: Optional[ResourcePathExpression] = None,
        transfer: Optional[str] = "auto",
    ) -> Iterable[FileDataset]:
        # Docstring inherited from Datastore.export.
        if transfer == "auto" and directory is None:
            transfer = None

        if transfer is not None and directory is None:
            raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")

        if transfer == "move":
            raise TypeError("Can not export by moving files out of datastore.")

        # Exporting from a chain has the potential for a dataset to be
        # in one or more of the datastores in the chain. We only need one
        # of them since we assume the datasets are the same in all (but
        # the file format could be different of course since that is a
        # per-datastore configuration).
        # We also do not know whether any of the datastores in the chain
        # support file export.

        # Ensure we have an ordered sequence that is not an iterator or set.
        if not isinstance(refs, Sequence):
            refs = list(refs)

        # If any of the datasets are missing entirely we need to raise early
        # before we try to run the export. This can be a little messy but is
        # better than exporting files from the first datastore and only then
        # discovering that a missing dataset is not in any of the other
        # datastores either.
        known = [datastore.knows_these(refs) for datastore in self.datastores]
        refs_known: set[DatasetRef] = set()
        for known_to_this in known:
            refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
        missing_count = len(refs) - len(refs_known)
        if missing_count:
            raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")

        # To allow us to slot each result into the right place after
        # asking each datastore, create a dict with the index.
        ref_positions = {ref: i for i, ref in enumerate(refs)}

        # Presize the final export list.
        exported: list[FileDataset | None] = [None] * len(refs)

        # The order of the returned datasets has to match the order of the
        # given refs, even if they are all from different datastores.
        for i, datastore in enumerate(self.datastores):
            known_to_this = known[i]
            filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]

            try:
                this_export = datastore.export(filtered, directory=directory, transfer=transfer)
            except NotImplementedError:
                # Try the next datastore.
                continue

            for ref, export in zip(filtered, this_export):
                # Get the position and also delete it from the list.
                exported[ref_positions.pop(ref)] = export

        # Every dataset should be accounted for because of the earlier checks
        # but make sure that we did fill all the slots to appease mypy.
        for i, dataset in enumerate(exported):
            if dataset is None:
                raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
            yield dataset

    def transfer_from(
        self,
        source_datastore: Datastore,
        refs: Iterable[DatasetRef],
        transfer: str = "auto",
        artifact_existence: Optional[Dict[ResourcePath, bool]] = None,
    ) -> tuple[set[DatasetRef], set[DatasetRef]]:
        # Docstring inherited
        # mypy does not understand "type(self) is not type(source)"
        if isinstance(source_datastore, ChainedDatastore):
            # Both the source and destination are chained datastores.
            source_datastores = tuple(source_datastore.datastores)
        else:
            # The source datastore is different, forward everything to the
            # child datastores.
            source_datastores = tuple([source_datastore])

        # Need to know the set of all possible refs that could be transferred.
        remaining_refs = set(refs)

        missing_from_source: set[DatasetRef] | None = None
        all_accepted = set()
        nsuccess = 0
        for source_child in source_datastores:
            # If we are reading from a chained datastore, it's possible that
            # only a subset of the datastores know about the dataset. We can't
            # ask the receiving datastore to copy it when it doesn't exist
            # so we have to filter again based on what the source datastore
            # understands.
            known_to_source = source_child.knows_these([ref for ref in refs])

            # Need to know that there is a possibility that some of these
            # datasets exist but are unknown to the source datastore if
            # trust is enabled.
            if getattr(source_child, "trustGetRequest", False):
                unknown = [ref for ref, known in known_to_source.items() if not known]
                existence = source_child.mexists(unknown, artifact_existence)
                for ref, exists in existence.items():
                    known_to_source[ref] = exists

            missing = {ref for ref, known in known_to_source.items() if not known}
            if missing:
                if missing_from_source is None:
                    missing_from_source = missing
                else:
                    missing_from_source &= missing

            # Try to transfer from each source datastore to each child
            # datastore. Have to make sure we don't transfer something
            # we've already transferred to this destination on later passes.

            # Filter the initial list based on the datasets we have
            # not yet transferred.
            these_refs = []
            for ref in refs:
                if ref in remaining_refs and known_to_source[ref]:
                    these_refs.append(ref)

            if not these_refs:
                # Already transferred all datasets known to this datastore.
                continue

            for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
                if constraints is not None:
                    filtered_refs = []
                    for ref in these_refs:
                        if constraints.isAcceptable(ref):
                            filtered_refs.append(ref)
                        else:
                            log.debug("Rejecting ref by constraints: %s", ref)
                else:
                    filtered_refs = [ref for ref in these_refs]
                try:
                    accepted, _ = datastore.transfer_from(
                        source_child, filtered_refs, transfer, artifact_existence
                    )
                except (TypeError, NotImplementedError):
                    # The datastores were incompatible.
                    continue
                else:
                    nsuccess += 1

                # Remove the accepted datasets from those remaining.
                remaining_refs = remaining_refs - accepted

                # Keep track of everything we have accepted.
                all_accepted.update(accepted)

        if missing_from_source:
            for ref in missing_from_source:
                log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref)

        if nsuccess == 0:
            raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}")

        return all_accepted, remaining_refs