Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%
367 statements
coverage.py v6.5.0, created at 2023-02-14 02:05 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""Chained datastore."""
26__all__ = ("ChainedDatastore",)
28import itertools
29import logging
30import time
31import warnings
32from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union
34from lsst.daf.butler import (
35 Constraints,
36 DatasetRef,
37 DatasetRefURIs,
38 DatasetTypeNotSupportedError,
39 Datastore,
40 DatastoreConfig,
41 DatastoreRecordData,
42 DatastoreValidationError,
43 FileDataset,
44)
45from lsst.resources import ResourcePath
46from lsst.utils import doImportType
48if TYPE_CHECKING:
49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
51 from lsst.resources import ResourcePathExpression
53log = logging.getLogger(__name__)
56class _IngestPrepData(Datastore.IngestPrepData):
57 """Helper class for ChainedDatastore ingest implementation.
59 Parameters
60 ----------
61 children : `list` of `tuple`
 62 Triples of `Datastore`, `Datastore.IngestPrepData`, and the set of
 source `ResourcePath` objects, one triple per child datastore.
63 """
65 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
66 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
67 self.children = children
70class ChainedDatastore(Datastore):
71 """Chained Datastores to allow read and writes from multiple datastores.
73 A ChainedDatastore is configured with multiple datastore configurations.
 74 A ``put()`` is sent to each datastore, subject to any per-datastore
 75 constraints. A ``get()`` operation is sent to each datastore in turn
 76 and the first datastore to return a valid dataset is used.
78 Parameters
79 ----------
80 config : `DatastoreConfig` or `str`
81 Configuration. This configuration must include a ``datastores`` field
82 as a sequence of datastore configurations. The order in this sequence
83 indicates the order to use for read operations.
84 bridgeManager : `DatastoreRegistryBridgeManager`
85 Object that manages the interface between `Registry` and datastores.
86 butlerRoot : `str`, optional
87 New datastore root to use to override the configuration value. This
88 root is sent to each child datastore.
90 Notes
91 -----
 92 ChainedDatastore never supports `None` as an `ingest` transfer mode; a
 93 `"move"` ingest is performed as a copy into each accepting child followed
 94 by deletion of the source files. It supports `"copy"`, `"symlink"`,
 `"relsymlink"` and `"hardlink"` so long as at least one child datastore does.
95 """
97 defaultConfigFile = "datastores/chainedDatastore.yaml"
98 """Path to configuration defaults. Accessed within the ``configs`` resource
99 or relative to a search path. Can be None if no defaults specified.
100 """
102 containerKey = "datastores"
103 """Key to specify where child datastores are configured."""
105 datastores: List[Datastore]
106 """All the child datastores known to this datastore."""
108 datastoreConstraints: Sequence[Optional[Constraints]]
109 """Constraints to be applied to each of the child datastores."""
111 @classmethod
112 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
113 """Set any filesystem-dependent config options for child Datastores to
114 be appropriate for a new empty repository with the given root.
116 Parameters
117 ----------
118 root : `str`
119 Filesystem path to the root of the data repository.
120 config : `Config`
121 A `Config` to update. Only the subset understood by
122 this component will be updated. Will not expand
123 defaults.
124 full : `Config`
125 A complete config with all defaults expanded that can be
126 converted to a `DatastoreConfig`. Read-only and will not be
127 modified by this method.
128 Repository-specific options that should not be obtained
129 from defaults when Butler instances are constructed
130 should be copied from ``full`` to ``config``.
131 overwrite : `bool`, optional
132 If `False`, do not modify a value in ``config`` if the value
133 already exists. Default is always to overwrite with the provided
134 ``root``.
136 Notes
137 -----
138 If a keyword is explicitly defined in the supplied ``config`` it
139 will not be overridden by this method if ``overwrite`` is `False`.
140 This allows explicit values set in external configs to be retained.
141 """
143 # Extract the part of the config we care about updating
144 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
146 # And the subset of the full config that we can use for reference.
147 # Do not bother with defaults because we are told this already has
148 # them.
149 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
151 # Loop over each datastore config and pass the subsets to the
152 # child datastores to process.
154 containerKey = cls.containerKey
155 for idx, (child, fullChild) in enumerate(
156 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey])
157 ):
158 childConfig = DatastoreConfig(child, mergeDefaults=False)
159 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
160 datastoreClass = doImportType(fullChildConfig["cls"])
161 if not issubclass(datastoreClass, Datastore):
162 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
163 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
164 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
166 # Reattach to parent
167 datastoreConfig[containerKey, idx] = childConfig
169 # Reattach modified datastore config to parent
170 # If this has a datastore key we attach there, otherwise we assume
171 # this information goes at the top of the config hierarchy.
172 if DatastoreConfig.component in config:
173 config[DatastoreConfig.component] = datastoreConfig
174 else:
175 config.update(datastoreConfig)
177 return
179 def __init__(
180 self,
181 config: Union[Config, str],
182 bridgeManager: DatastoreRegistryBridgeManager,
183 butlerRoot: str | None = None,
184 ):
185 super().__init__(config, bridgeManager)
187 # Scan for child datastores and instantiate them with the same registry
188 self.datastores = []
189 for c in self.config["datastores"]:
190 c = DatastoreConfig(c)
191 datastoreType = doImportType(c["cls"])
192 if not issubclass(datastoreType, Datastore):
193 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
194 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
195 log.debug("Creating child datastore %s", datastore.name)
196 self.datastores.append(datastore)
198 # Name ourself based on our children
199 if self.datastores:
200 # We must set the names explicitly
201 self._names = [d.name for d in self.datastores]
202 childNames = ",".join(self.names)
203 else:
204 childNames = "(empty@{})".format(time.time())
205 self._names = [childNames]
206 self.name = "{}[{}]".format(type(self).__qualname__, childNames)
208 # We declare we are ephemeral if all our child datastores declare
209 # they are ephemeral
210 isEphemeral = True
211 for d in self.datastores:
212 if not d.isEphemeral:
213 isEphemeral = False
214 break
215 self.isEphemeral = isEphemeral
217 # per-datastore override constraints
218 if "datastore_constraints" in self.config:
219 overrides = self.config["datastore_constraints"]
221 if len(overrides) != len(self.datastores):
222 raise DatastoreValidationError(
223 f"Number of registered datastores ({len(self.datastores)})"
224 " differs from number of constraints overrides"
225 f" {len(overrides)}"
226 )
228 self.datastoreConstraints = [
229 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
230 ]
232 else:
233 self.datastoreConstraints = (None,) * len(self.datastores)
235 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
237 @property
238 def names(self) -> Tuple[str, ...]:
239 return tuple(self._names)
241 def __str__(self) -> str:
242 chainName = ", ".join(str(ds) for ds in self.datastores)
243 return chainName
245 def knows(self, ref: DatasetRef) -> bool:
246 """Check if the dataset is known to any of the datastores.
248 Does not check for existence of any artifact.
250 Parameters
251 ----------
252 ref : `DatasetRef`
253 Reference to the required dataset.
255 Returns
256 -------
257 exists : `bool`
258 `True` if the dataset is known to the datastore.
259 """
260 for datastore in self.datastores:
261 if datastore.knows(ref):
262 log.debug("%s known to datastore %s", ref, datastore.name)
263 return True
264 return False
266 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
267 # Docstring inherited from the base class.
268 refs_known: dict[DatasetRef, bool] = {}
269 for datastore in self.datastores:
270 refs_known.update(datastore.knows_these(refs))
272 # No need to check in next datastore for refs that are known.
273 # We only update entries that were initially False.
274 refs = [ref for ref, known in refs_known.items() if not known]
276 return refs_known
278 def mexists(
279 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
280 ) -> Dict[DatasetRef, bool]:
281 """Check the existence of multiple datasets at once.
283 Parameters
284 ----------
285 refs : iterable of `DatasetRef`
286 The datasets to be checked.
287 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
288 Optional mapping of datastore artifact to existence. Updated by
289 this method with details of all artifacts tested. Can be `None`
290 if the caller is not interested.
292 Returns
293 -------
294 existence : `dict` of [`DatasetRef`, `bool`]
295 Mapping from dataset to boolean indicating existence in any
296 of the child datastores.
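
 Examples
 --------
 An illustrative sketch only; ``refs`` is assumed to be an iterable of
 resolved `DatasetRef` instances obtained from registry:

 >>> existence = datastore.mexists(refs)
 >>> missing = [ref for ref, exists in existence.items() if not exists]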
297 """
298 dataset_existence: Dict[DatasetRef, bool] = {}
299 for datastore in self.datastores:
300 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
302 # For next datastore no point asking about ones we know
303 # exist already. No special exemption for ephemeral datastores.
304 refs = [ref for ref, exists in dataset_existence.items() if not exists]
306 return dataset_existence
308 def exists(self, ref: DatasetRef) -> bool:
309 """Check if the dataset exists in one of the datastores.
311 Parameters
312 ----------
313 ref : `DatasetRef`
314 Reference to the required dataset.
316 Returns
317 -------
318 exists : `bool`
319 `True` if the entity exists in one of the child datastores.
320 """
321 for datastore in self.datastores:
322 if datastore.exists(ref):
323 log.debug("Found %s in datastore %s", ref, datastore.name)
324 return True
325 return False
327 def get(
328 self,
329 ref: DatasetRef,
330 parameters: Optional[Mapping[str, Any]] = None,
331 storageClass: Optional[Union[StorageClass, str]] = None,
332 ) -> Any:
333 """Load an InMemoryDataset from the store.
335 The dataset is returned from the first datastore that has
336 the dataset.
338 Parameters
339 ----------
340 ref : `DatasetRef`
341 Reference to the required Dataset.
342 parameters : `dict`
343 `StorageClass`-specific parameters that specify, for example,
344 a slice of the dataset to be loaded.
345 storageClass : `StorageClass` or `str`, optional
346 The storage class to be used to override the Python type
347 returned by this method. By default the returned type matches
348 the dataset type definition for this dataset. Specifying a
349 read `StorageClass` can force a different type to be returned.
350 This type must be compatible with the original type.
352 Returns
353 -------
354 inMemoryDataset : `object`
355 Requested dataset or slice thereof as an InMemoryDataset.
357 Raises
358 ------
359 FileNotFoundError
360 Requested dataset can not be retrieved.
361 TypeError
362 Return value from formatter has unexpected type.
363 ValueError
364 Formatter failed to process the dataset.
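
 Examples
 --------
 Illustrative only; ``ref`` is assumed to be a resolved `DatasetRef` and
 the available parameters depend on the dataset's `StorageClass` (the
 names below are placeholders):

 >>> subset = datastore.get(ref, parameters={"bbox": bbox})
 >>> converted = datastore.get(ref, storageClass="SomeOtherStorageClass")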
365 """
367 for datastore in self.datastores:
368 try:
369 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
370 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
371 return inMemoryObject
372 except FileNotFoundError:
373 pass
375 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))
377 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
378 """Write a InMemoryDataset with a given `DatasetRef` to each
379 datastore.
381 The put() to child datastores can fail with
382 `DatasetTypeNotSupportedError`. The put() for this datastore will be
383 deemed to have succeeded so long as at least one child datastore
384 accepted the inMemoryDataset.
386 Parameters
387 ----------
388 inMemoryDataset : `object`
389 The dataset to store.
390 ref : `DatasetRef`
391 Reference to the associated Dataset.
393 Raises
394 ------
395 TypeError
396 Supplied object and storage class are inconsistent.
397 DatasetTypeNotSupportedError
398 All datastores reported `DatasetTypeNotSupportedError`.
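
 Examples
 --------
 A sketch of the failure mode described above; ``inMemoryDataset`` and
 ``ref`` are assumed to exist already:

 >>> try:
 ...     datastore.put(inMemoryDataset, ref)
 ... except DatasetTypeNotSupportedError:
 ...     # Raised only if every child datastore rejected the dataset.
 ...     ...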
399 """
400 log.debug("Put %s", ref)
402 # Confirm that we can accept this dataset
403 if not self.constraints.isAcceptable(ref):
404 # Raise rather than use boolean return value.
405 raise DatasetTypeNotSupportedError(
406 f"Dataset {ref} has been rejected by this datastore via configuration."
407 )
409 isPermanent = False
410 nsuccess = 0
411 npermanent = 0
412 nephemeral = 0
413 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
414 if constraints is not None and not constraints.isAcceptable(ref):
415 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
416 continue
418 if datastore.isEphemeral:
419 nephemeral += 1
420 else:
421 npermanent += 1
422 try:
423 datastore.put(inMemoryDataset, ref)
424 nsuccess += 1
425 if not datastore.isEphemeral:
426 isPermanent = True
427 except DatasetTypeNotSupportedError:
428 pass
430 if nsuccess == 0:
431 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
433 if not isPermanent and npermanent > 0:
434 warnings.warn(f"Put of {ref} only succeeded in ephemeral databases", stacklevel=2)
436 if self._transaction is not None:
437 self._transaction.registerUndo("put", self.remove, ref)
439 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
440 # Docstring inherited from base class.
441 if transfer != "auto":
442 return transfer
443 # Ask each datastore what they think auto means
444 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
446 # Remove any untranslated "auto" values
447 transfers.discard(transfer)
449 if len(transfers) == 1:
450 return transfers.pop()
451 if not transfers:
452 # Everything reported "auto"
453 return transfer
455 raise RuntimeError(
456 "Chained datastore does not yet support different transfer modes"
457 f" from 'auto' in each child datastore (wanted {transfers})"
458 )
460 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
461 # Docstring inherited from Datastore._prepIngest.
462 if transfer is None:
463 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
465 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
466 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
467 if not acceptable:
468 log.debug(
469 "Datastore %s skipping ingest via configuration for refs %s",
470 name,
471 ", ".join(str(ref) for ref in dataset.refs),
472 )
473 return False
474 else:
475 return True
477 # Filter down to just datasets the chained datastore's own
478 # configuration accepts.
479 okForParent: List[FileDataset] = [
480 dataset
481 for dataset in datasets
482 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
483 ]
485 # Iterate over nested datastores and call _prepIngest on each.
486 # Save the results to a list:
487 children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
488 # ...and remember whether all of the failures are due to
489 # NotImplementedError being raised.
490 allFailuresAreNotImplementedError = True
491 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
492 okForChild: List[FileDataset]
493 if constraints is not None:
494 okForChild = [
495 dataset
496 for dataset in okForParent
497 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
498 ]
499 else:
500 okForChild = okForParent
501 try:
502 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
503 except NotImplementedError:
504 log.debug(
505 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
506 datastore.name,
507 transfer,
508 )
509 continue
510 allFailuresAreNotImplementedError = False
511 if okForChild:
512 # Do not store for later if a datastore has rejected
513 # everything.
514 # Include the source paths if this is a "move". It's clearer
515 # to find the paths now rather than try to infer how
516 # each datastore has stored them in the internal prep class.
517 paths = (
518 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set()
519 )
520 children.append((datastore, prepDataForChild, paths))
521 if allFailuresAreNotImplementedError:
522 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
523 return _IngestPrepData(children=children)
525 def _finishIngest(
526 self,
527 prepData: _IngestPrepData,
528 *,
529 transfer: Optional[str] = None,
530 record_validation_info: bool = True,
531 ) -> None:
532 # Docstring inherited from Datastore._finishIngest.
533 # For "move" we must use "copy" and then delete the input
534 # data at the end. This has no rollback option if the ingest
535 # subsequently fails. If there is only one active datastore
536 # accepting any files we can leave it as "move"
537 actual_transfer: str | None
538 if transfer == "move" and len(prepData.children) > 1:
539 actual_transfer = "copy"
540 else:
541 actual_transfer = transfer
542 to_be_deleted: set[ResourcePath] = set()
543 for datastore, prepDataForChild, paths in prepData.children:
544 datastore._finishIngest(
545 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
546 )
547 to_be_deleted.update(paths)
548 if actual_transfer != transfer:
549 # These datasets were copied but now need to be deleted.
550 # This can not be rolled back.
551 for uri in to_be_deleted:
552 uri.remove()
554 def getManyURIs(
555 self,
556 refs: Iterable[DatasetRef],
557 predict: bool = False,
558 allow_missing: bool = False,
559 ) -> Dict[DatasetRef, DatasetRefURIs]:
560 # Docstring inherited
562 uris: Dict[DatasetRef, DatasetRefURIs] = {}
563 missing_refs = set(refs)
565 # If predict is True we don't want to predict a dataset in the first
566 # datastore if it actually exists in a later datastore, so in that
567 # case check all datastores with predict=False first, and then try
568 # again with predict=True.
569 for p in (False, True) if predict else (False,):
570 if not missing_refs:
571 break
572 for datastore in self.datastores:
573 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
574 missing_refs -= got_uris.keys()
575 uris.update(got_uris)
576 if not missing_refs:
577 break
579 if missing_refs and not allow_missing:
580 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
582 return uris
584 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
585 """Return URIs associated with dataset.
587 Parameters
588 ----------
589 ref : `DatasetRef`
590 Reference to the required dataset.
591 predict : `bool`, optional
592 If the datastore does not know about the dataset, should it
593 return a predicted URI or not?
595 Returns
596 -------
597 uris : `DatasetRefURIs`
598 The URI to the primary artifact associated with this dataset (if
599 the dataset was disassembled within the datastore this may be
600 `None`), and the URIs to any components associated with the dataset
 601 artifact (this can be empty if there are no components).
603 Notes
604 -----
 605 The returned URI comes from the first datastore in the chain that has
 606 the dataset, with preference given to permanent datastores over
 607 ephemeral ones. If no datastore has the dataset and prediction is
 608 allowed, a predicted URI is returned, again preferring permanent
 609 datastores over ephemeral ones.
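
 Examples
 --------
 Illustrative only; ``ref`` is assumed to be known to at least one child
 datastore (or ``predict=True`` is passed):

 >>> primary, components = datastore.getURIs(ref, predict=True)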
610 """
611 log.debug("Requesting URIs for %s", ref)
612 predictedUri: Optional[DatasetRefURIs] = None
613 predictedEphemeralUri: Optional[DatasetRefURIs] = None
614 firstEphemeralUri: Optional[DatasetRefURIs] = None
615 for datastore in self.datastores:
616 if datastore.exists(ref):
617 if not datastore.isEphemeral:
618 uri = datastore.getURIs(ref)
619 log.debug("Retrieved non-ephemeral URI: %s", uri)
620 return uri
621 elif not firstEphemeralUri:
622 firstEphemeralUri = datastore.getURIs(ref)
623 elif predict:
624 if not predictedUri and not datastore.isEphemeral:
625 predictedUri = datastore.getURIs(ref, predict)
626 elif not predictedEphemeralUri and datastore.isEphemeral:
627 predictedEphemeralUri = datastore.getURIs(ref, predict)
629 if firstEphemeralUri:
630 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
631 return firstEphemeralUri
633 if predictedUri:
634 log.debug("Retrieved predicted URI: %s", predictedUri)
635 return predictedUri
637 if predictedEphemeralUri:
638 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
639 return predictedEphemeralUri
641 raise FileNotFoundError("Dataset {} not in any datastore".format(ref))
643 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
644 """URI to the Dataset.
 646 The returned URI comes from the first datastore in the chain that has
 647 the dataset, with preference given to permanent datastores over
 648 ephemeral ones. If no datastore has the dataset and prediction is
 649 allowed, a predicted URI is returned, again preferring permanent
 650 datastores over ephemeral ones.
652 Parameters
653 ----------
654 ref : `DatasetRef`
655 Reference to the required Dataset.
656 predict : `bool`
657 If `True`, allow URIs to be returned of datasets that have not
658 been written.
660 Returns
661 -------
662 uri : `lsst.resources.ResourcePath`
663 URI pointing to the dataset within the datastore. If the
664 dataset does not exist in the datastore, and if ``predict`` is
665 `True`, the URI will be a prediction and will include a URI
666 fragment "#predicted".
668 Notes
669 -----
 670 If the datastore does not have entities that map well onto
 671 the concept of a URI the returned URI will merely be
 672 descriptive. The returned URI is not guaranteed to be accessible.
674 Raises
675 ------
676 FileNotFoundError
677 A URI has been requested for a dataset that does not exist and
678 guessing is not allowed.
679 RuntimeError
680 Raised if a request is made for a single URI but multiple URIs
681 are associated with this dataset.
682 """
683 log.debug("Requesting URI for %s", ref)
684 primary, components = self.getURIs(ref, predict)
685 if primary is None or components:
686 raise RuntimeError(
687 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
688 )
689 return primary
691 def retrieveArtifacts(
692 self,
693 refs: Iterable[DatasetRef],
694 destination: ResourcePath,
695 transfer: str = "auto",
696 preserve_path: bool = True,
697 overwrite: bool = False,
698 ) -> List[ResourcePath]:
699 """Retrieve the file artifacts associated with the supplied refs.
701 Parameters
702 ----------
703 refs : iterable of `DatasetRef`
704 The datasets for which file artifacts are to be retrieved.
705 A single ref can result in multiple files. The refs must
706 be resolved.
707 destination : `lsst.resources.ResourcePath`
708 Location to write the file artifacts.
709 transfer : `str`, optional
710 Method to use to transfer the artifacts. Must be one of the options
711 supported by `lsst.resources.ResourcePath.transfer_from()`.
712 "move" is not allowed.
713 preserve_path : `bool`, optional
714 If `True` the full path of the file artifact within the datastore
715 is preserved. If `False` the final file component of the path
716 is used.
717 overwrite : `bool`, optional
718 If `True` allow transfers to overwrite existing files at the
719 destination.
721 Returns
722 -------
723 targets : `list` of `lsst.resources.ResourcePath`
724 URIs of file artifacts in destination location. Order is not
725 preserved.
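
 Examples
 --------
 A sketch; ``refs`` and the destination directory are assumed to exist
 already:

 >>> dest = ResourcePath("transferred/", forceDirectory=True)
 >>> copied = datastore.retrieveArtifacts(refs, dest, transfer="copy")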
726 """
727 if not destination.isdir():
728 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
730 # Using getURIs is not feasible since it becomes difficult to
731 # determine the path within the datastore later on. For now
732 # follow getURIs implementation approach.
734 pending = set(refs)
736 # There is a question as to whether an exception should be raised
737 # early if some of the refs are missing, or whether files should be
738 # transferred until a problem is hit. Prefer to complain up front.
739 # Use the datastore integer as primary key.
740 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {}
742 for number, datastore in enumerate(self.datastores):
743 if datastore.isEphemeral:
744 # In the future we will want to distinguish in-memory from
745 # caching datastore since using an on-disk local
746 # cache is exactly what we should be doing.
747 continue
748 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
750 if datastore_refs:
751 grouped_by_datastore[number] = datastore_refs
753 # Remove these from the pending list so that we do not bother
754 # looking for them any more.
755 pending = pending - datastore_refs
757 if pending:
758 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
760 # Now do the transfer.
761 targets: List[ResourcePath] = []
762 for number, datastore_refs in grouped_by_datastore.items():
763 targets.extend(
764 self.datastores[number].retrieveArtifacts(
765 datastore_refs,
766 destination,
767 transfer=transfer,
768 preserve_path=preserve_path,
769 overwrite=overwrite,
770 )
771 )
773 return targets
775 def remove(self, ref: DatasetRef) -> None:
776 """Indicate to the datastore that a dataset can be removed.
778 The dataset will be removed from each datastore. The dataset is
779 not required to exist in every child datastore.
781 Parameters
782 ----------
783 ref : `DatasetRef`
784 Reference to the required dataset.
786 Raises
787 ------
788 FileNotFoundError
789 Attempt to remove a dataset that does not exist. Raised if none
790 of the child datastores removed the dataset.
791 """
792 log.debug("Removing %s", ref)
793 self.trash(ref, ignore_errors=False)
794 self.emptyTrash(ignore_errors=False)
796 def forget(self, refs: Iterable[DatasetRef]) -> None:
797 for datastore in tuple(self.datastores):
798 datastore.forget(refs)
800 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
801 if isinstance(ref, DatasetRef):
802 ref_label = str(ref)
803 else:
804 ref_label = "bulk datasets"
806 log.debug("Trashing %s", ref_label)
808 counter = 0
809 for datastore in self.datastores:
810 try:
811 datastore.trash(ref, ignore_errors=ignore_errors)
812 counter += 1
813 except FileNotFoundError:
814 pass
816 if counter == 0:
817 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
818 if ignore_errors:
819 log.warning(err_msg)
820 else:
821 raise FileNotFoundError(err_msg)
823 def emptyTrash(self, ignore_errors: bool = True) -> None:
824 for datastore in self.datastores:
825 datastore.emptyTrash(ignore_errors=ignore_errors)
827 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
828 """Retrieve a dataset from an input `Datastore`,
829 and store the result in this `Datastore`.
831 Parameters
832 ----------
833 inputDatastore : `Datastore`
 834 The external `Datastore` from which to retrieve the Dataset.
835 ref : `DatasetRef`
836 Reference to the required dataset in the input data store.
 838 Notes
 839 -----
 840 The dataset is read once from ``inputDatastore`` and then written to
 841 every accepting child datastore via `put`; nothing is returned.
843 """
844 assert inputDatastore is not self # unless we want it for renames?
845 inMemoryDataset = inputDatastore.get(ref)
846 self.put(inMemoryDataset, ref)
848 def validateConfiguration(
849 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
850 ) -> None:
851 """Validate some of the configuration for this datastore.
853 Parameters
854 ----------
855 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
856 Entities to test against this configuration. Can be differing
857 types.
858 logFailures : `bool`, optional
859 If `True`, output a log message for every validation error
860 detected.
862 Raises
863 ------
864 DatastoreValidationError
865 Raised if there is a validation problem with a configuration.
866 All the problems are reported in a single exception.
868 Notes
869 -----
870 This method checks each datastore in turn.
871 """
873 # Need to catch each of the datastore outputs and ensure that
874 # all are tested.
875 failures = []
876 for datastore in self.datastores:
877 try:
878 datastore.validateConfiguration(entities, logFailures=logFailures)
879 except DatastoreValidationError as e:
880 if logFailures:
881 log.critical("Datastore %s failed validation", datastore.name)
882 failures.append(f"Datastore {self.name}: {e}")
884 if failures:
885 msg = ";\n".join(failures)
886 raise DatastoreValidationError(msg)
888 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
889 # Docstring is inherited from base class
890 failures = []
891 for datastore in self.datastores:
892 try:
893 datastore.validateKey(lookupKey, entity)
894 except DatastoreValidationError as e:
895 failures.append(f"Datastore {self.name}: {e}")
897 if failures:
898 msg = ";\n".join(failures)
899 raise DatastoreValidationError(msg)
901 def getLookupKeys(self) -> Set[LookupKey]:
902 # Docstring is inherited from base class
903 keys = set()
904 for datastore in self.datastores:
905 keys.update(datastore.getLookupKeys())
907 keys.update(self.constraints.getLookupKeys())
908 for p in self.datastoreConstraints:
909 if p is not None:
910 keys.update(p.getLookupKeys())
912 return keys
914 def needs_expanded_data_ids(
915 self,
916 transfer: Optional[str],
917 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
918 ) -> bool:
919 # Docstring inherited.
920 # We can't safely use `self.datastoreConstraints` with `entity` to
921 # check whether a child datastore would even want to ingest this
922 # dataset, because we don't want to filter out datastores that might
923 need an expanded data ID based on incomplete information (e.g. we
924 # pass a StorageClass, but the constraint dispatches on DatasetType).
925 # So we pessimistically check if any datastore would need an expanded
926 # data ID for this transfer mode.
927 return any(datastore.needs_expanded_data_ids(transfer) for datastore in self.datastores)
929 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
930 # Docstring inherited from the base class.
932 for datastore in self.datastores:
933 datastore.import_records(data)
935 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
936 # Docstring inherited from the base class.
938 all_records: Dict[str, DatastoreRecordData] = {}
940 # Merge all sub-datastore records into one structure
941 for datastore in self.datastores:
942 sub_records = datastore.export_records(refs)
943 for name, record_data in sub_records.items():
944 # All datastore names must be unique in a chain.
945 if name in all_records:
946 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
947 all_records[name] = record_data
949 return all_records
951 def export(
952 self,
953 refs: Iterable[DatasetRef],
954 *,
955 directory: Optional[ResourcePathExpression] = None,
956 transfer: Optional[str] = "auto",
957 ) -> Iterable[FileDataset]:
958 # Docstring inherited from Datastore.export.
959 if transfer == "auto" and directory is None:
960 transfer = None
962 if transfer is not None and directory is None:
963 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
965 if transfer == "move":
966 raise TypeError("Can not export by moving files out of datastore.")
968 # Exporting from a chain has the potential for a dataset to be
969 # in one or more of the datastores in the chain. We only need one
970 # of them since we assume the datasets are the same in all (but
971 # the file format could be different of course since that is a
972 # per-datastore configuration).
973 # We also do not know whether any of the datastores in the chain
974 # support file export.
976 # Ensure we have an ordered sequence that is not an iterator or set.
977 if not isinstance(refs, Sequence):
978 refs = list(refs)
980 # If any of the datasets are missing entirely we need to raise early
981 # before we try to run the export. This can be a little messy but is
982 better than exporting files from the first datastore and then
983 discovering that a dataset missing from it is absent from the second datastore too.
984 known = [datastore.knows_these(refs) for datastore in self.datastores]
985 refs_known: set[DatasetRef] = set()
986 for known_to_this in known:
987 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
988 missing_count = len(refs) - len(refs_known)
989 if missing_count:
990 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")
992 # To allow us to slot each result into the right place after
993 # asking each datastore, create a dict with the index.
994 ref_positions = {ref: i for i, ref in enumerate(refs)}
996 # Presize the final export list.
997 exported: list[FileDataset | None] = [None] * len(refs)
999 # The order of the returned dataset has to match the order of the
1000 # given refs, even if they are all from different datastores.
1001 for i, datastore in enumerate(self.datastores):
1002 known_to_this = known[i]
1003 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]
1005 try:
1006 this_export = datastore.export(filtered, directory=directory, transfer=transfer)
1007 except NotImplementedError:
1008 # Try the next datastore.
1009 continue
1011 for ref, export in zip(filtered, this_export):
1012 # Get the position and also delete it from the list.
1013 exported[ref_positions.pop(ref)] = export
1015 # Every dataset should be accounted for because of the earlier checks
1016 # but make sure that we did fill all the slots to appease mypy.
1017 for i, dataset in enumerate(exported):
1018 if dataset is None:
1019 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
1020 yield dataset