Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 86% (477 statements)
coverage.py v7.4.3, created at 2024-03-07 11:02 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Chained datastore."""
30from __future__ import annotations
32__all__ = ("ChainedDatastore",)
34import itertools
35import logging
36import time
37import warnings
38from collections.abc import Collection, Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any
41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset
42from lsst.daf.butler.datastore import (
43 DatasetRefURIs,
44 Datastore,
45 DatastoreConfig,
46 DatastoreOpaqueTable,
47 DatastoreValidationError,
48)
49from lsst.daf.butler.datastore.constraints import Constraints
50from lsst.daf.butler.datastore.record_data import DatastoreRecordData
51from lsst.resources import ResourcePath
52from lsst.utils import doImportType
54if TYPE_CHECKING:
55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
57 from lsst.resources import ResourcePathExpression
59log = logging.getLogger(__name__)
62class _IngestPrepData(Datastore.IngestPrepData):
63 """Helper class for ChainedDatastore ingest implementation.
65 Parameters
66 ----------
67 children : `list` of `tuple`
68 Triples of `Datastore`, `IngestPrepData`, and source paths for all child datastores.
69 """
71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
73 self.children = children
76class ChainedDatastore(Datastore):
77 """Chained Datastores to allow read and writes from multiple datastores.
79 A ChainedDatastore is configured with multiple datastore configurations.
80 A ``put()`` is always sent to each datastore. A ``get()``
81 operation is sent to each datastore in turn and the first datastore
82 to return a valid dataset is used.
84 Parameters
85 ----------
86 config : `DatastoreConfig` or `str`
87 Configuration. This configuration must include a ``datastores`` field
88 as a sequence of datastore configurations. The order in this sequence
89 indicates the order to use for read operations.
90 bridgeManager : `DatastoreRegistryBridgeManager`
91 Object that manages the interface between `Registry` and datastores.
92 datastores : `list` [`Datastore`]
93 All the child datastores known to this datastore.
95 Notes
96 -----
97 ChainedDatastore never supports `None` as an `ingest` transfer mode. It
98 supports `"copy"`, `"move"`, `"symlink"`, `"relsymlink"`
99 and `"hardlink"` if and only if all its child datastores do.
100 """
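    # Illustrative configuration sketch (an assumption for clarity, not the
    # shipped defaults; the child datastore classes and root shown are
    # examples). The parent config carries an ordered "datastores" list and
    # may add a parallel "datastore_constraints" list with one entry per
    # child:
    #
    #     config = DatastoreConfig(
    #         {
    #             "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
    #             "datastores": [
    #                 {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
    #                 {
    #                     "cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
    #                     "root": "/hypothetical/repo/root",
    #                 },
    #             ],
    #         }
    #     )
    #
    # Reads are attempted in list order; writes go to every child whose
    # constraints accept the dataset.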
102 defaultConfigFile = "datastores/chainedDatastore.yaml"
103 """Path to configuration defaults. Accessed within the ``configs`` resource
104 or relative to a search path. Can be None if no defaults specified.
105 """
107 containerKey = "datastores"
108 """Key to specify where child datastores are configured."""
110 datastores: list[Datastore]
111 """All the child datastores known to this datastore."""
113 datastoreConstraints: Sequence[Constraints | None]
114 """Constraints to be applied to each of the child datastores."""
116 @classmethod
117 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
118 """Set any filesystem-dependent config options for child Datastores to
119 be appropriate for a new empty repository with the given root.
121 Parameters
122 ----------
123 root : `str`
124 Filesystem path to the root of the data repository.
125 config : `Config`
126 A `Config` to update. Only the subset understood by
127 this component will be updated. Will not expand
128 defaults.
129 full : `Config`
130 A complete config with all defaults expanded that can be
131 converted to a `DatastoreConfig`. Read-only and will not be
132 modified by this method.
133 Repository-specific options that should not be obtained
134 from defaults when Butler instances are constructed
135 should be copied from ``full`` to ``config``.
136 overwrite : `bool`, optional
137 If `False`, do not modify a value in ``config`` if the value
138 already exists. Default is always to overwrite with the provided
139 ``root``.
141 Notes
142 -----
143 If a keyword is explicitly defined in the supplied ``config`` it
144 will not be overridden by this method if ``overwrite`` is `False`.
145 This allows explicit values set in external configs to be retained.
146 """
147 # Extract the part of the config we care about updating
148 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
150 # And the subset of the full config that we can use for reference.
151 # Do not bother with defaults because we are told this already has
152 # them.
153 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
155 # Loop over each datastore config and pass the subsets to the
156 # child datastores to process.
158 containerKey = cls.containerKey
159 for idx, (child, fullChild) in enumerate(
160 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True)
161 ):
162 childConfig = DatastoreConfig(child, mergeDefaults=False)
163 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
164 datastoreClass = doImportType(fullChildConfig["cls"])
165 if not issubclass(datastoreClass, Datastore):  # coverage: 165 ↛ 166, condition was never true
166 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
167 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}"
168 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
170 # Reattach to parent
171 datastoreConfig[containerKey, idx] = childConfig
173 # Reattach modified datastore config to parent
174 # If this has a datastore key we attach there, otherwise we assume
175 # this information goes at the top of the config hierarchy.
176 if DatastoreConfig.component in config:
177 config[DatastoreConfig.component] = datastoreConfig
178 else:
179 config.update(datastoreConfig)
181 return
183 def __init__(
184 self,
185 config: DatastoreConfig,
186 bridgeManager: DatastoreRegistryBridgeManager,
187 datastores: list[Datastore],
188 ):
189 super().__init__(config, bridgeManager)
191 self.datastores = list(datastores)
193 # Name ourself based on our children
194 if self.datastores:  # coverage: 194 ↛ 199, condition was never false
195 # We must set the names explicitly
196 self._names = [d.name for d in self.datastores]
197 childNames = ",".join(self.names)
198 else:
199 childNames = f"(empty@{time.time()})"
200 self._names = [childNames]
201 self.name = f"{type(self).__qualname__}[{childNames}]"
203 # We declare we are ephemeral if all our child datastores declare
204 # they are ephemeral
205 self.isEphemeral = all(d.isEphemeral for d in self.datastores)
207 # per-datastore override constraints
208 if "datastore_constraints" in self.config:
209 overrides = self.config["datastore_constraints"]
211 if len(overrides) != len(self.datastores):  # coverage: 211 ↛ 212, condition was never true
212 raise DatastoreValidationError(
213 f"Number of registered datastores ({len(self.datastores)})"
214 " differs from number of constraints overrides"
215 f" {len(overrides)}"
216 )
218 self.datastoreConstraints = [
219 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
220 ]
222 else:
223 self.datastoreConstraints = (None,) * len(self.datastores)
225 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
227 @classmethod
228 def _create_from_config(
229 cls,
230 config: DatastoreConfig,
231 bridgeManager: DatastoreRegistryBridgeManager,
232 butlerRoot: ResourcePathExpression | None,
233 ) -> ChainedDatastore:
234 # Scan for child datastores and instantiate them with the same registry
235 datastores = []
236 for c in config["datastores"]:
237 c = DatastoreConfig(c)
238 datastoreType = doImportType(c["cls"])
239 if not issubclass(datastoreType, Datastore):  # coverage: 239 ↛ 240, condition was never true
240 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
241 datastore = datastoreType._create_from_config(c, bridgeManager, butlerRoot=butlerRoot)
242 log.debug("Creating child datastore %s", datastore.name)
243 datastores.append(datastore)
245 return ChainedDatastore(config, bridgeManager, datastores)
247 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore:
248 datastores = [ds.clone(bridgeManager) for ds in self.datastores]
249 return ChainedDatastore(self.config, bridgeManager, datastores)
251 @property
252 def names(self) -> tuple[str, ...]:
253 return tuple(self._names)
255 @property
256 def roots(self) -> dict[str, ResourcePath | None]:
257 # Docstring inherited.
258 roots = {}
259 for datastore in self.datastores:
260 roots.update(datastore.roots)
261 return roots
263 def __str__(self) -> str:
264 chainName = ", ".join(str(ds) for ds in self.datastores)
265 return chainName
267 def _set_trust_mode(self, mode: bool) -> None:
268 for datastore in self.datastores:
269 datastore._set_trust_mode(mode)
271 def knows(self, ref: DatasetRef) -> bool:
272 """Check if the dataset is known to any of the datastores.
274 Does not check for existence of any artifact.
276 Parameters
277 ----------
278 ref : `DatasetRef`
279 Reference to the required dataset.
281 Returns
282 -------
283 exists : `bool`
284 `True` if the dataset is known to the datastore.
285 """
286 for datastore in self.datastores:
287 if datastore.knows(ref):
288 log.debug("%s known to datastore %s", ref, datastore.name)
289 return True
290 return False
292 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
293 # Docstring inherited from the base class.
294 refs_known: dict[DatasetRef, bool] = {}
295 for datastore in self.datastores:
296 refs_known.update(datastore.knows_these(refs))
298 # No need to check in next datastore for refs that are known.
299 # We only update entries that were initially False.
300 refs = [ref for ref, known in refs_known.items() if not known]
302 return refs_known
304 def mexists(
305 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
306 ) -> dict[DatasetRef, bool]:
307 """Check the existence of multiple datasets at once.
309 Parameters
310 ----------
311 refs : iterable of `DatasetRef`
312 The datasets to be checked.
313 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
314 Optional mapping of datastore artifact to existence. Updated by
315 this method with details of all artifacts tested. Can be `None`
316 if the caller is not interested.
318 Returns
319 -------
320 existence : `dict` [`DatasetRef`, `bool`]
321 Mapping from dataset to boolean indicating existence in any
322 of the child datastores.
323 """
324 dataset_existence: dict[DatasetRef, bool] = {}
325 for datastore in self.datastores:
326 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
328 # For next datastore no point asking about ones we know
329 # exist already. No special exemption for ephemeral datastores.
330 refs = [ref for ref, exists in dataset_existence.items() if not exists]
332 return dataset_existence
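    # Minimal usage sketch (assumes ``datastore`` is this chained datastore
    # and ``refs`` is an iterable of resolved DatasetRef). A shared cache can
    # be passed so the same artifact is not tested twice across children:
    #
    #     cache: dict[ResourcePath, bool] = {}
    #     existence = datastore.mexists(refs, artifact_existence=cache)
    #     missing = [ref for ref, found in existence.items() if not found]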
334 def exists(self, ref: DatasetRef) -> bool:
335 """Check if the dataset exists in one of the datastores.
337 Parameters
338 ----------
339 ref : `DatasetRef`
340 Reference to the required dataset.
342 Returns
343 -------
344 exists : `bool`
345 `True` if the entity exists in one of the child datastores.
346 """
347 for datastore in self.datastores:
348 if datastore.exists(ref):
349 log.debug("Found %s in datastore %s", ref, datastore.name)
350 return True
351 return False
353 def get(
354 self,
355 ref: DatasetRef,
356 parameters: Mapping[str, Any] | None = None,
357 storageClass: StorageClass | str | None = None,
358 ) -> Any:
359 """Load an InMemoryDataset from the store.
361 The dataset is returned from the first datastore that has
362 the dataset.
364 Parameters
365 ----------
366 ref : `DatasetRef`
367 Reference to the required Dataset.
368 parameters : `dict`
369 `StorageClass`-specific parameters that specify, for example,
370 a slice of the dataset to be loaded.
371 storageClass : `StorageClass` or `str`, optional
372 The storage class to be used to override the Python type
373 returned by this method. By default the returned type matches
374 the dataset type definition for this dataset. Specifying a
375 read `StorageClass` can force a different type to be returned.
376 This type must be compatible with the original type.
378 Returns
379 -------
380 inMemoryDataset : `object`
381 Requested dataset or slice thereof as an InMemoryDataset.
383 Raises
384 ------
385 FileNotFoundError
386 Requested dataset can not be retrieved.
387 TypeError
388 Return value from formatter has unexpected type.
389 ValueError
390 Formatter failed to process the dataset.
391 """
392 for datastore in self.datastores:
393 try:
394 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
395 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
396 return inMemoryObject
397 except FileNotFoundError:
398 pass
400 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores")
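    # Minimal usage sketch (``datastore`` and ``ref`` are assumed to exist).
    # The first child able to serve the dataset wins; parameters and any read
    # storage class override are forwarded unchanged to each child:
    #
    #     try:
    #         data = datastore.get(ref)
    #     except FileNotFoundError:
    #         ...  # no child datastore holds the dataset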
402 def prepare_get_for_external_client(self, ref: DatasetRef) -> object | None:
403 datastore = self._get_matching_datastore(ref)
404 if datastore is None:  # coverage: 404 ↛ 407, condition was never false
405 return None
407 return datastore.prepare_get_for_external_client(ref)
409 def _get_matching_datastore(self, ref: DatasetRef) -> Datastore | None:
410 """Return the first child datastore that owns the specified dataset."""
411 for datastore in self.datastores:
412 if datastore.knows(ref):  # coverage: 412 ↛ 413, condition was never true
413 return datastore
415 return None
417 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
418 """Write an InMemoryDataset with a given `DatasetRef` to each
419 datastore.
421 The put() to child datastores can fail with
422 `DatasetTypeNotSupportedError`. The put() for this datastore will be
423 deemed to have succeeded so long as at least one child datastore
424 accepted the inMemoryDataset.
426 Parameters
427 ----------
428 inMemoryDataset : `object`
429 The dataset to store.
430 ref : `DatasetRef`
431 Reference to the associated Dataset.
433 Raises
434 ------
435 TypeError
436 Supplied object and storage class are inconsistent.
437 DatasetTypeNotSupportedError
438 All datastores reported `DatasetTypeNotSupportedError`.
439 """
440 log.debug("Put %s", ref)
442 # Confirm that we can accept this dataset
443 if not self.constraints.isAcceptable(ref):
444 # Raise rather than use boolean return value.
445 raise DatasetTypeNotSupportedError(
446 f"Dataset {ref} has been rejected by this datastore via configuration."
447 )
449 isPermanent = False
450 nsuccess = 0
451 npermanent = 0
452 nephemeral = 0
453 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
454 if (
455 constraints is not None and not constraints.isAcceptable(ref)
456 ) or not datastore.constraints.isAcceptable(ref):
457 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
458 continue
460 if datastore.isEphemeral:
461 nephemeral += 1
462 else:
463 npermanent += 1
464 try:
465 datastore.put(inMemoryDataset, ref)
466 nsuccess += 1
467 if not datastore.isEphemeral:
468 isPermanent = True
469 except DatasetTypeNotSupportedError:
470 pass
472 if nsuccess == 0:
473 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
475 if not isPermanent and npermanent > 0:  # coverage: 475 ↛ 476, condition was never true
476 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
478 if self._transaction is not None:
479 self._transaction.registerUndo("put", self.remove, ref)
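    # Minimal usage sketch (``datastore``, ``obj`` and ``ref`` are assumed).
    # The put succeeds as long as at least one child accepts the dataset and
    # raises only when the parent constraints or every child reject it:
    #
    #     try:
    #         datastore.put(obj, ref)
    #     except DatasetTypeNotSupportedError:
    #         ...  # rejected by configuration everywhere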
481 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
482 # Docstring inherited from base class.
483 log.debug("Put %s", ref)
485 # Confirm that we can accept this dataset
486 if not self.constraints.isAcceptable(ref):
487 # Raise rather than use boolean return value.
488 raise DatasetTypeNotSupportedError(
489 f"Dataset {ref} has been rejected by this datastore via configuration."
490 )
492 isPermanent = False
493 nsuccess = 0
494 npermanent = 0
495 nephemeral = 0
496 stored_refs: dict[str, DatasetRef] = {}
497 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
498 if (
499 constraints is not None and not constraints.isAcceptable(ref)
500 ) or not datastore.constraints.isAcceptable(ref):
501 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
502 continue
504 if datastore.isEphemeral:
505 nephemeral += 1
506 else:
507 npermanent += 1
508 try:
509 stored_ref_map = datastore.put_new(in_memory_dataset, ref)
510 stored_refs.update(stored_ref_map)
511 nsuccess += 1
512 if not datastore.isEphemeral:
513 isPermanent = True
514 except DatasetTypeNotSupportedError:
515 pass
517 if nsuccess == 0:
518 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
520 if not isPermanent and npermanent > 0:
521 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
523 if self._transaction is not None:
524 self._transaction.registerUndo("put", self.remove, ref)
526 return stored_refs
528 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
529 # Docstring inherited from base class.
530 if transfer != "auto":
531 return transfer
532 # Ask each datastore what they think auto means
533 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
535 # Remove any untranslated "auto" values
536 transfers.discard(transfer)
538 if len(transfers) == 1:  # coverage: 538 ↛ 539, condition was never true
539 return transfers.pop()
540 if not transfers:  # coverage: 540 ↛ 544, condition was never false
541 # Everything reported "auto"
542 return transfer
544 raise RuntimeError(
545 "Chained datastore does not yet support different transfer modes"
546 f" from 'auto' in each child datastore (wanted {transfers})"
547 )
549 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
550 # Docstring inherited from Datastore._prepIngest.
551 if transfer is None:
552 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
554 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
555 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
556 if not acceptable:
557 log.debug(
558 "Datastore %s skipping ingest via configuration for refs %s",
559 name,
560 ", ".join(str(ref) for ref in dataset.refs),
561 )
562 return False
563 else:
564 return True
566 # Filter down to just datasets the chained datastore's own
567 # configuration accepts.
568 okForParent: list[FileDataset] = [
569 dataset
570 for dataset in datasets
571 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
572 ]
574 # Iterate over nested datastores and call _prepIngest on each.
575 # Save the results to a list:
576 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
577 # ...and remember whether all of the failures are due to
578 # NotImplementedError being raised.
579 allFailuresAreNotImplementedError = True
580 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
581 okForChild: list[FileDataset]
582 if constraints is not None:
583 okForChild = [
584 dataset
585 for dataset in okForParent
586 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
587 ]
588 else:
589 okForChild = okForParent
590 try:
591 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
592 except NotImplementedError:
593 log.debug(
594 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
595 datastore.name,
596 transfer,
597 )
598 continue
599 allFailuresAreNotImplementedError = False
600 if okForChild:
601 # Do not store for later if a datastore has rejected
602 # everything.
603 # Include the source paths if this is a "move". It's clearer
604 # to find the paths now rather than try to infer how
605 # each datastore has stored them in the internal prep class.
606 paths = (
607 {ResourcePath(dataset.path, forceDirectory=False) for dataset in okForChild}
608 if transfer == "move"
609 else set()
610 )
611 children.append((datastore, prepDataForChild, paths))
612 if allFailuresAreNotImplementedError:
613 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
614 return _IngestPrepData(children=children)
616 def _finishIngest(
617 self,
618 prepData: _IngestPrepData,
619 *,
620 transfer: str | None = None,
621 record_validation_info: bool = True,
622 ) -> None:
623 # Docstring inherited from Datastore._finishIngest.
624 # For "move" we must use "copy" and then delete the input
625 # data at the end. This has no rollback option if the ingest
626 # subsequently fails. If there is only one active datastore
627 # accepting any files we can leave it as "move"
628 actual_transfer: str | None
629 if transfer == "move" and len(prepData.children) > 1:
630 actual_transfer = "copy"
631 else:
632 actual_transfer = transfer
633 to_be_deleted: set[ResourcePath] = set()
634 for datastore, prepDataForChild, paths in prepData.children:
635 datastore._finishIngest(
636 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
637 )
638 to_be_deleted.update(paths)
639 if actual_transfer != transfer:
640 # These datasets were copied but now need to be deleted.
641 # This can not be rolled back.
642 for uri in to_be_deleted:
643 uri.remove()
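    # Minimal ingest sketch (``datastore`` and ``ref`` are assumed; the file
    # path is hypothetical). Ingest is driven through the inherited
    # `Datastore.ingest`, which calls `_prepIngest` and `_finishIngest`; a
    # "move" into more than one accepting child is performed as per-child
    # copies followed by removal of the source files:
    #
    #     dataset = FileDataset(path="/hypothetical/file.fits", refs=[ref])
    #     datastore.ingest(dataset, transfer="copy")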
645 def getManyURIs(
646 self,
647 refs: Iterable[DatasetRef],
648 predict: bool = False,
649 allow_missing: bool = False,
650 ) -> dict[DatasetRef, DatasetRefURIs]:
651 # Docstring inherited
653 uris: dict[DatasetRef, DatasetRefURIs] = {}
654 missing_refs = set(refs)
656 # If predict is True we don't want to predict a dataset in the first
657 # datastore if it actually exists in a later datastore, so in that
658 # case check all datastores with predict=False first, and then try
659 # again with predict=True.
660 for p in (False, True) if predict else (False,):
661 if not missing_refs:
662 break
663 for datastore in self.datastores:
664 try:
665 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
666 except NotImplementedError:
667 # some datastores may not implement generating URIs
668 continue
669 missing_refs -= got_uris.keys()
670 uris.update(got_uris)
671 if not missing_refs:
672 break
674 if missing_refs and not allow_missing:
675 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
677 return uris
679 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
680 """Return URIs associated with dataset.
682 Parameters
683 ----------
684 ref : `DatasetRef`
685 Reference to the required dataset.
686 predict : `bool`, optional
687 If the datastore does not know about the dataset, controls whether
688 it should return a predicted URI or not.
690 Returns
691 -------
692 uris : `DatasetRefURIs`
693 The URI to the primary artifact associated with this dataset (if
694 the dataset was disassembled within the datastore this may be
695 `None`), and the URIs to any components associated with the dataset
696 artifact. (can be empty if there are no components).
698 Notes
699 -----
700 The returned URI is from the first datastore in the list that has
701 the dataset, with preference given to permanent datastores over
702 ephemeral ones. If no datastores have the dataset and prediction
703 is allowed, the predicted URI for the first datastore in the list will
704 be returned.
705 """
706 log.debug("Requesting URIs for %s", ref)
707 predictedUri: DatasetRefURIs | None = None
708 predictedEphemeralUri: DatasetRefURIs | None = None
709 firstEphemeralUri: DatasetRefURIs | None = None
710 for datastore in self.datastores:
711 if datastore.exists(ref):
712 if not datastore.isEphemeral:
713 uri = datastore.getURIs(ref)
714 log.debug("Retrieved non-ephemeral URI: %s", uri)
715 return uri
716 elif not firstEphemeralUri:
717 firstEphemeralUri = datastore.getURIs(ref)
718 elif predict:
719 if not predictedUri and not datastore.isEphemeral:
720 predictedUri = datastore.getURIs(ref, predict)
721 elif not predictedEphemeralUri and datastore.isEphemeral:
722 predictedEphemeralUri = datastore.getURIs(ref, predict)
724 if firstEphemeralUri:
725 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
726 return firstEphemeralUri
728 if predictedUri:
729 log.debug("Retrieved predicted URI: %s", predictedUri)
730 return predictedUri
732 if predictedEphemeralUri:
733 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
734 return predictedEphemeralUri
736 raise FileNotFoundError(f"Dataset {ref} not in any datastore")
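    # Minimal usage sketch (``datastore`` and ``ref`` are assumed). Existing
    # artifacts in permanent children are preferred over ephemeral ones, and a
    # predicted URI is returned only when no child has the artifact:
    #
    #     primary, components = datastore.getURIs(ref, predict=True)
    #     if primary is not None and str(primary).endswith("#predicted"):
    #         ...  # prediction only; the artifact has not been written yet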
738 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
739 """URI to the Dataset.
741 The returned URI is from the first datastore in the list that has
742 the dataset, with preference given to permanent datastores over
743 ephemeral ones. If no datastores have the dataset and prediction
744 is allowed, the predicted URI for the first datastore in the list will
745 be returned.
747 Parameters
748 ----------
749 ref : `DatasetRef`
750 Reference to the required Dataset.
751 predict : `bool`
752 If `True`, allow URIs to be returned of datasets that have not
753 been written.
755 Returns
756 -------
757 uri : `lsst.resources.ResourcePath`
758 URI pointing to the dataset within the datastore. If the
759 dataset does not exist in the datastore, and if ``predict`` is
760 `True`, the URI will be a prediction and will include a URI
761 fragment "#predicted".
763 Notes
764 -----
765 If the datastore does not have entities that relate well
766 to the concept of a URI the returned URI string will be
767 descriptive. The returned URI is not guaranteed to be obtainable.
769 Raises
770 ------
771 FileNotFoundError
772 A URI has been requested for a dataset that does not exist and
773 guessing is not allowed.
774 RuntimeError
775 Raised if a request is made for a single URI but multiple URIs
776 are associated with this dataset.
777 """
778 log.debug("Requesting URI for %s", ref)
779 primary, components = self.getURIs(ref, predict)
780 if primary is None or components:  # coverage: 780 ↛ 781, condition was never true
781 raise RuntimeError(
782 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
783 )
784 return primary
786 def retrieveArtifacts(
787 self,
788 refs: Iterable[DatasetRef],
789 destination: ResourcePath,
790 transfer: str = "auto",
791 preserve_path: bool = True,
792 overwrite: bool = False,
793 ) -> list[ResourcePath]:
794 """Retrieve the file artifacts associated with the supplied refs.
796 Parameters
797 ----------
798 refs : iterable of `DatasetRef`
799 The datasets for which file artifacts are to be retrieved.
800 A single ref can result in multiple files. The refs must
801 be resolved.
802 destination : `lsst.resources.ResourcePath`
803 Location to write the file artifacts.
804 transfer : `str`, optional
805 Method to use to transfer the artifacts. Must be one of the options
806 supported by `lsst.resources.ResourcePath.transfer_from()`.
807 "move" is not allowed.
808 preserve_path : `bool`, optional
809 If `True` the full path of the file artifact within the datastore
810 is preserved. If `False` the final file component of the path
811 is used.
812 overwrite : `bool`, optional
813 If `True` allow transfers to overwrite existing files at the
814 destination.
816 Returns
817 -------
818 targets : `list` of `lsst.resources.ResourcePath`
819 URIs of file artifacts in destination location. Order is not
820 preserved.
821 """
822 if not destination.isdir():
823 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
825 # Using getURIs is not feasible since it becomes difficult to
826 # determine the path within the datastore later on. For now
827 # follow getURIs implementation approach.
829 pending = set(refs)
831 # There is a question as to whether an exception should be raised
832 # early if some of the refs are missing, or whether files should be
833 # transferred until a problem is hit. Prefer to complain up front.
834 # Use the datastore's index in the chain as the grouping key.
835 grouped_by_datastore: dict[int, set[DatasetRef]] = {}
837 for number, datastore in enumerate(self.datastores):
838 if datastore.isEphemeral:
839 # In the future we will want to distinguish in-memory from
840 # caching datastore since using an on-disk local
841 # cache is exactly what we should be doing.
842 continue
843 try:
844 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
845 except NotImplementedError:
846 # Some datastores may not support retrieving artifacts
847 continue
849 if datastore_refs:
850 grouped_by_datastore[number] = datastore_refs
852 # Remove these from the pending list so that we do not bother
853 # looking for them any more.
854 pending = pending - datastore_refs
856 if pending:  # coverage: 856 ↛ 857, condition was never true
857 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
859 # Now do the transfer.
860 targets: list[ResourcePath] = []
861 for number, datastore_refs in grouped_by_datastore.items():
862 targets.extend(
863 self.datastores[number].retrieveArtifacts(
864 datastore_refs,
865 destination,
866 transfer=transfer,
867 preserve_path=preserve_path,
868 overwrite=overwrite,
869 )
870 )
872 return targets
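    # Minimal usage sketch (``datastore`` and ``refs`` are assumed; the
    # destination is hypothetical). Artifacts are pulled only from
    # non-ephemeral children, each ref coming from the first child that has
    # it:
    #
    #     destination = ResourcePath("/hypothetical/export/dir/", forceDirectory=True)
    #     copied = datastore.retrieveArtifacts(refs, destination, transfer="copy")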
874 def remove(self, ref: DatasetRef) -> None:
875 """Indicate to the datastore that a dataset can be removed.
877 The dataset will be removed from each datastore. The dataset is
878 not required to exist in every child datastore.
880 Parameters
881 ----------
882 ref : `DatasetRef`
883 Reference to the required dataset.
885 Raises
886 ------
887 FileNotFoundError
888 Attempt to remove a dataset that does not exist. Raised if none
889 of the child datastores removed the dataset.
890 """
891 log.debug("Removing %s", ref)
892 self.trash(ref, ignore_errors=False)
893 self.emptyTrash(ignore_errors=False)
895 def forget(self, refs: Iterable[DatasetRef]) -> None:
896 for datastore in tuple(self.datastores):
897 datastore.forget(refs)
899 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
900 if isinstance(ref, DatasetRef):
901 ref_label = str(ref)
902 else:
903 ref_label = "bulk datasets"
905 log.debug("Trashing %s", ref_label)
907 counter = 0
908 for datastore in self.datastores:
909 try:
910 datastore.trash(ref, ignore_errors=ignore_errors)
911 counter += 1
912 except FileNotFoundError:
913 pass
915 if counter == 0:
916 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
917 if ignore_errors:  # coverage: 917 ↛ 918, condition was never true
918 log.warning(err_msg)
919 else:
920 raise FileNotFoundError(err_msg)
922 def emptyTrash(self, ignore_errors: bool = True) -> None:
923 for datastore in self.datastores:
924 datastore.emptyTrash(ignore_errors=ignore_errors)
926 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
927 """Retrieve a dataset from an input `Datastore`,
928 and store the result in this `Datastore`.
930 Parameters
931 ----------
932 inputDatastore : `Datastore`
933 The external `Datastore` from which to retrieve the Dataset.
934 ref : `DatasetRef`
935 Reference to the required dataset in the input data store.
942 """
943 assert inputDatastore is not self # unless we want it for renames?
944 inMemoryDataset = inputDatastore.get(ref)
945 self.put(inMemoryDataset, ref)
947 def validateConfiguration(
948 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
949 ) -> None:
950 """Validate some of the configuration for this datastore.
952 Parameters
953 ----------
954 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
955 Entities to test against this configuration. Can be differing
956 types.
957 logFailures : `bool`, optional
958 If `True`, output a log message for every validation error
959 detected.
961 Raises
962 ------
963 DatastoreValidationError
964 Raised if there is a validation problem with a configuration.
965 All the problems are reported in a single exception.
967 Notes
968 -----
969 This method checks each datastore in turn.
970 """
971 # Need to catch each of the datastore outputs and ensure that
972 # all are tested.
973 failures = []
974 for datastore in self.datastores:
975 try:
976 datastore.validateConfiguration(entities, logFailures=logFailures)
977 except DatastoreValidationError as e:
978 if logFailures:  # coverage: 978 ↛ 980, condition was never false
979 log.critical("Datastore %s failed validation", datastore.name)
980 failures.append(f"Datastore {self.name}: {e}")
982 if failures:
983 msg = ";\n".join(failures)
984 raise DatastoreValidationError(msg)
986 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
987 # Docstring is inherited from base class
988 failures = []
989 for datastore in self.datastores:
990 try:
991 datastore.validateKey(lookupKey, entity)
992 except DatastoreValidationError as e:
993 failures.append(f"Datastore {self.name}: {e}")
995 if failures:
996 msg = ";\n".join(failures)
997 raise DatastoreValidationError(msg)
999 def getLookupKeys(self) -> set[LookupKey]:
1000 # Docstring is inherited from base class
1001 keys = set()
1002 for datastore in self.datastores:
1003 keys.update(datastore.getLookupKeys())
1005 keys.update(self.constraints.getLookupKeys())
1006 for p in self.datastoreConstraints:
1007 if p is not None:  # coverage: 1007 ↛ 1006, condition was never false
1008 keys.update(p.getLookupKeys())
1010 return keys
1012 def needs_expanded_data_ids(
1013 self,
1014 transfer: str | None,
1015 entity: DatasetRef | DatasetType | StorageClass | None = None,
1016 ) -> bool:
1017 # Docstring inherited.
1018 # We can't safely use `self.datastoreConstraints` with `entity` to
1019 # check whether a child datastore would even want to ingest this
1020 # dataset, because we don't want to filter out datastores that might
1021 # need an expanded data ID based in incomplete information (e.g. we
1022 need an expanded data ID based on incomplete information (e.g. we
1023 # So we pessimistically check if any datastore would need an expanded
1024 # data ID for this transfer mode.
1025 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores)
1027 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
1028 # Docstring inherited from the base class.
1030 for datastore in self.datastores:
1031 datastore.import_records(data)
1033 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
1034 # Docstring inherited from the base class.
1036 all_records: dict[str, DatastoreRecordData] = {}
1038 # Merge all sub-datastore records into one structure
1039 for datastore in self.datastores:
1040 sub_records = datastore.export_records(refs)
1041 for name, record_data in sub_records.items():
1042 # All datastore names must be unique in a chain.
1043 if name in all_records:  # coverage: 1043 ↛ 1044, condition was never true
1044 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
1045 all_records[name] = record_data
1047 return all_records
1049 def export(
1050 self,
1051 refs: Iterable[DatasetRef],
1052 *,
1053 directory: ResourcePathExpression | None = None,
1054 transfer: str | None = "auto",
1055 ) -> Iterable[FileDataset]:
1056 # Docstring inherited from Datastore.export.
1057 if transfer == "auto" and directory is None:
1058 transfer = None
1060 if transfer is not None and directory is None:
1061 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
1063 if transfer == "move":
1064 raise TypeError("Can not export by moving files out of datastore.")
1066 # Exporting from a chain has the potential for a dataset to be
1067 # in one or more of the datastores in the chain. We only need one
1068 # of them since we assume the datasets are the same in all (but
1069 # the file format could be different of course since that is a
1070 # per-datastore configuration).
1071 # We also do not know whether any of the datastores in the chain
1072 # support file export.
1074 # Ensure we have an ordered sequence that is not an iterator or set.
1075 if not isinstance(refs, Sequence):
1076 refs = list(refs)
1078 # If any of the datasets are missing entirely we need to raise early
1079 # before we try to run the export. This can be a little messy but is
1080 # better than exporting files from the first datastore and only then
1081 # finding that a dataset missing from it is not in the second datastore either.
1082 known = [datastore.knows_these(refs) for datastore in self.datastores]
1083 refs_known: set[DatasetRef] = set()
1084 for known_to_this in known:
1085 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
1086 missing_count = len(refs) - len(refs_known)
1087 if missing_count:
1088 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")
1090 # To allow us to slot each result into the right place after
1091 # asking each datastore, create a dict with the index.
1092 ref_positions = {ref: i for i, ref in enumerate(refs)}
1094 # Presize the final export list.
1095 exported: list[FileDataset | None] = [None] * len(refs)
1097 # The order of the returned dataset has to match the order of the
1098 # given refs, even if they are all from different datastores.
1099 for i, datastore in enumerate(self.datastores):
1100 known_to_this = known[i]
1101 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]
1103 try:
1104 this_export = datastore.export(filtered, directory=directory, transfer=transfer)
1105 except NotImplementedError:
1106 # Try the next datastore.
1107 continue
1109 for ref, export in zip(filtered, this_export, strict=True):
1110 # Get the position and also delete it from the list.
1111 exported[ref_positions.pop(ref)] = export
1113 # Every dataset should be accounted for because of the earlier checks
1114 # but make sure that we did fill all the slots to appease mypy.
1115 for i, dataset in enumerate(exported):
1116 if dataset is None:  # coverage: 1116 ↛ 1117, condition was never true
1117 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
1118 yield dataset
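    # Minimal usage sketch (``datastore`` and ``refs`` are assumed; the export
    # directory is hypothetical). The yielded `FileDataset` entries preserve
    # the order of the input refs even when they come from different children:
    #
    #     exported = list(datastore.export(refs, directory="/hypothetical/export", transfer="copy"))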
1120 def transfer_from(
1121 self,
1122 source_datastore: Datastore,
1123 refs: Collection[DatasetRef],
1124 transfer: str = "auto",
1125 artifact_existence: dict[ResourcePath, bool] | None = None,
1126 dry_run: bool = False,
1127 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1128 # Docstring inherited
1129 # mypy does not understand "type(self) is not type(source)"
1130 if isinstance(source_datastore, ChainedDatastore):
1131 # Both the source and destination are chained datastores.
1132 source_datastores = tuple(source_datastore.datastores)
1133 else:
1134 # The source datastore is different, forward everything to the
1135 # child datastores.
1136 source_datastores = (source_datastore,)
1138 if not refs:  # coverage: 1138 ↛ 1140, condition was never true
1139 # Nothing to transfer.
1140 return set(), set()
1142 # Need to know the set of all possible refs that could be transferred.
1143 remaining_refs = set(refs)
1145 missing_from_source: set[DatasetRef] | None = None
1146 all_accepted = set()
1147 nsuccess = 0
1148 for source_child in source_datastores:
1149 # If we are reading from a chained datastore, it's possible that
1150 # only a subset of the datastores know about the dataset. We can't
1151 # ask the receiving datastore to copy it when it doesn't exist
1152 # so we have to filter again based on what the source datastore
1153 # understands.
1154 known_to_source = source_child.knows_these(list(refs))
1156 # Need to know that there is a possibility that some of these
1157 # datasets exist but are unknown to the source datastore if
1158 # trust is enabled.
1159 if getattr(source_child, "trustGetRequest", False):
1160 unknown = [ref for ref, known in known_to_source.items() if not known]
1161 existence = source_child.mexists(unknown, artifact_existence)
1162 for ref, exists in existence.items():
1163 known_to_source[ref] = exists
1165 missing = {ref for ref, known in known_to_source.items() if not known}
1166 if missing:
1167 if missing_from_source is None:
1168 missing_from_source = missing
1169 else:
1170 missing_from_source &= missing
1172 # Try to transfer from each source datastore to each child
1173 # datastore. Have to make sure we don't transfer something
1174 # we've already transferred to this destination on later passes.
1176 # Filter the initial list based on the datasets we have
1177 # not yet transferred.
1178 these_refs = []
1179 for ref in refs:
1180 if ref in remaining_refs and known_to_source[ref]:
1181 these_refs.append(ref)
1183 if not these_refs:
1184 # Already transferred all datasets known to this datastore.
1185 continue
1187 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
1188 if constraints is not None:  # coverage: 1188 ↛ 1196, condition was never false
1189 filtered_refs = []
1190 for ref in these_refs:
1191 if constraints.isAcceptable(ref):
1192 filtered_refs.append(ref)
1193 else:
1194 log.debug("Rejecting ref by constraints: %s", ref)
1195 else:
1196 filtered_refs = list(these_refs)
1197 try:
1198 accepted, _ = datastore.transfer_from(
1199 source_child,
1200 filtered_refs,
1201 transfer,
1202 artifact_existence,
1203 dry_run=dry_run,
1204 )
1205 except (TypeError, NotImplementedError):
1206 # The datastores were incompatible.
1207 continue
1208 else:
1209 nsuccess += 1
1211 # Remove the accepted datasets from those remaining.
1212 remaining_refs = remaining_refs - accepted
1214 # Keep track of everything we have accepted.
1215 all_accepted.update(accepted)
1217 if missing_from_source:
1218 for ref in missing_from_source:
1219 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref)
1221 if nsuccess == 0:  # coverage: 1221 ↛ 1222, condition was never true
1222 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}")
1224 return all_accepted, remaining_refs
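    # Minimal usage sketch (``source`` and ``refs`` are assumed). Each child
    # that can accept a direct transfer receives the refs its constraints
    # allow; anything not accepted by any child is returned to the caller:
    #
    #     accepted, remaining = datastore.transfer_from(source, refs, transfer="auto")
    #     if remaining:
    #         ...  # these refs were not accepted by any child datastore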
1226 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
1227 # Docstring inherited from the base class.
1228 tables: dict[str, DatastoreOpaqueTable] = {}
1229 for datastore in self.datastores:
1230 tables.update(datastore.get_opaque_table_definitions())
1231 return tables