Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 86%
474 statements
coverage.py v7.4.1, created at 2024-02-13 10:56 +0000
1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This software is dual licensed under the GNU General Public License and also
10 # under a 3-clause BSD license. Recipients may choose which of these licenses
11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12 # respectively. If you choose the GPL option then the following text applies
13 # (but note that there is still no warranty even if you opt for BSD instead):
14 #
15 # This program is free software: you can redistribute it and/or modify
16 # it under the terms of the GNU General Public License as published by
17 # the Free Software Foundation, either version 3 of the License, or
18 # (at your option) any later version.
19 #
20 # This program is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 # GNU General Public License for more details.
24 #
25 # You should have received a copy of the GNU General Public License
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 """Chained datastore."""
30 from __future__ import annotations
32 __all__ = ("ChainedDatastore",)
34 import itertools
35 import logging
36 import time
37 import warnings
38 from collections.abc import Collection, Iterable, Mapping, Sequence
39 from typing import TYPE_CHECKING, Any
41 from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset
42 from lsst.daf.butler.datastore import (
43 DatasetRefURIs,
44 Datastore,
45 DatastoreConfig,
46 DatastoreOpaqueTable,
47 DatastoreValidationError,
48 )
49 from lsst.daf.butler.datastore.constraints import Constraints
50 from lsst.daf.butler.datastore.record_data import DatastoreRecordData
51 from lsst.resources import ResourcePath
52 from lsst.utils import doImportType
54 if TYPE_CHECKING:
55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
57 from lsst.resources import ResourcePathExpression
59 log = logging.getLogger(__name__)
62 class _IngestPrepData(Datastore.IngestPrepData):
63 """Helper class for ChainedDatastore ingest implementation.
65 Parameters
66 ----------
67 children : `list` of `tuple`
68 Triples of `Datastore`, `IngestPrepData` and the set of source paths for all child datastores.
69 """
71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
73 self.children = children
76 class ChainedDatastore(Datastore):
77 """Chained Datastores to allow read and writes from multiple datastores.
79 A ChainedDatastore is configured with multiple datastore configurations.
80 A ``put()`` is always sent to each datastore. A ``get()``
81 operation is sent to each datastore in turn and the first datastore
82 to return a valid dataset is used.
84 Parameters
85 ----------
86 config : `DatastoreConfig` or `str`
87 Configuration. This configuration must include a ``datastores`` field
88 as a sequence of datastore configurations. The order in this sequence
89 indicates the order to use for read operations.
90 bridgeManager : `DatastoreRegistryBridgeManager`
91 Object that manages the interface between `Registry` and datastores.
92 datastores : `list` [`Datastore`]
93 All the child datastores known to this datastore.
95 Notes
96 -----
97 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
98 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
99 and `"hardlink"` if and only if all its child datastores do.
100 """
102 defaultConfigFile = "datastores/chainedDatastore.yaml"
103 """Path to configuration defaults. Accessed within the ``configs`` resource
104 or relative to a search path. Can be None if no defaults specified.
105 """
107 containerKey = "datastores"
108 """Key to specify where child datastores are configured."""
110 datastores: list[Datastore]
111 """All the child datastores known to this datastore."""
113 datastoreConstraints: Sequence[Constraints | None]
114 """Constraints to be applied to each of the child datastores."""
116 @classmethod
117 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
118 """Set any filesystem-dependent config options for child Datastores to
119 be appropriate for a new empty repository with the given root.
121 Parameters
122 ----------
123 root : `str`
124 Filesystem path to the root of the data repository.
125 config : `Config`
126 A `Config` to update. Only the subset understood by
127 this component will be updated. Will not expand
128 defaults.
129 full : `Config`
130 A complete config with all defaults expanded that can be
131 converted to a `DatastoreConfig`. Read-only and will not be
132 modified by this method.
133 Repository-specific options that should not be obtained
134 from defaults when Butler instances are constructed
135 should be copied from ``full`` to ``config``.
136 overwrite : `bool`, optional
137 If `False`, do not modify a value in ``config`` if the value
138 already exists. Default is always to overwrite with the provided
139 ``root``.
141 Notes
142 -----
143 If a keyword is explicitly defined in the supplied ``config`` it
144 will not be overridden by this method if ``overwrite`` is `False`.
145 This allows explicit values set in external configs to be retained.
146 """
147 # Extract the part of the config we care about updating
148 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
150 # And the subset of the full config that we can use for reference.
151 # Do not bother with defaults because we are told this already has
152 # them.
153 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
155 # Loop over each datastore config and pass the subsets to the
156 # child datastores to process.
158 containerKey = cls.containerKey
159 for idx, (child, fullChild) in enumerate(
160 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True)
161 ):
162 childConfig = DatastoreConfig(child, mergeDefaults=False)
163 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
164 datastoreClass = doImportType(fullChildConfig["cls"])
165 if not issubclass(datastoreClass, Datastore): 165 ↛ 166 (condition on line 165 was never true)
166 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
167 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}"
168 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
170 # Reattach to parent
171 datastoreConfig[containerKey, idx] = childConfig
173 # Reattach modified datastore config to parent
174 # If this has a datastore key we attach there, otherwise we assume
175 # this information goes at the top of the config hierarchy.
176 if DatastoreConfig.component in config:
177 config[DatastoreConfig.component] = datastoreConfig
178 else:
179 config.update(datastoreConfig)
181 return
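# Illustrative sketch: for a new repository rooted at the hypothetical path
# "/repo" with two file-based children, each child is handed a root derived
# from the f"{root}/{datastoreClass.__qualname__}_{idx}" pattern used above:
#
#     /repo/FileDatastore_0
#     /repo/FileDatastore_1
#
# (the paths are hypothetical; only the naming pattern comes from the code).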
183 def __init__(
184 self,
185 config: DatastoreConfig,
186 bridgeManager: DatastoreRegistryBridgeManager,
187 datastores: list[Datastore],
188 ):
189 super().__init__(config, bridgeManager)
191 self.datastores = list(datastores)
193 # Name ourself based on our children
194 if self.datastores: 194 ↛ 199 (condition on line 194 was never false)
195 # We must set the names explicitly
196 self._names = [d.name for d in self.datastores]
197 childNames = ",".join(self.names)
198 else:
199 childNames = f"(empty@{time.time()})"
200 self._names = [childNames]
201 self.name = f"{type(self).__qualname__}[{childNames}]"
203 # We declare we are ephemeral if all our child datastores declare
204 # they are ephemeral
205 self.isEphemeral = all(d.isEphemeral for d in self.datastores)
207 # per-datastore override constraints
208 if "datastore_constraints" in self.config:
209 overrides = self.config["datastore_constraints"]
211 if len(overrides) != len(self.datastores): 211 ↛ 212 (condition on line 211 was never true)
212 raise DatastoreValidationError(
213 f"Number of registered datastores ({len(self.datastores)})"
214 " differs from number of constraints overrides"
215 f" {len(overrides)}"
216 )
218 self.datastoreConstraints = [
219 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
220 ]
222 else:
223 self.datastoreConstraints = (None,) * len(self.datastores)
225 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
227 @classmethod
228 def _create_from_config(
229 cls,
230 config: DatastoreConfig,
231 bridgeManager: DatastoreRegistryBridgeManager,
232 butlerRoot: ResourcePathExpression | None,
233 ) -> ChainedDatastore:
234 # Scan for child datastores and instantiate them with the same registry
235 datastores = []
236 for c in config["datastores"]:
237 c = DatastoreConfig(c)
238 datastoreType = doImportType(c["cls"])
239 if not issubclass(datastoreType, Datastore): 239 ↛ 240 (condition on line 239 was never true)
240 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
241 datastore = datastoreType._create_from_config(c, bridgeManager, butlerRoot=butlerRoot)
242 log.debug("Creating child datastore %s", datastore.name)
243 datastores.append(datastore)
245 return ChainedDatastore(config, bridgeManager, datastores)
247 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore:
248 datastores = [ds.clone(bridgeManager) for ds in self.datastores]
249 return ChainedDatastore(self.config, bridgeManager, datastores)
251 @property
252 def names(self) -> tuple[str, ...]:
253 return tuple(self._names)
255 @property
256 def roots(self) -> dict[str, ResourcePath | None]:
257 # Docstring inherited.
258 roots = {}
259 for datastore in self.datastores:
260 roots.update(datastore.roots)
261 return roots
263 def __str__(self) -> str:
264 chainName = ", ".join(str(ds) for ds in self.datastores)
265 return chainName
267 def _set_trust_mode(self, mode: bool) -> None:
268 for datastore in self.datastores:
269 datastore._set_trust_mode(mode)
271 def knows(self, ref: DatasetRef) -> bool:
272 """Check if the dataset is known to any of the datastores.
274 Does not check for existence of any artifact.
276 Parameters
277 ----------
278 ref : `DatasetRef`
279 Reference to the required dataset.
281 Returns
282 -------
283 exists : `bool`
284 `True` if the dataset is known to the datastore.
285 """
286 for datastore in self.datastores:
287 if datastore.knows(ref):
288 log.debug("%s known to datastore %s", ref, datastore.name)
289 return True
290 return False
292 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
293 # Docstring inherited from the base class.
294 refs_known: dict[DatasetRef, bool] = {}
295 for datastore in self.datastores:
296 refs_known.update(datastore.knows_these(refs))
298 # No need to check in next datastore for refs that are known.
299 # We only update entries that were initially False.
300 refs = [ref for ref, known in refs_known.items() if not known]
302 return refs_known
304 def mexists(
305 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
306 ) -> dict[DatasetRef, bool]:
307 """Check the existence of multiple datasets at once.
309 Parameters
310 ----------
311 refs : iterable of `DatasetRef`
312 The datasets to be checked.
313 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
314 Optional mapping of datastore artifact to existence. Updated by
315 this method with details of all artifacts tested. Can be `None`
316 if the caller is not interested.
318 Returns
319 -------
320 existence : `dict` of [`DatasetRef`, `bool`]
321 Mapping from dataset to boolean indicating existence in any
322 of the child datastores.
323 """
324 dataset_existence: dict[DatasetRef, bool] = {}
325 for datastore in self.datastores:
326 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
328 # For next datastore no point asking about ones we know
329 # exist already. No special exemption for ephemeral datastores.
330 refs = [ref for ref, exists in dataset_existence.items() if not exists]
332 return dataset_existence
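# Illustrative sketch (``chain`` and ``refs`` are assumed names): a caller can
# pass one artifact-existence cache to several bulk checks so that an artifact
# already probed for one child datastore is not probed again later.
#
#     artifact_cache: dict[ResourcePath, bool] = {}
#     existence = chain.mexists(refs, artifact_existence=artifact_cache)
#     missing = [ref for ref, found in existence.items() if not found]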
334 def exists(self, ref: DatasetRef) -> bool:
335 """Check if the dataset exists in one of the datastores.
337 Parameters
338 ----------
339 ref : `DatasetRef`
340 Reference to the required dataset.
342 Returns
343 -------
344 exists : `bool`
345 `True` if the entity exists in one of the child datastores.
346 """
347 for datastore in self.datastores:
348 if datastore.exists(ref):
349 log.debug("Found %s in datastore %s", ref, datastore.name)
350 return True
351 return False
353 def get(
354 self,
355 ref: DatasetRef,
356 parameters: Mapping[str, Any] | None = None,
357 storageClass: StorageClass | str | None = None,
358 ) -> Any:
359 """Load an InMemoryDataset from the store.
361 The dataset is returned from the first datastore that has
362 the dataset.
364 Parameters
365 ----------
366 ref : `DatasetRef`
367 Reference to the required Dataset.
368 parameters : `dict`
369 `StorageClass`-specific parameters that specify, for example,
370 a slice of the dataset to be loaded.
371 storageClass : `StorageClass` or `str`, optional
372 The storage class to be used to override the Python type
373 returned by this method. By default the returned type matches
374 the dataset type definition for this dataset. Specifying a
375 read `StorageClass` can force a different type to be returned.
376 This type must be compatible with the original type.
378 Returns
379 -------
380 inMemoryDataset : `object`
381 Requested dataset or slice thereof as an InMemoryDataset.
383 Raises
384 ------
385 FileNotFoundError
386 Requested dataset can not be retrieved.
387 TypeError
388 Return value from formatter has unexpected type.
389 ValueError
390 Formatter failed to process the dataset.
391 """
392 for datastore in self.datastores:
393 try:
394 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
395 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
396 return inMemoryObject
397 except FileNotFoundError:
398 pass
400 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores")
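# Illustrative sketch (``chain``, ``ref`` and ``alternative_storage_class`` are
# assumed names): the dataset comes from the first child that can serve it,
# and an optional read storage class can coerce the returned Python type.
#
#     obj = chain.get(ref)
#     coerced = chain.get(ref, storageClass=alternative_storage_class)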
402 def prepare_get_for_external_client(self, ref: DatasetRef) -> object:
403 return self._get_matching_datastore(ref).prepare_get_for_external_client(ref)
405 def _get_matching_datastore(self, ref: DatasetRef) -> Datastore:
406 """Return the first child datastore that owns the specified dataset."""
407 for datastore in self.datastores:
408 if datastore.knows(ref): 408 ↛ 409 (condition on line 408 was never true)
409 return datastore
411 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores")
413 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
414 """Write a InMemoryDataset with a given `DatasetRef` to each
415 datastore.
417 The put() to child datastores can fail with
418 `DatasetTypeNotSupportedError`. The put() for this datastore will be
419 deemed to have succeeded so long as at least one child datastore
420 accepted the inMemoryDataset.
422 Parameters
423 ----------
424 inMemoryDataset : `object`
425 The dataset to store.
426 ref : `DatasetRef`
427 Reference to the associated Dataset.
429 Raises
430 ------
431 TypeError
432 Supplied object and storage class are inconsistent.
433 DatasetTypeNotSupportedError
434 All datastores reported `DatasetTypeNotSupportedError`.
435 """
436 log.debug("Put %s", ref)
438 # Confirm that we can accept this dataset
439 if not self.constraints.isAcceptable(ref):
440 # Raise rather than use boolean return value.
441 raise DatasetTypeNotSupportedError(
442 f"Dataset {ref} has been rejected by this datastore via configuration."
443 )
445 isPermanent = False
446 nsuccess = 0
447 npermanent = 0
448 nephemeral = 0
449 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
450 if (
451 constraints is not None and not constraints.isAcceptable(ref)
452 ) or not datastore.constraints.isAcceptable(ref):
453 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
454 continue
456 if datastore.isEphemeral:
457 nephemeral += 1
458 else:
459 npermanent += 1
460 try:
461 datastore.put(inMemoryDataset, ref)
462 nsuccess += 1
463 if not datastore.isEphemeral:
464 isPermanent = True
465 except DatasetTypeNotSupportedError:
466 pass
468 if nsuccess == 0:
469 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
471 if not isPermanent and npermanent > 0: 471 ↛ 472 (condition on line 471 was never true)
472 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
474 if self._transaction is not None:
475 self._transaction.registerUndo("put", self.remove, ref)
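# Illustrative sketch (``chain``, ``obj`` and ``ref`` are assumed names): the
# put only raises if no child datastore accepted the dataset; otherwise it
# succeeds even when some children skipped it because of their constraints.
#
#     try:
#         chain.put(obj, ref)
#     except DatasetTypeNotSupportedError:
#         ...  # rejected by the chain's configuration or by every child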
477 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
478 # Docstring inherited from base class.
479 log.debug("Put %s", ref)
481 # Confirm that we can accept this dataset
482 if not self.constraints.isAcceptable(ref):
483 # Raise rather than use boolean return value.
484 raise DatasetTypeNotSupportedError(
485 f"Dataset {ref} has been rejected by this datastore via configuration."
486 )
488 isPermanent = False
489 nsuccess = 0
490 npermanent = 0
491 nephemeral = 0
492 stored_refs: dict[str, DatasetRef] = {}
493 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
494 if (
495 constraints is not None and not constraints.isAcceptable(ref)
496 ) or not datastore.constraints.isAcceptable(ref):
497 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
498 continue
500 if datastore.isEphemeral:
501 nephemeral += 1
502 else:
503 npermanent += 1
504 try:
505 stored_ref_map = datastore.put_new(in_memory_dataset, ref)
506 stored_refs.update(stored_ref_map)
507 nsuccess += 1
508 if not datastore.isEphemeral:
509 isPermanent = True
510 except DatasetTypeNotSupportedError:
511 pass
513 if nsuccess == 0:
514 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
516 if not isPermanent and npermanent > 0:
517 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
519 if self._transaction is not None:
520 self._transaction.registerUndo("put", self.remove, ref)
522 return stored_refs
524 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
525 # Docstring inherited from base class.
526 if transfer != "auto":
527 return transfer
528 # Ask each datastore what they think auto means
529 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
531 # Remove any untranslated "auto" values
532 transfers.discard(transfer)
534 if len(transfers) == 1: 534 ↛ 535 (condition on line 534 was never true)
535 return transfers.pop()
536 if not transfers: 536 ↛ 540 (condition on line 536 was never false)
537 # Everything reported "auto"
538 return transfer
540 raise RuntimeError(
541 "Chained datastore does not yet support different transfer modes"
542 f" from 'auto' in each child datastore (wanted {transfers})"
543 )
545 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
546 # Docstring inherited from Datastore._prepIngest.
547 if transfer is None:
548 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
550 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
551 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
552 if not acceptable:
553 log.debug(
554 "Datastore %s skipping ingest via configuration for refs %s",
555 name,
556 ", ".join(str(ref) for ref in dataset.refs),
557 )
558 return False
559 else:
560 return True
562 # Filter down to just datasets the chained datastore's own
563 # configuration accepts.
564 okForParent: list[FileDataset] = [
565 dataset
566 for dataset in datasets
567 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
568 ]
570 # Iterate over nested datastores and call _prepIngest on each.
571 # Save the results to a list:
572 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
573 # ...and remember whether all of the failures are due to
574 # NotImplementedError being raised.
575 allFailuresAreNotImplementedError = True
576 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
577 okForChild: list[FileDataset]
578 if constraints is not None:
579 okForChild = [
580 dataset
581 for dataset in okForParent
582 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
583 ]
584 else:
585 okForChild = okForParent
586 try:
587 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
588 except NotImplementedError:
589 log.debug(
590 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
591 datastore.name,
592 transfer,
593 )
594 continue
595 allFailuresAreNotImplementedError = False
596 if okForChild:
597 # Do not store for later if a datastore has rejected
598 # everything.
599 # Include the source paths if this is a "move". It's clearer
600 # to find the paths now rather than try to infer how
601 # each datastore has stored them in the internal prep class.
602 paths = (
603 {ResourcePath(dataset.path, forceDirectory=False) for dataset in okForChild}
604 if transfer == "move"
605 else set()
606 )
607 children.append((datastore, prepDataForChild, paths))
608 if allFailuresAreNotImplementedError:
609 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
610 return _IngestPrepData(children=children)
612 def _finishIngest(
613 self,
614 prepData: _IngestPrepData,
615 *,
616 transfer: str | None = None,
617 record_validation_info: bool = True,
618 ) -> None:
619 # Docstring inherited from Datastore._finishIngest.
620 # For "move" we must use "copy" and then delete the input
621 # data at the end. This has no rollback option if the ingest
622 # subsequently fails. If there is only one active datastore
623 # accepting any files we can leave it as "move"
624 actual_transfer: str | None
625 if transfer == "move" and len(prepData.children) > 1:
626 actual_transfer = "copy"
627 else:
628 actual_transfer = transfer
629 to_be_deleted: set[ResourcePath] = set()
630 for datastore, prepDataForChild, paths in prepData.children:
631 datastore._finishIngest(
632 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
633 )
634 to_be_deleted.update(paths)
635 if actual_transfer != transfer:
636 # These datasets were copied but now need to be deleted.
637 # This can not be rolled back.
638 for uri in to_be_deleted:
639 uri.remove()
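# Illustrative sketch (``chain`` and ``file_datasets`` are assumed names, and
# the public ``Datastore.ingest`` entry point is assumed to drive the two
# methods above): when more than one child accepts an ingest, a requested
# "move" is executed as per-child "copy" transfers followed by removal of the
# source files, so it cannot be rolled back afterwards.
#
#     chain.ingest(*file_datasets, transfer="copy")   # sources left in place
#     chain.ingest(*file_datasets, transfer="move")   # copy then delete sources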
641 def getManyURIs(
642 self,
643 refs: Iterable[DatasetRef],
644 predict: bool = False,
645 allow_missing: bool = False,
646 ) -> dict[DatasetRef, DatasetRefURIs]:
647 # Docstring inherited
649 uris: dict[DatasetRef, DatasetRefURIs] = {}
650 missing_refs = set(refs)
652 # If predict is True we don't want to predict a dataset in the first
653 # datastore if it actually exists in a later datastore, so in that
654 # case check all datastores with predict=False first, and then try
655 # again with predict=True.
656 for p in (False, True) if predict else (False,):
657 if not missing_refs:
658 break
659 for datastore in self.datastores:
660 try:
661 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
662 except NotImplementedError:
663 # some datastores may not implement generating URIs
664 continue
665 missing_refs -= got_uris.keys()
666 uris.update(got_uris)
667 if not missing_refs:
668 break
670 if missing_refs and not allow_missing:
671 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
673 return uris
675 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
676 """Return URIs associated with dataset.
678 Parameters
679 ----------
680 ref : `DatasetRef`
681 Reference to the required dataset.
682 predict : `bool`, optional
683 If the datastore does not know about the dataset, controls whether
684 it should return a predicted URI or not.
686 Returns
687 -------
688 uris : `DatasetRefURIs`
689 The URI to the primary artifact associated with this dataset (if
690 the dataset was disassembled within the datastore this may be
691 `None`), and the URIs to any components associated with the dataset
692 artifact (can be empty if there are no components).
694 Notes
695 -----
696 The returned URI comes from the first datastore in the list that has
697 the dataset, with preference given to permanent datastores over
698 ephemeral ones. If no datastore has the dataset and prediction
699 is allowed, the predicted URI for the first datastore in the list will
700 be returned.
701 """
702 log.debug("Requesting URIs for %s", ref)
703 predictedUri: DatasetRefURIs | None = None
704 predictedEphemeralUri: DatasetRefURIs | None = None
705 firstEphemeralUri: DatasetRefURIs | None = None
706 for datastore in self.datastores:
707 if datastore.exists(ref):
708 if not datastore.isEphemeral:
709 uri = datastore.getURIs(ref)
710 log.debug("Retrieved non-ephemeral URI: %s", uri)
711 return uri
712 elif not firstEphemeralUri:
713 firstEphemeralUri = datastore.getURIs(ref)
714 elif predict:
715 if not predictedUri and not datastore.isEphemeral:
716 predictedUri = datastore.getURIs(ref, predict)
717 elif not predictedEphemeralUri and datastore.isEphemeral:
718 predictedEphemeralUri = datastore.getURIs(ref, predict)
720 if firstEphemeralUri:
721 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
722 return firstEphemeralUri
724 if predictedUri:
725 log.debug("Retrieved predicted URI: %s", predictedUri)
726 return predictedUri
728 if predictedEphemeralUri:
729 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
730 return predictedEphemeralUri
732 raise FileNotFoundError(f"Dataset {ref} not in any datastore")
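# Illustrative sketch (``chain`` and ``ref`` are assumed names): the lookup
# order implemented above is existing-permanent, existing-ephemeral,
# predicted-permanent, then predicted-ephemeral; predicted URIs carry a
# "#predicted" fragment (see ``getURI`` below).
#
#     primary, components = chain.getURIs(ref, predict=True)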
734 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
735 """URI to the Dataset.
737 The returned URI is from the first datastore in the list that has
738 the dataset, with preference given to permanent datastores over
739 ephemeral ones. If no datastore has the dataset and prediction
740 is allowed, the predicted URI for the first datastore in the list will
741 be returned.
743 Parameters
744 ----------
745 ref : `DatasetRef`
746 Reference to the required Dataset.
747 predict : `bool`
748 If `True`, allow URIs to be returned of datasets that have not
749 been written.
751 Returns
752 -------
753 uri : `lsst.resources.ResourcePath`
754 URI pointing to the dataset within the datastore. If the
755 dataset does not exist in the datastore, and if ``predict`` is
756 `True`, the URI will be a prediction and will include a URI
757 fragment "#predicted".
759 Notes
760 -----
761 If the datastore does not have entities that relate well
762 to the concept of a URI the returned URI string will be
763 descriptive. The returned URI is not guaranteed to be obtainable.
765 Raises
766 ------
767 FileNotFoundError
768 A URI has been requested for a dataset that does not exist and
769 guessing is not allowed.
770 RuntimeError
771 Raised if a request is made for a single URI but multiple URIs
772 are associated with this dataset.
773 """
774 log.debug("Requesting URI for %s", ref)
775 primary, components = self.getURIs(ref, predict)
776 if primary is None or components: 776 ↛ 777 (condition on line 776 was never true)
777 raise RuntimeError(
778 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
779 )
780 return primary
782 def retrieveArtifacts(
783 self,
784 refs: Iterable[DatasetRef],
785 destination: ResourcePath,
786 transfer: str = "auto",
787 preserve_path: bool = True,
788 overwrite: bool = False,
789 ) -> list[ResourcePath]:
790 """Retrieve the file artifacts associated with the supplied refs.
792 Parameters
793 ----------
794 refs : iterable of `DatasetRef`
795 The datasets for which file artifacts are to be retrieved.
796 A single ref can result in multiple files. The refs must
797 be resolved.
798 destination : `lsst.resources.ResourcePath`
799 Location to write the file artifacts.
800 transfer : `str`, optional
801 Method to use to transfer the artifacts. Must be one of the options
802 supported by `lsst.resources.ResourcePath.transfer_from()`.
803 "move" is not allowed.
804 preserve_path : `bool`, optional
805 If `True` the full path of the file artifact within the datastore
806 is preserved. If `False` the final file component of the path
807 is used.
808 overwrite : `bool`, optional
809 If `True` allow transfers to overwrite existing files at the
810 destination.
812 Returns
813 -------
814 targets : `list` of `lsst.resources.ResourcePath`
815 URIs of file artifacts in destination location. Order is not
816 preserved.
817 """
818 if not destination.isdir(): 818 ↛ 819 (condition on line 818 was never true)
819 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
821 # Using getURIs is not feasible since it becomes difficult to
822 # determine the path within the datastore later on. For now
823 # follow getURIs implementation approach.
825 pending = set(refs)
827 # There is a question as to whether an exception should be raised
828 # early if some of the refs are missing, or whether files should be
829 # transferred until a problem is hit. Prefer to complain up front.
830 # Use the datastore integer as primary key.
831 grouped_by_datastore: dict[int, set[DatasetRef]] = {}
833 for number, datastore in enumerate(self.datastores):
834 if datastore.isEphemeral:
835 # In the future we will want to distinguish in-memory from
836 # caching datastore since using an on-disk local
837 # cache is exactly what we should be doing.
838 continue
839 try:
840 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
841 except NotImplementedError:
842 # Some datastores may not support retrieving artifacts
843 continue
845 if datastore_refs:
846 grouped_by_datastore[number] = datastore_refs
848 # Remove these from the pending list so that we do not bother
849 # looking for them any more.
850 pending = pending - datastore_refs
852 if pending: 852 ↛ 853 (condition on line 852 was never true)
853 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
855 # Now do the transfer.
856 targets: list[ResourcePath] = []
857 for number, datastore_refs in grouped_by_datastore.items():
858 targets.extend(
859 self.datastores[number].retrieveArtifacts(
860 datastore_refs,
861 destination,
862 transfer=transfer,
863 preserve_path=preserve_path,
864 overwrite=overwrite,
865 )
866 )
868 return targets
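# Illustrative sketch (``chain`` and ``refs`` are assumed names): copy the
# file artifacts backing the refs into a local directory, keeping each file's
# datastore-relative path.
#
#     destination = ResourcePath("artifact_export/", forceDirectory=True)
#     copied = chain.retrieveArtifacts(refs, destination, transfer="copy", preserve_path=True)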
870 def remove(self, ref: DatasetRef) -> None:
871 """Indicate to the datastore that a dataset can be removed.
873 The dataset will be removed from each datastore. The dataset is
874 not required to exist in every child datastore.
876 Parameters
877 ----------
878 ref : `DatasetRef`
879 Reference to the required dataset.
881 Raises
882 ------
883 FileNotFoundError
884 Attempt to remove a dataset that does not exist. Raised if none
885 of the child datastores removed the dataset.
886 """
887 log.debug("Removing %s", ref)
888 self.trash(ref, ignore_errors=False)
889 self.emptyTrash(ignore_errors=False)
891 def forget(self, refs: Iterable[DatasetRef]) -> None:
892 for datastore in tuple(self.datastores):
893 datastore.forget(refs)
895 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
896 if isinstance(ref, DatasetRef):
897 ref_label = str(ref)
898 else:
899 ref_label = "bulk datasets"
901 log.debug("Trashing %s", ref_label)
903 counter = 0
904 for datastore in self.datastores:
905 try:
906 datastore.trash(ref, ignore_errors=ignore_errors)
907 counter += 1
908 except FileNotFoundError:
909 pass
911 if counter == 0:
912 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
913 if ignore_errors: 913 ↛ 914 (condition on line 913 was never true)
914 log.warning(err_msg)
915 else:
916 raise FileNotFoundError(err_msg)
918 def emptyTrash(self, ignore_errors: bool = True) -> None:
919 for datastore in self.datastores:
920 datastore.emptyTrash(ignore_errors=ignore_errors)
922 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
923 """Retrieve a dataset from an input `Datastore`,
924 and store the result in this `Datastore`.
926 Parameters
927 ----------
928 inputDatastore : `Datastore`
929 The external `Datastore` from which to retrieve the Dataset.
930 ref : `DatasetRef`
931 Reference to the required dataset in the input data store.
933 Notes
934 -----
935 The retrieved dataset is written to every accepting child datastore
936 via ``put()``; this method returns `None`.
938 """
939 assert inputDatastore is not self # unless we want it for renames?
940 inMemoryDataset = inputDatastore.get(ref)
941 self.put(inMemoryDataset, ref)
943 def validateConfiguration(
944 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
945 ) -> None:
946 """Validate some of the configuration for this datastore.
948 Parameters
949 ----------
950 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
951 Entities to test against this configuration. Can be differing
952 types.
953 logFailures : `bool`, optional
954 If `True`, output a log message for every validation error
955 detected.
957 Raises
958 ------
959 DatastoreValidationError
960 Raised if there is a validation problem with a configuration.
961 All the problems are reported in a single exception.
963 Notes
964 -----
965 This method checks each datastore in turn.
966 """
967 # Need to catch each of the datastore outputs and ensure that
968 # all are tested.
969 failures = []
970 for datastore in self.datastores:
971 try:
972 datastore.validateConfiguration(entities, logFailures=logFailures)
973 except DatastoreValidationError as e:
974 if logFailures: 974 ↛ 976 (condition on line 974 was never false)
975 log.critical("Datastore %s failed validation", datastore.name)
976 failures.append(f"Datastore {self.name}: {e}")
978 if failures:
979 msg = ";\n".join(failures)
980 raise DatastoreValidationError(msg)
982 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
983 # Docstring is inherited from base class
984 failures = []
985 for datastore in self.datastores:
986 try:
987 datastore.validateKey(lookupKey, entity)
988 except DatastoreValidationError as e:
989 failures.append(f"Datastore {self.name}: {e}")
991 if failures:
992 msg = ";\n".join(failures)
993 raise DatastoreValidationError(msg)
995 def getLookupKeys(self) -> set[LookupKey]:
996 # Docstring is inherited from base class
997 keys = set()
998 for datastore in self.datastores:
999 keys.update(datastore.getLookupKeys())
1001 keys.update(self.constraints.getLookupKeys())
1002 for p in self.datastoreConstraints:
1003 if p is not None: 1003 ↛ 1002 (condition on line 1003 was never false)
1004 keys.update(p.getLookupKeys())
1006 return keys
1008 def needs_expanded_data_ids(
1009 self,
1010 transfer: str | None,
1011 entity: DatasetRef | DatasetType | StorageClass | None = None,
1012 ) -> bool:
1013 # Docstring inherited.
1014 # We can't safely use `self.datastoreConstraints` with `entity` to
1015 # check whether a child datastore would even want to ingest this
1016 # dataset, because we don't want to filter out datastores that might
1017 need an expanded data ID based on incomplete information (e.g. we
1018 # pass a StorageClass, but the constraint dispatches on DatasetType).
1019 # So we pessimistically check if any datastore would need an expanded
1020 # data ID for this transfer mode.
1021 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores)
1023 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
1024 # Docstring inherited from the base class.
1026 for datastore in self.datastores:
1027 datastore.import_records(data)
1029 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
1030 # Docstring inherited from the base class.
1032 all_records: dict[str, DatastoreRecordData] = {}
1034 # Merge all sub-datastore records into one structure
1035 for datastore in self.datastores:
1036 sub_records = datastore.export_records(refs)
1037 for name, record_data in sub_records.items():
1038 # All datastore names must be unique in a chain.
1039 if name in all_records: 1039 ↛ 1040 (condition on line 1039 was never true)
1040 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
1041 all_records[name] = record_data
1043 return all_records
1045 def export(
1046 self,
1047 refs: Iterable[DatasetRef],
1048 *,
1049 directory: ResourcePathExpression | None = None,
1050 transfer: str | None = "auto",
1051 ) -> Iterable[FileDataset]:
1052 # Docstring inherited from Datastore.export.
1053 if transfer == "auto" and directory is None:
1054 transfer = None
1056 if transfer is not None and directory is None:
1057 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
1059 if transfer == "move":
1060 raise TypeError("Can not export by moving files out of datastore.")
1062 # Exporting from a chain has the potential for a dataset to be
1063 # in one or more of the datastores in the chain. We only need one
1064 # of them since we assume the datasets are the same in all (but
1065 # the file format could be different of course since that is a
1066 # per-datastore configuration).
1067 # We also do not know whether any of the datastores in the chain
1068 # support file export.
1070 # Ensure we have an ordered sequence that is not an iterator or set.
1071 if not isinstance(refs, Sequence):
1072 refs = list(refs)
1074 # If any of the datasets are missing entirely we need to raise early
1075 # before we try to run the export. This can be a little messy but is
1076 # better than exporting files from the first datastore and then finding
1077 # that one is missing but is not in the second datastore either.
1078 known = [datastore.knows_these(refs) for datastore in self.datastores]
1079 refs_known: set[DatasetRef] = set()
1080 for known_to_this in known:
1081 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
1082 missing_count = len(refs) - len(refs_known)
1083 if missing_count:
1084 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")
1086 # To allow us to slot each result into the right place after
1087 # asking each datastore, create a dict with the index.
1088 ref_positions = {ref: i for i, ref in enumerate(refs)}
1090 # Presize the final export list.
1091 exported: list[FileDataset | None] = [None] * len(refs)
1093 # The order of the returned dataset has to match the order of the
1094 # given refs, even if they are all from different datastores.
1095 for i, datastore in enumerate(self.datastores):
1096 known_to_this = known[i]
1097 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]
1099 try:
1100 this_export = datastore.export(filtered, directory=directory, transfer=transfer)
1101 except NotImplementedError:
1102 # Try the next datastore.
1103 continue
1105 for ref, export in zip(filtered, this_export, strict=True):
1106 # Get the position and also delete it from the list.
1107 exported[ref_positions.pop(ref)] = export
1109 # Every dataset should be accounted for because of the earlier checks
1110 # but make sure that we did fill all the slots to appease mypy.
1111 for i, dataset in enumerate(exported):
1112 if dataset is None: 1112 ↛ 1113 (condition on line 1112 was never true)
1113 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
1114 yield dataset
1116 def transfer_from(
1117 self,
1118 source_datastore: Datastore,
1119 refs: Collection[DatasetRef],
1120 transfer: str = "auto",
1121 artifact_existence: dict[ResourcePath, bool] | None = None,
1122 dry_run: bool = False,
1123 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1124 # Docstring inherited
1125 # mypy does not understand "type(self) is not type(source)"
1126 if isinstance(source_datastore, ChainedDatastore):
1127 # Both the source and destination are chained datastores.
1128 source_datastores = tuple(source_datastore.datastores)
1129 else:
1130 # The source datastore is different, forward everything to the
1131 # child datastores.
1132 source_datastores = (source_datastore,)
1134 if not refs: 1134 ↛ 1136 (condition on line 1134 was never true)
1135 # Nothing to transfer.
1136 return set(), set()
1138 # Need to know the set of all possible refs that could be transferred.
1139 remaining_refs = set(refs)
1141 missing_from_source: set[DatasetRef] | None = None
1142 all_accepted = set()
1143 nsuccess = 0
1144 for source_child in source_datastores:
1145 # If we are reading from a chained datastore, it's possible that
1146 # only a subset of the datastores know about the dataset. We can't
1147 # ask the receiving datastore to copy it when it doesn't exist
1148 # so we have to filter again based on what the source datastore
1149 # understands.
1150 known_to_source = source_child.knows_these(list(refs))
1152 # Need to know that there is a possibility that some of these
1153 # datasets exist but are unknown to the source datastore if
1154 # trust is enabled.
1155 if getattr(source_child, "trustGetRequest", False):
1156 unknown = [ref for ref, known in known_to_source.items() if not known]
1157 existence = source_child.mexists(unknown, artifact_existence)
1158 for ref, exists in existence.items():
1159 known_to_source[ref] = exists
1161 missing = {ref for ref, known in known_to_source.items() if not known}
1162 if missing:
1163 if missing_from_source is None:
1164 missing_from_source = missing
1165 else:
1166 missing_from_source &= missing
1168 # Try to transfer from each source datastore to each child
1169 # datastore. Have to make sure we don't transfer something
1170 # we've already transferred to this destination on later passes.
1172 # Filter the initial list based on the datasets we have
1173 # not yet transferred.
1174 these_refs = []
1175 for ref in refs:
1176 if ref in remaining_refs and known_to_source[ref]:
1177 these_refs.append(ref)
1179 if not these_refs:
1180 # Already transferred all datasets known to this datastore.
1181 continue
1183 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
1184 if constraints is not None: 1184 ↛ 1192 (condition on line 1184 was never false)
1185 filtered_refs = []
1186 for ref in these_refs:
1187 if constraints.isAcceptable(ref):
1188 filtered_refs.append(ref)
1189 else:
1190 log.debug("Rejecting ref by constraints: %s", ref)
1191 else:
1192 filtered_refs = list(these_refs)
1193 try:
1194 accepted, _ = datastore.transfer_from(
1195 source_child,
1196 filtered_refs,
1197 transfer,
1198 artifact_existence,
1199 dry_run=dry_run,
1200 )
1201 except (TypeError, NotImplementedError):
1202 # The datastores were incompatible.
1203 continue
1204 else:
1205 nsuccess += 1
1207 # Remove the accepted datasets from those remaining.
1208 remaining_refs = remaining_refs - accepted
1210 # Keep track of everything we have accepted.
1211 all_accepted.update(accepted)
1213 if missing_from_source:
1214 for ref in missing_from_source:
1215 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref)
1217 if nsuccess == 0: 1217 ↛ 1218 (condition on line 1217 was never true)
1218 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}")
1220 return all_accepted, remaining_refs
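# Illustrative sketch (``source_datastore``, ``chain`` and ``refs`` are
# assumed names): transfer artifacts from another datastore into every
# accepting child, then report anything that could not be transferred.
#
#     accepted, rejected = chain.transfer_from(source_datastore, refs, transfer="copy")
#     if rejected:
#         log.warning("%d datasets could not be transferred", len(rejected))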
1222 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
1223 # Docstring inherited from the base class.
1224 tables: dict[str, DatastoreOpaqueTable] = {}
1225 for datastore in self.datastores:
1226 tables.update(datastore.get_opaque_table_definitions())
1227 return tables