Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 86%
467 statements. coverage.py v7.4.1, created at 2024-02-01 11:19 +0000.
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Chained datastore."""
30from __future__ import annotations
32__all__ = ("ChainedDatastore",)
34import itertools
35import logging
36import time
37import warnings
38from collections.abc import Collection, Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any
41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset
42from lsst.daf.butler.datastore import (
43 DatasetRefURIs,
44 Datastore,
45 DatastoreConfig,
46 DatastoreOpaqueTable,
47 DatastoreValidationError,
48)
49from lsst.daf.butler.datastore.constraints import Constraints
50from lsst.daf.butler.datastore.record_data import DatastoreRecordData
51from lsst.resources import ResourcePath
52from lsst.utils import doImportType
54if TYPE_CHECKING:
55 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
56 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
57 from lsst.resources import ResourcePathExpression
59log = logging.getLogger(__name__)
62class _IngestPrepData(Datastore.IngestPrepData):
63 """Helper class for ChainedDatastore ingest implementation.
65 Parameters
66 ----------
67 children : `list` of `tuple`
68 Triples of `Datastore`, `IngestPrepData`, and the set of source `ResourcePath`s for each child datastore.
69 """
71 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
72 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
73 self.children = children
76class ChainedDatastore(Datastore):
77 """Chained Datastores to allow read and writes from multiple datastores.
79 A ChainedDatastore is configured with multiple datastore configurations.
80 A ``put()`` is sent to every child datastore that accepts the dataset. A ``get()``
81 operation is sent to each datastore in turn and the first datastore
82 to return a valid dataset is used.
84 Parameters
85 ----------
86 config : `DatastoreConfig` or `str`
87 Configuration. This configuration must include a ``datastores`` field
88 as a sequence of datastore configurations. The order in this sequence
89 indicates the order to use for read operations.
90 bridgeManager : `DatastoreRegistryBridgeManager`
91 Object that manages the interface between `Registry` and datastores.
92 datastores : `list` [`Datastore`]
93 All the child datastores known to this datastore.
95 Notes
96 -----
97 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
98 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
99 and `"hardlink"` if and only if all its child datastores do.
100 """
102 defaultConfigFile = "datastores/chainedDatastore.yaml"
103 """Path to configuration defaults. Accessed within the ``configs`` resource
104 or relative to a search path. Can be None if no defaults specified.
105 """
107 containerKey = "datastores"
108 """Key to specify where child datastores are configured."""
110 datastores: list[Datastore]
111 """All the child datastores known to this datastore."""
113 datastoreConstraints: Sequence[Constraints | None]
114 """Constraints to be applied to each of the child datastores."""
116 @classmethod
117 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
118 """Set any filesystem-dependent config options for child Datastores to
119 be appropriate for a new empty repository with the given root.
121 Parameters
122 ----------
123 root : `str`
124 Filesystem path to the root of the data repository.
125 config : `Config`
126 A `Config` to update. Only the subset understood by
127 this component will be updated. Will not expand
128 defaults.
129 full : `Config`
130 A complete config with all defaults expanded that can be
131 converted to a `DatastoreConfig`. Read-only and will not be
132 modified by this method.
133 Repository-specific options that should not be obtained
134 from defaults when Butler instances are constructed
135 should be copied from ``full`` to ``config``.
136 overwrite : `bool`, optional
137 If `False`, do not modify a value in ``config`` if the value
138 already exists. Default is always to overwrite with the provided
139 ``root``.
141 Notes
142 -----
143 If a keyword is explicitly defined in the supplied ``config`` it
144 will not be overridden by this method if ``overwrite`` is `False`.
145 This allows explicit values set in external configs to be retained.
146 """
147 # Extract the part of the config we care about updating
148 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
150 # And the subset of the full config that we can use for reference.
151 # Do not bother with defaults because we are told this already has
152 # them.
153 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
155 # Loop over each datastore config and pass the subsets to the
156 # child datastores to process.
158 containerKey = cls.containerKey
159 for idx, (child, fullChild) in enumerate(
160 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True)
161 ):
162 childConfig = DatastoreConfig(child, mergeDefaults=False)
163 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
164 datastoreClass = doImportType(fullChildConfig["cls"])
165 if not issubclass(datastoreClass, Datastore):  # 165 ↛ 166: the condition on line 165 was never true
166 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
167 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}"
168 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
170 # Reattach to parent
171 datastoreConfig[containerKey, idx] = childConfig
173 # Reattach modified datastore config to parent
174 # If this has a datastore key we attach there, otherwise we assume
175 # this information goes at the top of the config hierarchy.
176 if DatastoreConfig.component in config:
177 config[DatastoreConfig.component] = datastoreConfig
178 else:
179 config.update(datastoreConfig)
181 return
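# For illustration: given root "/repo" and two hypothetical children
# FileDatastore (index 0) and InMemoryDatastore (index 1), the per-child roots
# produced above would be "/repo/FileDatastore_0" and "/repo/InMemoryDatastore_1",
# following the f"{root}/{datastoreClass.__qualname__}_{idx}" pattern used in
# setConfigRoot().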
183 def __init__(
184 self,
185 config: DatastoreConfig,
186 bridgeManager: DatastoreRegistryBridgeManager,
187 datastores: list[Datastore],
188 ):
189 super().__init__(config, bridgeManager)
191 self.datastores = list(datastores)
193 # Name ourself based on our children
194 if self.datastores:  # 194 ↛ 199: the condition on line 194 was never false
195 # We must set the names explicitly
196 self._names = [d.name for d in self.datastores]
197 childNames = ",".join(self.names)
198 else:
199 childNames = f"(empty@{time.time()})"
200 self._names = [childNames]
201 self.name = f"{type(self).__qualname__}[{childNames}]"
203 # We declare we are ephemeral if all our child datastores declare
204 # they are ephemeral
205 self.isEphemeral = all(d.isEphemeral for d in self.datastores)
207 # per-datastore override constraints
208 if "datastore_constraints" in self.config:
209 overrides = self.config["datastore_constraints"]
211 if len(overrides) != len(self.datastores):  # 211 ↛ 212: the condition on line 211 was never true
212 raise DatastoreValidationError(
213 f"Number of registered datastores ({len(self.datastores)})"
214 " differs from number of constraints overrides"
215 f" {len(overrides)}"
216 )
218 self.datastoreConstraints = [
219 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
220 ]
222 else:
223 self.datastoreConstraints = (None,) * len(self.datastores)
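# For illustration of the override handled above: a "datastore_constraints"
# entry is expected to provide one item per child datastore, each optionally
# carrying a "constraints" section (e.g. accept/reject lists understood by the
# Constraints class); children with no override get None here and fall back to
# their own constraints during put().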
225 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
227 @classmethod
228 def _create_from_config(
229 cls,
230 config: DatastoreConfig,
231 bridgeManager: DatastoreRegistryBridgeManager,
232 butlerRoot: ResourcePathExpression | None,
233 ) -> ChainedDatastore:
234 # Scan for child datastores and instantiate them with the same registry
235 datastores = []
236 for c in config["datastores"]:
237 c = DatastoreConfig(c)
238 datastoreType = doImportType(c["cls"])
239 if not issubclass(datastoreType, Datastore):  # 239 ↛ 240: the condition on line 239 was never true
240 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
241 datastore = datastoreType._create_from_config(c, bridgeManager, butlerRoot=butlerRoot)
242 log.debug("Creating child datastore %s", datastore.name)
243 datastores.append(datastore)
245 return ChainedDatastore(config, bridgeManager, datastores)
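# A minimal sketch of the configuration shape consumed above (hypothetical
# child classes; the only keys relied on in this method are "datastores" and
# each child's "cls"):
#
#     config = DatastoreConfig(
#         {
#             "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
#             "datastores": [
#                 {"cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore"},
#                 {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
#             ],
#         }
#     )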
247 def clone(self, bridgeManager: DatastoreRegistryBridgeManager) -> Datastore:
248 datastores = [ds.clone(bridgeManager) for ds in self.datastores]
249 return ChainedDatastore(self.config, bridgeManager, datastores)
251 @property
252 def names(self) -> tuple[str, ...]:
253 return tuple(self._names)
255 @property
256 def roots(self) -> dict[str, ResourcePath | None]:
257 # Docstring inherited.
258 roots = {}
259 for datastore in self.datastores:
260 roots.update(datastore.roots)
261 return roots
263 def __str__(self) -> str:
264 chainName = ", ".join(str(ds) for ds in self.datastores)
265 return chainName
267 def _set_trust_mode(self, mode: bool) -> None:
268 for datastore in self.datastores:
269 datastore._set_trust_mode(mode)
271 def knows(self, ref: DatasetRef) -> bool:
272 """Check if the dataset is known to any of the datastores.
274 Does not check for existence of any artifact.
276 Parameters
277 ----------
278 ref : `DatasetRef`
279 Reference to the required dataset.
281 Returns
282 -------
283 exists : `bool`
284 `True` if the dataset is known to the datastore.
285 """
286 for datastore in self.datastores:
287 if datastore.knows(ref):
288 log.debug("%s known to datastore %s", ref, datastore.name)
289 return True
290 return False
292 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
293 # Docstring inherited from the base class.
294 refs_known: dict[DatasetRef, bool] = {}
295 for datastore in self.datastores:
296 refs_known.update(datastore.knows_these(refs))
298 # No need to check in next datastore for refs that are known.
299 # We only update entries that were initially False.
300 refs = [ref for ref, known in refs_known.items() if not known]
302 return refs_known
304 def mexists(
305 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
306 ) -> dict[DatasetRef, bool]:
307 """Check the existence of multiple datasets at once.
309 Parameters
310 ----------
311 refs : iterable of `DatasetRef`
312 The datasets to be checked.
313 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
314 Optional mapping of datastore artifact to existence. Updated by
315 this method with details of all artifacts tested. Can be `None`
316 if the caller is not interested.
318 Returns
319 -------
320 existence : `dict` of [`DatasetRef`, `bool`]
321 Mapping from dataset to boolean indicating existence in any
322 of the child datastores.
323 """
324 dataset_existence: dict[DatasetRef, bool] = {}
325 for datastore in self.datastores:
326 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
328 # For next datastore no point asking about ones we know
329 # exist already. No special exemption for ephemeral datastores.
330 refs = [ref for ref, exists in dataset_existence.items() if not exists]
332 return dataset_existence
334 def exists(self, ref: DatasetRef) -> bool:
335 """Check if the dataset exists in one of the datastores.
337 Parameters
338 ----------
339 ref : `DatasetRef`
340 Reference to the required dataset.
342 Returns
343 -------
344 exists : `bool`
345 `True` if the entity exists in one of the child datastores.
346 """
347 for datastore in self.datastores:
348 if datastore.exists(ref):
349 log.debug("Found %s in datastore %s", ref, datastore.name)
350 return True
351 return False
353 def get(
354 self,
355 ref: DatasetRef,
356 parameters: Mapping[str, Any] | None = None,
357 storageClass: StorageClass | str | None = None,
358 ) -> Any:
359 """Load an InMemoryDataset from the store.
361 The dataset is returned from the first datastore that has
362 the dataset.
364 Parameters
365 ----------
366 ref : `DatasetRef`
367 Reference to the required Dataset.
368 parameters : `dict`
369 `StorageClass`-specific parameters that specify, for example,
370 a slice of the dataset to be loaded.
371 storageClass : `StorageClass` or `str`, optional
372 The storage class to be used to override the Python type
373 returned by this method. By default the returned type matches
374 the dataset type definition for this dataset. Specifying a
375 read `StorageClass` can force a different type to be returned.
376 This type must be compatible with the original type.
378 Returns
379 -------
380 inMemoryDataset : `object`
381 Requested dataset or slice thereof as an InMemoryDataset.
383 Raises
384 ------
385 FileNotFoundError
386 Requested dataset can not be retrieved.
387 TypeError
388 Return value from formatter has unexpected type.
389 ValueError
390 Formatter failed to process the dataset.
391 """
392 for datastore in self.datastores:
393 try:
394 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
395 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
396 return inMemoryObject
397 except FileNotFoundError:
398 pass
400 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores")
402 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
403 """Write a InMemoryDataset with a given `DatasetRef` to each
404 datastore.
406 The put() to child datastores can fail with
407 `DatasetTypeNotSupportedError`. The put() for this datastore will be
408 deemed to have succeeded so long as at least one child datastore
409 accepted the inMemoryDataset.
411 Parameters
412 ----------
413 inMemoryDataset : `object`
414 The dataset to store.
415 ref : `DatasetRef`
416 Reference to the associated Dataset.
418 Raises
419 ------
420 TypeError
421 Supplied object and storage class are inconsistent.
422 DatasetTypeNotSupportedError
423 All datastores reported `DatasetTypeNotSupportedError`.
424 """
425 log.debug("Put %s", ref)
427 # Confirm that we can accept this dataset
428 if not self.constraints.isAcceptable(ref):
429 # Raise rather than use boolean return value.
430 raise DatasetTypeNotSupportedError(
431 f"Dataset {ref} has been rejected by this datastore via configuration."
432 )
434 isPermanent = False
435 nsuccess = 0
436 npermanent = 0
437 nephemeral = 0
438 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
439 if (
440 constraints is not None and not constraints.isAcceptable(ref)
441 ) or not datastore.constraints.isAcceptable(ref):
442 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
443 continue
445 if datastore.isEphemeral:
446 nephemeral += 1
447 else:
448 npermanent += 1
449 try:
450 datastore.put(inMemoryDataset, ref)
451 nsuccess += 1
452 if not datastore.isEphemeral:
453 isPermanent = True
454 except DatasetTypeNotSupportedError:
455 pass
457 if nsuccess == 0:
458 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
460 if not isPermanent and npermanent > 0:  # 460 ↛ 461: the condition on line 460 was never true
461 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
463 if self._transaction is not None:
464 self._transaction.registerUndo("put", self.remove, ref)
466 def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
467 # Docstring inherited from base class.
468 log.debug("Put %s", ref)
470 # Confirm that we can accept this dataset
471 if not self.constraints.isAcceptable(ref):
472 # Raise rather than use boolean return value.
473 raise DatasetTypeNotSupportedError(
474 f"Dataset {ref} has been rejected by this datastore via configuration."
475 )
477 isPermanent = False
478 nsuccess = 0
479 npermanent = 0
480 nephemeral = 0
481 stored_refs: dict[str, DatasetRef] = {}
482 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
483 if (
484 constraints is not None and not constraints.isAcceptable(ref)
485 ) or not datastore.constraints.isAcceptable(ref):
486 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
487 continue
489 if datastore.isEphemeral:
490 nephemeral += 1
491 else:
492 npermanent += 1
493 try:
494 stored_ref_map = datastore.put_new(in_memory_dataset, ref)
495 stored_refs.update(stored_ref_map)
496 nsuccess += 1
497 if not datastore.isEphemeral:
498 isPermanent = True
499 except DatasetTypeNotSupportedError:
500 pass
502 if nsuccess == 0:
503 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
505 if not isPermanent and npermanent > 0:
506 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
508 if self._transaction is not None:
509 self._transaction.registerUndo("put", self.remove, ref)
511 return stored_refs
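# put_new() mirrors put() above but also aggregates the mappings returned by
# each accepting child's put_new(), so the combined result reflects every child
# datastore that actually stored the dataset.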
513 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
514 # Docstring inherited from base class.
515 if transfer != "auto":
516 return transfer
517 # Ask each datastore what they think auto means
518 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
520 # Remove any untranslated "auto" values
521 transfers.discard(transfer)
523 if len(transfers) == 1:  # 523 ↛ 524: the condition on line 523 was never true
524 return transfers.pop()
525 if not transfers:  # 525 ↛ 529: the condition on line 525 was never false
526 # Everything reported "auto"
527 return transfer
529 raise RuntimeError(
530 "Chained datastore does not yet support different transfer modes"
531 f" from 'auto' in each child datastore (wanted {transfers})"
532 )
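# For illustration: if every child resolves "auto" to the same concrete mode
# (e.g. {"copy"}), that mode is returned; if all children leave "auto"
# unchanged, "auto" is passed through; mixed answers such as {"copy", "hardlink"}
# raise RuntimeError because per-child transfer modes are not yet supported.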
534 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
535 # Docstring inherited from Datastore._prepIngest.
536 if transfer is None:
537 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
539 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
540 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
541 if not acceptable:
542 log.debug(
543 "Datastore %s skipping ingest via configuration for refs %s",
544 name,
545 ", ".join(str(ref) for ref in dataset.refs),
546 )
547 return False
548 else:
549 return True
551 # Filter down to just datasets the chained datastore's own
552 # configuration accepts.
553 okForParent: list[FileDataset] = [
554 dataset
555 for dataset in datasets
556 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
557 ]
559 # Iterate over nested datastores and call _prepIngest on each.
560 # Save the results to a list:
561 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
562 # ...and remember whether all of the failures are due to
563 # NotImplementedError being raised.
564 allFailuresAreNotImplementedError = True
565 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
566 okForChild: list[FileDataset]
567 if constraints is not None:
568 okForChild = [
569 dataset
570 for dataset in okForParent
571 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
572 ]
573 else:
574 okForChild = okForParent
575 try:
576 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
577 except NotImplementedError:
578 log.debug(
579 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
580 datastore.name,
581 transfer,
582 )
583 continue
584 allFailuresAreNotImplementedError = False
585 if okForChild:
586 # Do not store for later if a datastore has rejected
587 # everything.
588 # Include the source paths if this is a "move". It's clearer
589 # to find the paths now rather than try to infer how
590 # each datastore has stored them in the internal prep class.
591 paths = (
592 {ResourcePath(dataset.path, forceDirectory=False) for dataset in okForChild}
593 if transfer == "move"
594 else set()
595 )
596 children.append((datastore, prepDataForChild, paths))
597 if allFailuresAreNotImplementedError:
598 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
599 return _IngestPrepData(children=children)
601 def _finishIngest(
602 self,
603 prepData: _IngestPrepData,
604 *,
605 transfer: str | None = None,
606 record_validation_info: bool = True,
607 ) -> None:
608 # Docstring inherited from Datastore._finishIngest.
609 # For "move" we must use "copy" and then delete the input
610 # data at the end. This has no rollback option if the ingest
611 # subsequently fails. If there is only one active datastore
612 # accepting any files we can leave it as "move"
613 actual_transfer: str | None
614 if transfer == "move" and len(prepData.children) > 1:
615 actual_transfer = "copy"
616 else:
617 actual_transfer = transfer
618 to_be_deleted: set[ResourcePath] = set()
619 for datastore, prepDataForChild, paths in prepData.children:
620 datastore._finishIngest(
621 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
622 )
623 to_be_deleted.update(paths)
624 if actual_transfer != transfer:
625 # These datasets were copied but now need to be deleted.
626 # This can not be rolled back.
627 for uri in to_be_deleted:
628 uri.remove()
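# For illustration (hypothetical scenario): ingesting with transfer="move" into
# a chain where two children accept files means each child ingests via "copy"
# and the original source URIs are deleted here afterwards; with only one
# accepting child the original "move" is passed through unchanged.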
630 def getManyURIs(
631 self,
632 refs: Iterable[DatasetRef],
633 predict: bool = False,
634 allow_missing: bool = False,
635 ) -> dict[DatasetRef, DatasetRefURIs]:
636 # Docstring inherited
638 uris: dict[DatasetRef, DatasetRefURIs] = {}
639 missing_refs = set(refs)
641 # If predict is True we don't want to predict a dataset in the first
642 # datastore if it actually exists in a later datastore, so in that
643 # case check all datastores with predict=False first, and then try
644 # again with predict=True.
645 for p in (False, True) if predict else (False,):
646 if not missing_refs:
647 break
648 for datastore in self.datastores:
649 try:
650 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
651 except NotImplementedError:
652 # some datastores may not implement generating URIs
653 continue
654 missing_refs -= got_uris.keys()
655 uris.update(got_uris)
656 if not missing_refs:
657 break
659 if missing_refs and not allow_missing:
660 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
662 return uris
664 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
665 """Return URIs associated with dataset.
667 Parameters
668 ----------
669 ref : `DatasetRef`
670 Reference to the required dataset.
671 predict : `bool`, optional
672 If the datastore does not know about the dataset, controls whether
673 it should return a predicted URI or not.
675 Returns
676 -------
677 uris : `DatasetRefURIs`
678 The URI to the primary artifact associated with this dataset (if
679 the dataset was disassembled within the datastore this may be
680 `None`), and the URIs to any components associated with the dataset
681 artifact (this can be empty if there are no components).
683 Notes
684 -----
685 The returned URI is from the first datastore in the list that has
686 the dataset with preference given to the first dataset coming from
687 a permanent datastore. If no datastores have the dataset and prediction
688 is allowed, the predicted URI for the first datastore in the list will
689 be returned.
690 """
691 log.debug("Requesting URIs for %s", ref)
692 predictedUri: DatasetRefURIs | None = None
693 predictedEphemeralUri: DatasetRefURIs | None = None
694 firstEphemeralUri: DatasetRefURIs | None = None
695 for datastore in self.datastores:
696 if datastore.exists(ref):
697 if not datastore.isEphemeral:
698 uri = datastore.getURIs(ref)
699 log.debug("Retrieved non-ephemeral URI: %s", uri)
700 return uri
701 elif not firstEphemeralUri:
702 firstEphemeralUri = datastore.getURIs(ref)
703 elif predict:
704 if not predictedUri and not datastore.isEphemeral:
705 predictedUri = datastore.getURIs(ref, predict)
706 elif not predictedEphemeralUri and datastore.isEphemeral:
707 predictedEphemeralUri = datastore.getURIs(ref, predict)
709 if firstEphemeralUri:
710 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
711 return firstEphemeralUri
713 if predictedUri:
714 log.debug("Retrieved predicted URI: %s", predictedUri)
715 return predictedUri
717 if predictedEphemeralUri:
718 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
719 return predictedEphemeralUri
721 raise FileNotFoundError(f"Dataset {ref} not in any datastore")
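# Preference order implemented above: an existing dataset in a permanent child,
# then an existing dataset in an ephemeral child, then (when predict is True) a
# predicted URI from a permanent child, and finally a predicted URI from an
# ephemeral child; if none apply, FileNotFoundError is raised.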
723 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
724 """URI to the Dataset.
726 The returned URI is from the first datastore in the list that has
727 the dataset with preference given to the first dataset coming from
728 a permanent datastore. If no datastores have the dataset and prediction
729 is allowed, the predicted URI for the first datastore in the list will
730 be returned.
732 Parameters
733 ----------
734 ref : `DatasetRef`
735 Reference to the required Dataset.
736 predict : `bool`
737 If `True`, allow URIs to be returned of datasets that have not
738 been written.
740 Returns
741 -------
742 uri : `lsst.resources.ResourcePath`
743 URI pointing to the dataset within the datastore. If the
744 dataset does not exist in the datastore, and if ``predict`` is
745 `True`, the URI will be a prediction and will include a URI
746 fragment "#predicted".
748 Notes
749 -----
750 If the datastore does not have entities that relate well
751 to the concept of a URI the returned URI string will be
752 descriptive. The returned URI is not guaranteed to be obtainable.
754 Raises
755 ------
756 FileNotFoundError
757 A URI has been requested for a dataset that does not exist and
758 guessing is not allowed.
759 RuntimeError
760 Raised if a request is made for a single URI but multiple URIs
761 are associated with this dataset.
762 """
763 log.debug("Requesting URI for %s", ref)
764 primary, components = self.getURIs(ref, predict)
765 if primary is None or components:  # 765 ↛ 766: the condition on line 765 was never true
766 raise RuntimeError(
767 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
768 )
769 return primary
771 def retrieveArtifacts(
772 self,
773 refs: Iterable[DatasetRef],
774 destination: ResourcePath,
775 transfer: str = "auto",
776 preserve_path: bool = True,
777 overwrite: bool = False,
778 ) -> list[ResourcePath]:
779 """Retrieve the file artifacts associated with the supplied refs.
781 Parameters
782 ----------
783 refs : iterable of `DatasetRef`
784 The datasets for which file artifacts are to be retrieved.
785 A single ref can result in multiple files. The refs must
786 be resolved.
787 destination : `lsst.resources.ResourcePath`
788 Location to write the file artifacts.
789 transfer : `str`, optional
790 Method to use to transfer the artifacts. Must be one of the options
791 supported by `lsst.resources.ResourcePath.transfer_from()`.
792 "move" is not allowed.
793 preserve_path : `bool`, optional
794 If `True` the full path of the file artifact within the datastore
795 is preserved. If `False` the final file component of the path
796 is used.
797 overwrite : `bool`, optional
798 If `True` allow transfers to overwrite existing files at the
799 destination.
801 Returns
802 -------
803 targets : `list` of `lsst.resources.ResourcePath`
804 URIs of file artifacts in destination location. Order is not
805 preserved.
806 """
807 if not destination.isdir():  # 807 ↛ 808: the condition on line 807 was never true
808 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
810 # Using getURIs is not feasible since it becomes difficult to
811 # determine the path within the datastore later on. For now
812 # follow getURIs implementation approach.
814 pending = set(refs)
816 # There is a question as to whether an exception should be raised
817 # early if some of the refs are missing, or whether files should be
818 # transferred until a problem is hit. Prefer to complain up front.
819 # Use the datastore integer as primary key.
820 grouped_by_datastore: dict[int, set[DatasetRef]] = {}
822 for number, datastore in enumerate(self.datastores):
823 if datastore.isEphemeral:
824 # In the future we will want to distinguish in-memory from
825 # caching datastore since using an on-disk local
826 # cache is exactly what we should be doing.
827 continue
828 try:
829 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
830 except NotImplementedError:
831 # Some datastores may not support retrieving artifacts
832 continue
834 if datastore_refs:
835 grouped_by_datastore[number] = datastore_refs
837 # Remove these from the pending list so that we do not bother
838 # looking for them any more.
839 pending = pending - datastore_refs
841 if pending:  # 841 ↛ 842: the condition on line 841 was never true
842 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
844 # Now do the transfer.
845 targets: list[ResourcePath] = []
846 for number, datastore_refs in grouped_by_datastore.items():
847 targets.extend(
848 self.datastores[number].retrieveArtifacts(
849 datastore_refs,
850 destination,
851 transfer=transfer,
852 preserve_path=preserve_path,
853 overwrite=overwrite,
854 )
855 )
857 return targets
859 def remove(self, ref: DatasetRef) -> None:
860 """Indicate to the datastore that a dataset can be removed.
862 The dataset will be removed from each datastore. The dataset is
863 not required to exist in every child datastore.
865 Parameters
866 ----------
867 ref : `DatasetRef`
868 Reference to the required dataset.
870 Raises
871 ------
872 FileNotFoundError
873 Attempt to remove a dataset that does not exist. Raised if none
874 of the child datastores removed the dataset.
875 """
876 log.debug("Removing %s", ref)
877 self.trash(ref, ignore_errors=False)
878 self.emptyTrash(ignore_errors=False)
880 def forget(self, refs: Iterable[DatasetRef]) -> None:
881 for datastore in tuple(self.datastores):
882 datastore.forget(refs)
884 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
885 if isinstance(ref, DatasetRef):
886 ref_label = str(ref)
887 else:
888 ref_label = "bulk datasets"
890 log.debug("Trashing %s", ref_label)
892 counter = 0
893 for datastore in self.datastores:
894 try:
895 datastore.trash(ref, ignore_errors=ignore_errors)
896 counter += 1
897 except FileNotFoundError:
898 pass
900 if counter == 0:
901 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
902 if ignore_errors:  # 902 ↛ 903: the condition on line 902 was never true
903 log.warning(err_msg)
904 else:
905 raise FileNotFoundError(err_msg)
907 def emptyTrash(self, ignore_errors: bool = True) -> None:
908 for datastore in self.datastores:
909 datastore.emptyTrash(ignore_errors=ignore_errors)
911 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
912 """Retrieve a dataset from an input `Datastore`,
913 and store the result in this `Datastore`.
915 Parameters
916 ----------
917 inputDatastore : `Datastore`
918 The external `Datastore` from which to retrieve the Dataset.
919 ref : `DatasetRef`
920 Reference to the required dataset in the input data store.
922 Notes
923 -----
924 The dataset is retrieved from ``inputDatastore`` with ``get()`` and
925 written to every accepting child datastore with ``put()``; this method
926 returns `None`.
927 """
928 assert inputDatastore is not self # unless we want it for renames?
929 inMemoryDataset = inputDatastore.get(ref)
930 self.put(inMemoryDataset, ref)
932 def validateConfiguration(
933 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
934 ) -> None:
935 """Validate some of the configuration for this datastore.
937 Parameters
938 ----------
939 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
940 Entities to test against this configuration. Can be differing
941 types.
942 logFailures : `bool`, optional
943 If `True`, output a log message for every validation error
944 detected.
946 Raises
947 ------
948 DatastoreValidationError
949 Raised if there is a validation problem with a configuration.
950 All the problems are reported in a single exception.
952 Notes
953 -----
954 This method checks each datastore in turn.
955 """
956 # Need to catch each of the datastore outputs and ensure that
957 # all are tested.
958 failures = []
959 for datastore in self.datastores:
960 try:
961 datastore.validateConfiguration(entities, logFailures=logFailures)
962 except DatastoreValidationError as e:
963 if logFailures:  # 963 ↛ 965: the condition on line 963 was never false
964 log.critical("Datastore %s failed validation", datastore.name)
965 failures.append(f"Datastore {self.name}: {e}")
967 if failures:
968 msg = ";\n".join(failures)
969 raise DatastoreValidationError(msg)
971 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
972 # Docstring is inherited from base class
973 failures = []
974 for datastore in self.datastores:
975 try:
976 datastore.validateKey(lookupKey, entity)
977 except DatastoreValidationError as e:
978 failures.append(f"Datastore {self.name}: {e}")
980 if failures:
981 msg = ";\n".join(failures)
982 raise DatastoreValidationError(msg)
984 def getLookupKeys(self) -> set[LookupKey]:
985 # Docstring is inherited from base class
986 keys = set()
987 for datastore in self.datastores:
988 keys.update(datastore.getLookupKeys())
990 keys.update(self.constraints.getLookupKeys())
991 for p in self.datastoreConstraints:
992 if p is not None:  # 992 ↛ 991: the condition on line 992 was never false
993 keys.update(p.getLookupKeys())
995 return keys
997 def needs_expanded_data_ids(
998 self,
999 transfer: str | None,
1000 entity: DatasetRef | DatasetType | StorageClass | None = None,
1001 ) -> bool:
1002 # Docstring inherited.
1003 # We can't safely use `self.datastoreConstraints` with `entity` to
1004 # check whether a child datastore would even want to ingest this
1005 # dataset, because we don't want to filter out datastores that might
1006 need an expanded data ID based on incomplete information (e.g. we
1007 # pass a StorageClass, but the constraint dispatches on DatasetType).
1008 # So we pessimistically check if any datastore would need an expanded
1009 # data ID for this transfer mode.
1010 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores)
1012 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
1013 # Docstring inherited from the base class.
1015 for datastore in self.datastores:
1016 datastore.import_records(data)
1018 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
1019 # Docstring inherited from the base class.
1021 all_records: dict[str, DatastoreRecordData] = {}
1023 # Merge all sub-datastore records into one structure
1024 for datastore in self.datastores:
1025 sub_records = datastore.export_records(refs)
1026 for name, record_data in sub_records.items():
1027 # All datastore names must be unique in a chain.
1028 if name in all_records:  # 1028 ↛ 1029: the condition on line 1028 was never true
1029 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
1030 all_records[name] = record_data
1032 return all_records
1034 def export(
1035 self,
1036 refs: Iterable[DatasetRef],
1037 *,
1038 directory: ResourcePathExpression | None = None,
1039 transfer: str | None = "auto",
1040 ) -> Iterable[FileDataset]:
1041 # Docstring inherited from Datastore.export.
1042 if transfer == "auto" and directory is None:
1043 transfer = None
1045 if transfer is not None and directory is None:
1046 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
1048 if transfer == "move":
1049 raise TypeError("Can not export by moving files out of datastore.")
1051 # Exporting from a chain has the potential for a dataset to be
1052 # in one or more of the datastores in the chain. We only need one
1053 # of them since we assume the datasets are the same in all (but
1054 # the file format could be different of course since that is a
1055 # per-datastore configuration).
1056 # We also do not know whether any of the datastores in the chain
1057 # support file export.
1059 # Ensure we have an ordered sequence that is not an iterator or set.
1060 if not isinstance(refs, Sequence):
1061 refs = list(refs)
1063 # If any of the datasets are missing entirely we need to raise early
1064 # before we try to run the export. This can be a little messy but is
1065 # better than exporting files from the first datastore and then finding
1066 that one is missing and is not in the second datastore either.
1067 known = [datastore.knows_these(refs) for datastore in self.datastores]
1068 refs_known: set[DatasetRef] = set()
1069 for known_to_this in known:
1070 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
1071 missing_count = len(refs) - len(refs_known)
1072 if missing_count:
1073 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")
1075 # To allow us to slot each result into the right place after
1076 # asking each datastore, create a dict with the index.
1077 ref_positions = {ref: i for i, ref in enumerate(refs)}
1079 # Presize the final export list.
1080 exported: list[FileDataset | None] = [None] * len(refs)
1082 # The order of the returned dataset has to match the order of the
1083 # given refs, even if they are all from different datastores.
1084 for i, datastore in enumerate(self.datastores):
1085 known_to_this = known[i]
1086 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]
1088 try:
1089 this_export = datastore.export(filtered, directory=directory, transfer=transfer)
1090 except NotImplementedError:
1091 # Try the next datastore.
1092 continue
1094 for ref, export in zip(filtered, this_export, strict=True):
1095 # Get the position and also delete it from the list.
1096 exported[ref_positions.pop(ref)] = export
1098 # Every dataset should be accounted for because of the earlier checks
1099 # but make sure that we did fill all the slots to appease mypy.
1100 for i, dataset in enumerate(exported):
1101 if dataset is None:  # 1101 ↛ 1102: the condition on line 1101 was never true
1102 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
1103 yield dataset
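# Note: export() is a generator, so the FileDataset entries are produced lazily
# and in the same order as the input refs, even when they come from different
# child datastores.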
1105 def transfer_from(
1106 self,
1107 source_datastore: Datastore,
1108 refs: Collection[DatasetRef],
1109 transfer: str = "auto",
1110 artifact_existence: dict[ResourcePath, bool] | None = None,
1111 dry_run: bool = False,
1112 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1113 # Docstring inherited
1114 # mypy does not understand "type(self) is not type(source)"
1115 if isinstance(source_datastore, ChainedDatastore):
1116 # Both the source and destination are chained datastores.
1117 source_datastores = tuple(source_datastore.datastores)
1118 else:
1119 # The source datastore is different, forward everything to the
1120 # child datastores.
1121 source_datastores = (source_datastore,)
1123 if not refs:  # 1123 ↛ 1125: the condition on line 1123 was never true
1124 # Nothing to transfer.
1125 return set(), set()
1127 # Need to know the set of all possible refs that could be transferred.
1128 remaining_refs = set(refs)
1130 missing_from_source: set[DatasetRef] | None = None
1131 all_accepted = set()
1132 nsuccess = 0
1133 for source_child in source_datastores:
1134 # If we are reading from a chained datastore, it's possible that
1135 # only a subset of the datastores know about the dataset. We can't
1136 # ask the receiving datastore to copy it when it doesn't exist
1137 # so we have to filter again based on what the source datastore
1138 # understands.
1139 known_to_source = source_child.knows_these(list(refs))
1141 # Need to know that there is a possibility that some of these
1142 # datasets exist but are unknown to the source datastore if
1143 # trust is enabled.
1144 if getattr(source_child, "trustGetRequest", False):
1145 unknown = [ref for ref, known in known_to_source.items() if not known]
1146 existence = source_child.mexists(unknown, artifact_existence)
1147 for ref, exists in existence.items():
1148 known_to_source[ref] = exists
1150 missing = {ref for ref, known in known_to_source.items() if not known}
1151 if missing:
1152 if missing_from_source is None:
1153 missing_from_source = missing
1154 else:
1155 missing_from_source &= missing
1157 # Try to transfer from each source datastore to each child
1158 # datastore. Have to make sure we don't transfer something
1159 # we've already transferred to this destination on later passes.
1161 # Filter the initial list based on the datasets we have
1162 # not yet transferred.
1163 these_refs = []
1164 for ref in refs:
1165 if ref in remaining_refs and known_to_source[ref]:
1166 these_refs.append(ref)
1168 if not these_refs:
1169 # Already transferred all datasets known to this datastore.
1170 continue
1172 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
1173 if constraints is not None:  # 1173 ↛ 1181: the condition on line 1173 was never false
1174 filtered_refs = []
1175 for ref in these_refs:
1176 if constraints.isAcceptable(ref):
1177 filtered_refs.append(ref)
1178 else:
1179 log.debug("Rejecting ref by constraints: %s", ref)
1180 else:
1181 filtered_refs = list(these_refs)
1182 try:
1183 accepted, _ = datastore.transfer_from(
1184 source_child,
1185 filtered_refs,
1186 transfer,
1187 artifact_existence,
1188 dry_run=dry_run,
1189 )
1190 except (TypeError, NotImplementedError):
1191 # The datastores were incompatible.
1192 continue
1193 else:
1194 nsuccess += 1
1196 # Remove the accepted datasets from those remaining.
1197 remaining_refs = remaining_refs - accepted
1199 # Keep track of everything we have accepted.
1200 all_accepted.update(accepted)
1202 if missing_from_source:
1203 for ref in missing_from_source:
1204 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref)
1206 if nsuccess == 0:  # 1206 ↛ 1207: the condition on line 1206 was never true
1207 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}")
1209 return all_accepted, remaining_refs
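# transfer_from() reports two sets: refs accepted by at least one child
# datastore, and refs that remain untransferred (for example because they were
# rejected by constraints or had no artifacts in the source).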
1211 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
1212 # Docstring inherited from the base class.
1213 tables: dict[str, DatastoreOpaqueTable] = {}
1214 for datastore in self.datastores:
1215 tables.update(datastore.get_opaque_table_definitions())
1216 return tables