Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%
421 statements
coverage.py v7.3.1, created at 2023-10-02 07:59 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Chained datastore."""
30from __future__ import annotations
32__all__ = ("ChainedDatastore",)
34import itertools
35import logging
36import time
37import warnings
38from collections.abc import Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any
41from lsst.daf.butler import (
42 Constraints,
43 DatasetRef,
44 DatasetRefURIs,
45 DatasetTypeNotSupportedError,
46 Datastore,
47 DatastoreConfig,
48 DatastoreRecordData,
49 DatastoreValidationError,
50 FileDataset,
51 )
52 from lsst.resources import ResourcePath
53 from lsst.utils import doImportType
55 if TYPE_CHECKING:
56 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
57 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
58 from lsst.resources import ResourcePathExpression
60 log = logging.getLogger(__name__)
63 class _IngestPrepData(Datastore.IngestPrepData):
64 """Helper class for ChainedDatastore ingest implementation.
66 Parameters
67 ----------
68 children : `list` of `tuple`
69 Triples of `Datastore`, `IngestPrepData`, and a `set` of source `ResourcePath` for all child datastores.
70 """
72 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
73 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
74 self.children = children
77 class ChainedDatastore(Datastore):
78 """Chained Datastores to allow read and writes from multiple datastores.
80 A ChainedDatastore is configured with multiple datastore configurations.
81 A ``put()`` is always sent to each datastore. A ``get()``
82 operation is sent to each datastore in turn and the first datastore
83 to return a valid dataset is used.
85 Parameters
86 ----------
87 config : `DatastoreConfig` or `str`
88 Configuration. This configuration must include a ``datastores`` field
89 as a sequence of datastore configurations. The order in this sequence
90 indicates the order to use for read operations.
91 bridgeManager : `DatastoreRegistryBridgeManager`
92 Object that manages the interface between `Registry` and datastores.
93 butlerRoot : `str`, optional
94 New datastore root to use to override the configuration value. This
95 root is sent to each child datastore.
97 Notes
98 -----
99 ChainedDatastore never supports `None` as an `ingest` transfer mode. It
100 supports `"copy"`, `"symlink"`, `"relsymlink"` and `"hardlink"` if and only
101 if all its child datastores do. A `"move"` ingest is also accepted; when more than one child datastore accepts the files it is performed as a `"copy"` to each child followed by removal of the source files.
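Examples
--------
A minimal construction sketch, for illustration only. The child datastore
class paths below are assumptions; a real repository normally provides this
configuration through its YAML files, and ``bridgeManager`` comes from the
registry:

>>> from lsst.daf.butler import DatastoreConfig
>>> config = DatastoreConfig(
...     {
...         "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
...         "datastores": [
...             {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
...             {"cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore"},
...         ],
...     }
... )
>>> datastore = ChainedDatastore(config, bridgeManager)  # doctest: +SKIP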
102 """
104 defaultConfigFile = "datastores/chainedDatastore.yaml"
105 """Path to configuration defaults. Accessed within the ``configs`` resource
106 or relative to a search path. Can be None if no defaults specified.
107 """
109 containerKey = "datastores"
110 """Key to specify where child datastores are configured."""
112 datastores: list[Datastore]
113 """All the child datastores known to this datastore."""
115 datastoreConstraints: Sequence[Constraints | None]
116 """Constraints to be applied to each of the child datastores."""
118 @classmethod
119 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
120 """Set any filesystem-dependent config options for child Datastores to
121 be appropriate for a new empty repository with the given root.
123 Parameters
124 ----------
125 root : `str`
126 Filesystem path to the root of the data repository.
127 config : `Config`
128 A `Config` to update. Only the subset understood by
129 this component will be updated. Will not expand
130 defaults.
131 full : `Config`
132 A complete config with all defaults expanded that can be
133 converted to a `DatastoreConfig`. Read-only and will not be
134 modified by this method.
135 Repository-specific options that should not be obtained
136 from defaults when Butler instances are constructed
137 should be copied from ``full`` to ``config``.
138 overwrite : `bool`, optional
139 If `False`, do not modify a value in ``config`` if the value
140 already exists. Default is always to overwrite with the provided
141 ``root``.
143 Notes
144 -----
145 If a keyword is explicitly defined in the supplied ``config`` it
146 will not be overridden by this method if ``overwrite`` is `False`.
147 This allows explicit values set in external configs to be retained.
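Examples
--------
A hedged sketch of applying a new repository root during repository
creation; the ``config`` and ``full`` objects are assumed to already
contain a chained-datastore configuration with a ``datastores`` list:

>>> ChainedDatastore.setConfigRoot("/path/to/new/repo", config, full)  # doctest: +SKIP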
148 """
149 # Extract the part of the config we care about updating
150 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
152 # And the subset of the full config that we can use for reference.
153 # Do not bother with defaults because we are told this already has
154 # them.
155 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
157 # Loop over each datastore config and pass the subsets to the
158 # child datastores to process.
160 containerKey = cls.containerKey
161 for idx, (child, fullChild) in enumerate(
162 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True)
163 ):
164 childConfig = DatastoreConfig(child, mergeDefaults=False)
165 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
166 datastoreClass = doImportType(fullChildConfig["cls"])
167 if not issubclass(datastoreClass, Datastore):  # 167 ↛ 168: line 167 didn't jump to line 168, because the condition on line 167 was never true
168 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
169 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}"
170 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
172 # Reattach to parent
173 datastoreConfig[containerKey, idx] = childConfig
175 # Reattach modified datastore config to parent
176 # If this has a datastore key we attach there, otherwise we assume
177 # this information goes at the top of the config hierarchy.
178 if DatastoreConfig.component in config:
179 config[DatastoreConfig.component] = datastoreConfig
180 else:
181 config.update(datastoreConfig)
183 return
185 def __init__(
186 self,
187 config: Config | ResourcePathExpression,
188 bridgeManager: DatastoreRegistryBridgeManager,
189 butlerRoot: str | None = None,
190 ):
191 super().__init__(config, bridgeManager)
193 # Scan for child datastores and instantiate them with the same registry
194 self.datastores = []
195 for c in self.config["datastores"]:
196 c = DatastoreConfig(c)
197 datastoreType = doImportType(c["cls"])
198 if not issubclass(datastoreType, Datastore):  # 198 ↛ 199: line 198 didn't jump to line 199, because the condition on line 198 was never true
199 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
200 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
201 log.debug("Creating child datastore %s", datastore.name)
202 self.datastores.append(datastore)
204 # Name ourself based on our children
205 if self.datastores:  # 205 ↛ 210: line 205 didn't jump to line 210, because the condition on line 205 was never false
206 # We must set the names explicitly
207 self._names = [d.name for d in self.datastores]
208 childNames = ",".join(self.names)
209 else:
210 childNames = f"(empty@{time.time()})"
211 self._names = [childNames]
212 self.name = f"{type(self).__qualname__}[{childNames}]"
214 # We declare we are ephemeral if all our child datastores declare
215 # they are ephemeral
216 isEphemeral = True
217 for d in self.datastores:
218 if not d.isEphemeral:
219 isEphemeral = False
220 break
221 self.isEphemeral = isEphemeral
223 # per-datastore override constraints
224 if "datastore_constraints" in self.config:
225 overrides = self.config["datastore_constraints"]
227 if len(overrides) != len(self.datastores):  # 227 ↛ 228: line 227 didn't jump to line 228, because the condition on line 227 was never true
228 raise DatastoreValidationError(
229 f"Number of registered datastores ({len(self.datastores)})"
230 " differs from number of constraints overrides"
231 f" {len(overrides)}"
232 )
234 self.datastoreConstraints = [
235 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
236 ]
238 else:
239 self.datastoreConstraints = (None,) * len(self.datastores)
241 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
243 @property
244 def names(self) -> tuple[str, ...]:
245 return tuple(self._names)
247 @property
248 def roots(self) -> dict[str, ResourcePath | None]:
249 # Docstring inherited.
250 roots = {}
251 for datastore in self.datastores:
252 roots.update(datastore.roots)
253 return roots
255 def __str__(self) -> str:
256 chainName = ", ".join(str(ds) for ds in self.datastores)
257 return chainName
259 def knows(self, ref: DatasetRef) -> bool:
260 """Check if the dataset is known to any of the datastores.
262 Does not check for existence of any artifact.
264 Parameters
265 ----------
266 ref : `DatasetRef`
267 Reference to the required dataset.
269 Returns
270 -------
271 exists : `bool`
272 `True` if the dataset is known to the datastore.
273 """
274 for datastore in self.datastores:
275 if datastore.knows(ref):
276 log.debug("%s known to datastore %s", ref, datastore.name)
277 return True
278 return False
280 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
281 # Docstring inherited from the base class.
282 refs_known: dict[DatasetRef, bool] = {}
283 for datastore in self.datastores:
284 refs_known.update(datastore.knows_these(refs))
286 # No need to check in next datastore for refs that are known.
287 # We only update entries that were initially False.
288 refs = [ref for ref, known in refs_known.items() if not known]
290 return refs_known
292 def mexists(
293 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
294 ) -> dict[DatasetRef, bool]:
295 """Check the existence of multiple datasets at once.
297 Parameters
298 ----------
299 refs : iterable of `DatasetRef`
300 The datasets to be checked.
301 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
302 Optional mapping of datastore artifact to existence. Updated by
303 this method with details of all artifacts tested. Can be `None`
304 if the caller is not interested.
306 Returns
307 -------
308 existence : `dict` of [`DatasetRef`, `bool`]
309 Mapping from dataset to boolean indicating existence in any
310 of the child datastores.
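Examples
--------
Usage sketch, for illustration only; ``datastore`` and ``refs`` are
assumed to exist already:

>>> artifact_cache = {}
>>> existence = datastore.mexists(refs, artifact_existence=artifact_cache)  # doctest: +SKIP
>>> missing = [ref for ref, found in existence.items() if not found]  # doctest: +SKIP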
311 """
312 dataset_existence: dict[DatasetRef, bool] = {}
313 for datastore in self.datastores:
314 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
316 # For next datastore no point asking about ones we know
317 # exist already. No special exemption for ephemeral datastores.
318 refs = [ref for ref, exists in dataset_existence.items() if not exists]
320 return dataset_existence
322 def exists(self, ref: DatasetRef) -> bool:
323 """Check if the dataset exists in one of the datastores.
325 Parameters
326 ----------
327 ref : `DatasetRef`
328 Reference to the required dataset.
330 Returns
331 -------
332 exists : `bool`
333 `True` if the entity exists in one of the child datastores.
334 """
335 for datastore in self.datastores:
336 if datastore.exists(ref):
337 log.debug("Found %s in datastore %s", ref, datastore.name)
338 return True
339 return False
341 def get(
342 self,
343 ref: DatasetRef,
344 parameters: Mapping[str, Any] | None = None,
345 storageClass: StorageClass | str | None = None,
346 ) -> Any:
347 """Load an InMemoryDataset from the store.
349 The dataset is returned from the first datastore that has
350 the dataset.
352 Parameters
353 ----------
354 ref : `DatasetRef`
355 Reference to the required Dataset.
356 parameters : `dict`
357 `StorageClass`-specific parameters that specify, for example,
358 a slice of the dataset to be loaded.
359 storageClass : `StorageClass` or `str`, optional
360 The storage class to be used to override the Python type
361 returned by this method. By default the returned type matches
362 the dataset type definition for this dataset. Specifying a
363 read `StorageClass` can force a different type to be returned.
364 This type must be compatible with the original type.
366 Returns
367 -------
368 inMemoryDataset : `object`
369 Requested dataset or slice thereof as an InMemoryDataset.
371 Raises
372 ------
373 FileNotFoundError
374 Requested dataset can not be retrieved.
375 TypeError
376 Return value from formatter has unexpected type.
377 ValueError
378 Formatter failed to process the dataset.
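Examples
--------
Usage sketch, for illustration only; ``datastore``, ``ref``, and ``bbox``
are assumed to exist, and the ``"ImageF"`` storage class name is purely
illustrative:

>>> dataset = datastore.get(ref)  # doctest: +SKIP
>>> cutout = datastore.get(ref, parameters={"bbox": bbox})  # doctest: +SKIP
>>> converted = datastore.get(ref, storageClass="ImageF")  # doctest: +SKIP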
379 """
380 for datastore in self.datastores:
381 try:
382 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
383 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
384 return inMemoryObject
385 except FileNotFoundError:
386 pass
388 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores")
390 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
391 """Write a InMemoryDataset with a given `DatasetRef` to each
392 datastore.
394 The put() to child datastores can fail with
395 `DatasetTypeNotSupportedError`. The put() for this datastore will be
396 deemed to have succeeded so long as at least one child datastore
397 accepted the inMemoryDataset.
399 Parameters
400 ----------
401 inMemoryDataset : `object`
402 The dataset to store.
403 ref : `DatasetRef`
404 Reference to the associated Dataset.
406 Raises
407 ------
408 TypeError
409 Supplied object and storage class are inconsistent.
410 DatasetTypeNotSupportedError
411 All datastores reported `DatasetTypeNotSupportedError`.
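Examples
--------
Usage sketch, for illustration only; ``datastore``, ``inMemoryDataset``,
and ``ref`` are assumed to exist:

>>> try:  # doctest: +SKIP
...     datastore.put(inMemoryDataset, ref)
... except DatasetTypeNotSupportedError:
...     ...  # no child datastore accepted this dataset type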
412 """
413 log.debug("Put %s", ref)
415 # Confirm that we can accept this dataset
416 if not self.constraints.isAcceptable(ref):
417 # Raise rather than use boolean return value.
418 raise DatasetTypeNotSupportedError(
419 f"Dataset {ref} has been rejected by this datastore via configuration."
420 )
422 isPermanent = False
423 nsuccess = 0
424 npermanent = 0
425 nephemeral = 0
426 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
427 if (
428 constraints is not None and not constraints.isAcceptable(ref)
429 ) or not datastore.constraints.isAcceptable(ref):
430 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
431 continue
433 if datastore.isEphemeral:
434 nephemeral += 1
435 else:
436 npermanent += 1
437 try:
438 datastore.put(inMemoryDataset, ref)
439 nsuccess += 1
440 if not datastore.isEphemeral:
441 isPermanent = True
442 except DatasetTypeNotSupportedError:
443 pass
445 if nsuccess == 0:
446 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
448 if not isPermanent and npermanent > 0:  # 448 ↛ 449: line 448 didn't jump to line 449, because the condition on line 448 was never true
449 warnings.warn(f"Put of {ref} only succeeded in ephemeral databases", stacklevel=2)
451 if self._transaction is not None:
452 self._transaction.registerUndo("put", self.remove, ref)
454 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
455 # Docstring inherited from base class.
456 if transfer != "auto":
457 return transfer
458 # Ask each datastore what they think auto means
459 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
461 # Remove any untranslated "auto" values
462 transfers.discard(transfer)
464 if len(transfers) == 1:  # 464 ↛ 465: line 464 didn't jump to line 465, because the condition on line 464 was never true
465 return transfers.pop()
466 if not transfers:  # 466 ↛ 470: line 466 didn't jump to line 470, because the condition on line 466 was never false
467 # Everything reported "auto"
468 return transfer
470 raise RuntimeError(
471 "Chained datastore does not yet support different transfer modes"
472 f" from 'auto' in each child datastore (wanted {transfers})"
473 )
475 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
476 # Docstring inherited from Datastore._prepIngest.
477 if transfer is None:
478 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
480 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
481 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
482 if not acceptable:
483 log.debug(
484 "Datastore %s skipping ingest via configuration for refs %s",
485 name,
486 ", ".join(str(ref) for ref in dataset.refs),
487 )
488 return False
489 else:
490 return True
492 # Filter down to just datasets the chained datastore's own
493 # configuration accepts.
494 okForParent: list[FileDataset] = [
495 dataset
496 for dataset in datasets
497 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
498 ]
500 # Iterate over nested datastores and call _prepIngest on each.
501 # Save the results to a list:
502 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
503 # ...and remember whether all of the failures are due to
504 # NotImplementedError being raised.
505 allFailuresAreNotImplementedError = True
506 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
507 okForChild: list[FileDataset]
508 if constraints is not None:
509 okForChild = [
510 dataset
511 for dataset in okForParent
512 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
513 ]
514 else:
515 okForChild = okForParent
516 try:
517 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
518 except NotImplementedError:
519 log.debug(
520 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
521 datastore.name,
522 transfer,
523 )
524 continue
525 allFailuresAreNotImplementedError = False
526 if okForChild:
527 # Do not store for later if a datastore has rejected
528 # everything.
529 # Include the source paths if this is a "move". It's clearer
530 # to find the paths now rather than try to infer how
531 # each datastore has stored them in the internal prep class.
532 paths = (
533 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set()
534 )
535 children.append((datastore, prepDataForChild, paths))
536 if allFailuresAreNotImplementedError:
537 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
538 return _IngestPrepData(children=children)
540 def _finishIngest(
541 self,
542 prepData: _IngestPrepData,
543 *,
544 transfer: str | None = None,
545 record_validation_info: bool = True,
546 ) -> None:
547 # Docstring inherited from Datastore._finishIngest.
548 # For "move" we must use "copy" and then delete the input
549 # data at the end. This has no rollback option if the ingest
550 # subsequently fails. If there is only one active datastore
551 # accepting any files we can leave it as "move"
552 actual_transfer: str | None
553 if transfer == "move" and len(prepData.children) > 1:
554 actual_transfer = "copy"
555 else:
556 actual_transfer = transfer
557 to_be_deleted: set[ResourcePath] = set()
558 for datastore, prepDataForChild, paths in prepData.children:
559 datastore._finishIngest(
560 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
561 )
562 to_be_deleted.update(paths)
563 if actual_transfer != transfer:
564 # These datasets were copied but now need to be deleted.
565 # This can not be rolled back.
566 for uri in to_be_deleted:
567 uri.remove()
569 def getManyURIs(
570 self,
571 refs: Iterable[DatasetRef],
572 predict: bool = False,
573 allow_missing: bool = False,
574 ) -> dict[DatasetRef, DatasetRefURIs]:
575 # Docstring inherited
577 uris: dict[DatasetRef, DatasetRefURIs] = {}
578 missing_refs = set(refs)
580 # If predict is True we don't want to predict a dataset in the first
581 # datastore if it actually exists in a later datastore, so in that
582 # case check all datastores with predict=False first, and then try
583 # again with predict=True.
584 for p in (False, True) if predict else (False,):
585 if not missing_refs:
586 break
587 for datastore in self.datastores:
588 try:
589 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
590 except NotImplementedError:
591 # some datastores may not implement generating URIs
592 continue
593 missing_refs -= got_uris.keys()
594 uris.update(got_uris)
595 if not missing_refs:
596 break
598 if missing_refs and not allow_missing:
599 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
601 return uris
603 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
604 """Return URIs associated with dataset.
606 Parameters
607 ----------
608 ref : `DatasetRef`
609 Reference to the required dataset.
610 predict : `bool`, optional
611 If the datastore does not know about the dataset, should it
612 return a predicted URI or not?
614 Returns
615 -------
616 uris : `DatasetRefURIs`
617 The URI to the primary artifact associated with this dataset (if
618 the dataset was disassembled within the datastore this may be
619 `None`), and the URIs to any components associated with the dataset
620 artifact (this mapping can be empty if there are no components).
622 Notes
623 -----
624 The returned URI is from the first datastore in the list that has
625 the dataset with preference given to the first dataset coming from
626 a permanent datastore. If no datastores have the dataset and prediction
627 is allowed, the predicted URI for the first datastore in the list will
628 be returned.
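Examples
--------
Usage sketch, for illustration only; ``datastore`` and ``ref`` are
assumed to exist:

>>> primary, components = datastore.getURIs(ref, predict=True)  # doctest: +SKIP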
629 """
630 log.debug("Requesting URIs for %s", ref)
631 predictedUri: DatasetRefURIs | None = None
632 predictedEphemeralUri: DatasetRefURIs | None = None
633 firstEphemeralUri: DatasetRefURIs | None = None
634 for datastore in self.datastores:
635 if datastore.exists(ref):
636 if not datastore.isEphemeral:
637 uri = datastore.getURIs(ref)
638 log.debug("Retrieved non-ephemeral URI: %s", uri)
639 return uri
640 elif not firstEphemeralUri:
641 firstEphemeralUri = datastore.getURIs(ref)
642 elif predict:
643 if not predictedUri and not datastore.isEphemeral:
644 predictedUri = datastore.getURIs(ref, predict)
645 elif not predictedEphemeralUri and datastore.isEphemeral:
646 predictedEphemeralUri = datastore.getURIs(ref, predict)
648 if firstEphemeralUri:
649 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
650 return firstEphemeralUri
652 if predictedUri:
653 log.debug("Retrieved predicted URI: %s", predictedUri)
654 return predictedUri
656 if predictedEphemeralUri:
657 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
658 return predictedEphemeralUri
660 raise FileNotFoundError(f"Dataset {ref} not in any datastore")
662 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
663 """URI to the Dataset.
665 The returned URI is from the first datastore in the list that has
666 the dataset with preference given to the first dataset coming from
667 a permanent datastore. If no datastores have the dataset and prediction
668 is allowed, the predicted URI for the first datastore in the list will
669 be returned.
671 Parameters
672 ----------
673 ref : `DatasetRef`
674 Reference to the required Dataset.
675 predict : `bool`
676 If `True`, allow URIs to be returned of datasets that have not
677 been written.
679 Returns
680 -------
681 uri : `lsst.resources.ResourcePath`
682 URI pointing to the dataset within the datastore. If the
683 dataset does not exist in the datastore, and if ``predict`` is
684 `True`, the URI will be a prediction and will include a URI
685 fragment "#predicted".
687 Notes
688 -----
689 If the datastore does not have entities that relate well
690 to the concept of a URI the returned URI string will be
691 descriptive. The returned URI is not guaranteed to be obtainable.
693 Raises
694 ------
695 FileNotFoundError
696 A URI has been requested for a dataset that does not exist and
697 guessing is not allowed.
698 RuntimeError
699 Raised if a request is made for a single URI but multiple URIs
700 are associated with this dataset.
701 """
702 log.debug("Requesting URI for %s", ref)
703 primary, components = self.getURIs(ref, predict)
704 if primary is None or components:  # 704 ↛ 705: line 704 didn't jump to line 705, because the condition on line 704 was never true
705 raise RuntimeError(
706 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
707 )
708 return primary
710 def retrieveArtifacts(
711 self,
712 refs: Iterable[DatasetRef],
713 destination: ResourcePath,
714 transfer: str = "auto",
715 preserve_path: bool = True,
716 overwrite: bool = False,
717 ) -> list[ResourcePath]:
718 """Retrieve the file artifacts associated with the supplied refs.
720 Parameters
721 ----------
722 refs : iterable of `DatasetRef`
723 The datasets for which file artifacts are to be retrieved.
724 A single ref can result in multiple files. The refs must
725 be resolved.
726 destination : `lsst.resources.ResourcePath`
727 Location to write the file artifacts.
728 transfer : `str`, optional
729 Method to use to transfer the artifacts. Must be one of the options
730 supported by `lsst.resources.ResourcePath.transfer_from()`.
731 "move" is not allowed.
732 preserve_path : `bool`, optional
733 If `True` the full path of the file artifact within the datastore
734 is preserved. If `False` the final file component of the path
735 is used.
736 overwrite : `bool`, optional
737 If `True` allow transfers to overwrite existing files at the
738 destination.
740 Returns
741 -------
742 targets : `list` of `lsst.resources.ResourcePath`
743 URIs of file artifacts in destination location. Order is not
744 preserved.
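Examples
--------
Usage sketch, for illustration only; ``datastore`` and ``refs`` are
assumed to exist and the destination path is an assumption:

>>> from lsst.resources import ResourcePath
>>> destination = ResourcePath("/tmp/retrieved/", forceDirectory=True)
>>> copied = datastore.retrieveArtifacts(refs, destination, transfer="copy")  # doctest: +SKIP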
745 """
746 if not destination.isdir():  # 746 ↛ 747: line 746 didn't jump to line 747, because the condition on line 746 was never true
747 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
749 # Using getURIs is not feasible since it becomes difficult to
750 # determine the path within the datastore later on. For now
751 # follow getURIs implementation approach.
753 pending = set(refs)
755 # There is a question as to whether an exception should be raised
756 # early if some of the refs are missing, or whether files should be
757 # transferred until a problem is hit. Prefer to complain up front.
758 # Use the datastore integer as primary key.
759 grouped_by_datastore: dict[int, set[DatasetRef]] = {}
761 for number, datastore in enumerate(self.datastores):
762 if datastore.isEphemeral:
763 # In the future we will want to distinguish in-memory from
764 # caching datastore since using an on-disk local
765 # cache is exactly what we should be doing.
766 continue
767 try:
768 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
769 except NotImplementedError:
770 # Some datastores may not support retrieving artifacts
771 continue
773 if datastore_refs:
774 grouped_by_datastore[number] = datastore_refs
776 # Remove these from the pending list so that we do not bother
777 # looking for them any more.
778 pending = pending - datastore_refs
780 if pending:  # 780 ↛ 781: line 780 didn't jump to line 781, because the condition on line 780 was never true
781 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
783 # Now do the transfer.
784 targets: list[ResourcePath] = []
785 for number, datastore_refs in grouped_by_datastore.items():
786 targets.extend(
787 self.datastores[number].retrieveArtifacts(
788 datastore_refs,
789 destination,
790 transfer=transfer,
791 preserve_path=preserve_path,
792 overwrite=overwrite,
793 )
794 )
796 return targets
798 def remove(self, ref: DatasetRef) -> None:
799 """Indicate to the datastore that a dataset can be removed.
801 The dataset will be removed from each datastore. The dataset is
802 not required to exist in every child datastore.
804 Parameters
805 ----------
806 ref : `DatasetRef`
807 Reference to the required dataset.
809 Raises
810 ------
811 FileNotFoundError
812 Attempt to remove a dataset that does not exist. Raised if none
813 of the child datastores removed the dataset.
814 """
815 log.debug("Removing %s", ref)
816 self.trash(ref, ignore_errors=False)
817 self.emptyTrash(ignore_errors=False)
819 def forget(self, refs: Iterable[DatasetRef]) -> None:
820 for datastore in tuple(self.datastores):
821 datastore.forget(refs)
823 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
824 if isinstance(ref, DatasetRef):
825 ref_label = str(ref)
826 else:
827 ref_label = "bulk datasets"
829 log.debug("Trashing %s", ref_label)
831 counter = 0
832 for datastore in self.datastores:
833 try:
834 datastore.trash(ref, ignore_errors=ignore_errors)
835 counter += 1
836 except FileNotFoundError:
837 pass
839 if counter == 0:
840 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
841 if ignore_errors:  # 841 ↛ 842: line 841 didn't jump to line 842, because the condition on line 841 was never true
842 log.warning(err_msg)
843 else:
844 raise FileNotFoundError(err_msg)
846 def emptyTrash(self, ignore_errors: bool = True) -> None:
847 for datastore in self.datastores:
848 datastore.emptyTrash(ignore_errors=ignore_errors)
850 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
851 """Retrieve a dataset from an input `Datastore`,
852 and store the result in this `Datastore`.
854 Parameters
855 ----------
856 inputDatastore : `Datastore`
857 The external `Datastore` from which to retrieve the Dataset.
858 ref : `DatasetRef`
859 Reference to the required dataset in the input data store.
861 Notes
862 -----
863 The dataset is read from ``inputDatastore`` with ``get()`` and then
864 written to every accepting child datastore with ``put()``; this
865 method returns `None`.
866 """
867 assert inputDatastore is not self # unless we want it for renames?
868 inMemoryDataset = inputDatastore.get(ref)
869 self.put(inMemoryDataset, ref)
871 def validateConfiguration(
872 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
873 ) -> None:
874 """Validate some of the configuration for this datastore.
876 Parameters
877 ----------
878 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
879 Entities to test against this configuration. Can be differing
880 types.
881 logFailures : `bool`, optional
882 If `True`, output a log message for every validation error
883 detected.
885 Raises
886 ------
887 DatastoreValidationError
888 Raised if there is a validation problem with a configuration.
889 All the problems are reported in a single exception.
891 Notes
892 -----
893 This method checks each datastore in turn.
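Examples
--------
Usage sketch, for illustration only; ``datastore`` and the ``refs``
being validated are assumed to exist:

>>> try:  # doctest: +SKIP
...     datastore.validateConfiguration(refs, logFailures=True)
... except DatastoreValidationError as err:
...     print(err)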
894 """
895 # Need to catch each of the datastore outputs and ensure that
896 # all are tested.
897 failures = []
898 for datastore in self.datastores:
899 try:
900 datastore.validateConfiguration(entities, logFailures=logFailures)
901 except DatastoreValidationError as e:
902 if logFailures:  # 902 ↛ 904: line 902 didn't jump to line 904, because the condition on line 902 was never false
903 log.critical("Datastore %s failed validation", datastore.name)
904 failures.append(f"Datastore {self.name}: {e}")
906 if failures:
907 msg = ";\n".join(failures)
908 raise DatastoreValidationError(msg)
910 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
911 # Docstring is inherited from base class
912 failures = []
913 for datastore in self.datastores:
914 try:
915 datastore.validateKey(lookupKey, entity)
916 except DatastoreValidationError as e:
917 failures.append(f"Datastore {self.name}: {e}")
919 if failures:
920 msg = ";\n".join(failures)
921 raise DatastoreValidationError(msg)
923 def getLookupKeys(self) -> set[LookupKey]:
924 # Docstring is inherited from base class
925 keys = set()
926 for datastore in self.datastores:
927 keys.update(datastore.getLookupKeys())
929 keys.update(self.constraints.getLookupKeys())
930 for p in self.datastoreConstraints:
931 if p is not None:  # 931 ↛ 930: line 931 didn't jump to line 930, because the condition on line 931 was never false
932 keys.update(p.getLookupKeys())
934 return keys
936 def needs_expanded_data_ids(
937 self,
938 transfer: str | None,
939 entity: DatasetRef | DatasetType | StorageClass | None = None,
940 ) -> bool:
941 # Docstring inherited.
942 # We can't safely use `self.datastoreConstraints` with `entity` to
943 # check whether a child datastore would even want to ingest this
944 # dataset, because we don't want to filter out datastores that might
945 # need an expanded data ID based on incomplete information (e.g. we
946 # pass a StorageClass, but the constraint dispatches on DatasetType).
947 # So we pessimistically check if any datastore would need an expanded
948 # data ID for this transfer mode.
949 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores)
951 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
952 # Docstring inherited from the base class.
954 for datastore in self.datastores:
955 datastore.import_records(data)
957 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
958 # Docstring inherited from the base class.
960 all_records: dict[str, DatastoreRecordData] = {}
962 # Merge all sub-datastore records into one structure
963 for datastore in self.datastores:
964 sub_records = datastore.export_records(refs)
965 for name, record_data in sub_records.items():
966 # All datastore names must be unique in a chain.
967 if name in all_records:  # 967 ↛ 968: line 967 didn't jump to line 968, because the condition on line 967 was never true
968 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
969 all_records[name] = record_data
971 return all_records
973 def export(
974 self,
975 refs: Iterable[DatasetRef],
976 *,
977 directory: ResourcePathExpression | None = None,
978 transfer: str | None = "auto",
979 ) -> Iterable[FileDataset]:
980 # Docstring inherited from Datastore.export.
981 if transfer == "auto" and directory is None:
982 transfer = None
984 if transfer is not None and directory is None:
985 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
987 if transfer == "move":
988 raise TypeError("Can not export by moving files out of datastore.")
990 # Exporting from a chain has the potential for a dataset to be
991 # in one or more of the datastores in the chain. We only need one
992 # of them since we assume the datasets are the same in all (but
993 # the file format could be different of course since that is a
994 # per-datastore configuration).
995 # We also do not know whether any of the datastores in the chain
996 # support file export.
998 # Ensure we have an ordered sequence that is not an iterator or set.
999 if not isinstance(refs, Sequence):
1000 refs = list(refs)
1002 # If any of the datasets are missing entirely we need to raise early
1003 # before we try to run the export. This can be a little messy but is
1004 # better than exporting files from the first datastore and then finding
1005 # that one is missing but is not in the second datastore either.
1006 known = [datastore.knows_these(refs) for datastore in self.datastores]
1007 refs_known: set[DatasetRef] = set()
1008 for known_to_this in known:
1009 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
1010 missing_count = len(refs) - len(refs_known)
1011 if missing_count:
1012 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")
1014 # To allow us to slot each result into the right place after
1015 # asking each datastore, create a dict with the index.
1016 ref_positions = {ref: i for i, ref in enumerate(refs)}
1018 # Presize the final export list.
1019 exported: list[FileDataset | None] = [None] * len(refs)
1021 # The order of the returned dataset has to match the order of the
1022 # given refs, even if they are all from different datastores.
1023 for i, datastore in enumerate(self.datastores):
1024 known_to_this = known[i]
1025 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]
1027 try:
1028 this_export = datastore.export(filtered, directory=directory, transfer=transfer)
1029 except NotImplementedError:
1030 # Try the next datastore.
1031 continue
1033 for ref, export in zip(filtered, this_export, strict=True):
1034 # Get the position and also delete it from the list.
1035 exported[ref_positions.pop(ref)] = export
1037 # Every dataset should be accounted for because of the earlier checks
1038 # but make sure that we did fill all the slots to appease mypy.
1039 for i, dataset in enumerate(exported):
1040 if dataset is None:  # 1040 ↛ 1041: line 1040 didn't jump to line 1041, because the condition on line 1040 was never true
1041 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
1042 yield dataset
1044 def transfer_from(
1045 self,
1046 source_datastore: Datastore,
1047 refs: Iterable[DatasetRef],
1048 transfer: str = "auto",
1049 artifact_existence: dict[ResourcePath, bool] | None = None,
1050 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1051 # Docstring inherited
1052 # mypy does not understand "type(self) is not type(source)"
1053 if isinstance(source_datastore, ChainedDatastore):
1054 # Both the source and destination are chained datastores.
1055 source_datastores = tuple(source_datastore.datastores)
1056 else:
1057 # The source datastore is different, forward everything to the
1058 # child datastores.
1059 source_datastores = (source_datastore,)
1061 # Need to know the set of all possible refs that could be transferred.
1062 remaining_refs = set(refs)
1064 missing_from_source: set[DatasetRef] | None = None
1065 all_accepted = set()
1066 nsuccess = 0
1067 for source_child in source_datastores:
1068 # If we are reading from a chained datastore, it's possible that
1069 # only a subset of the datastores know about the dataset. We can't
1070 # ask the receiving datastore to copy it when it doesn't exist
1071 # so we have to filter again based on what the source datastore
1072 # understands.
1073 known_to_source = source_child.knows_these(list(refs))
1075 # Need to know that there is a possibility that some of these
1076 # datasets exist but are unknown to the source datastore if
1077 # trust is enabled.
1078 if getattr(source_child, "trustGetRequest", False):
1079 unknown = [ref for ref, known in known_to_source.items() if not known]
1080 existence = source_child.mexists(unknown, artifact_existence)
1081 for ref, exists in existence.items():
1082 known_to_source[ref] = exists
1084 missing = {ref for ref, known in known_to_source.items() if not known}
1085 if missing:
1086 if missing_from_source is None:
1087 missing_from_source = missing
1088 else:
1089 missing_from_source &= missing
1091 # Try to transfer from each source datastore to each child
1092 # datastore. Have to make sure we don't transfer something
1093 # we've already transferred to this destination on later passes.
1095 # Filter the initial list based on the datasets we have
1096 # not yet transferred.
1097 these_refs = []
1098 for ref in refs:
1099 if ref in remaining_refs and known_to_source[ref]:
1100 these_refs.append(ref)
1102 if not these_refs:
1103 # Already transferred all datasets known to this datastore.
1104 continue
1106 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
1107 if constraints is not None:  # 1107 ↛ 1115: line 1107 didn't jump to line 1115, because the condition on line 1107 was never false
1108 filtered_refs = []
1109 for ref in these_refs:
1110 if constraints.isAcceptable(ref):
1111 filtered_refs.append(ref)
1112 else:
1113 log.debug("Rejecting ref by constraints: %s", ref)
1114 else:
1115 filtered_refs = list(these_refs)
1116 try:
1117 accepted, _ = datastore.transfer_from(
1118 source_child, filtered_refs, transfer, artifact_existence
1119 )
1120 except (TypeError, NotImplementedError):
1121 # The datastores were incompatible.
1122 continue
1123 else:
1124 nsuccess += 1
1126 # Remove the accepted datasets from those remaining.
1127 remaining_refs = remaining_refs - accepted
1129 # Keep track of everything we have accepted.
1130 all_accepted.update(accepted)
1132 if missing_from_source:
1133 for ref in missing_from_source:
1134 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref)
1136 if nsuccess == 0:  # 1136 ↛ 1137: line 1136 didn't jump to line 1137, because the condition on line 1136 was never true
1137 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}")
1139 return all_accepted, remaining_refs