Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%
415 statements
coverage.py v7.2.7, created at 2023-06-23 09:29 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22 from __future__ import annotations
24 """Chained datastore."""
26 __all__ = ("ChainedDatastore",)
28 import itertools
29 import logging
30 import time
31 import warnings
32 from collections.abc import Iterable, Mapping, Sequence
33 from typing import TYPE_CHECKING, Any
35 from lsst.daf.butler import (
36 Constraints,
37 DatasetRef,
38 DatasetRefURIs,
39 DatasetTypeNotSupportedError,
40 Datastore,
41 DatastoreConfig,
42 DatastoreRecordData,
43 DatastoreValidationError,
44 FileDataset,
45 )
46 from lsst.resources import ResourcePath
47 from lsst.utils import doImportType
49 if TYPE_CHECKING:
50 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
51 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
52 from lsst.resources import ResourcePathExpression
54 log = logging.getLogger(__name__)
57 class _IngestPrepData(Datastore.IngestPrepData):
58 """Helper class for ChainedDatastore ingest implementation.
60 Parameters
61 ----------
62 children : `list` of `tuple`
63 Triples of `Datastore`, `IngestPrepData`, and the set of source `ResourcePath` objects for all child datastores.
64 """
66 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
67 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
68 self.children = children
71 class ChainedDatastore(Datastore):
72 """Chained Datastores to allow read and writes from multiple datastores.
74 A ChainedDatastore is configured with multiple datastore configurations.
75 A ``put()`` is always sent to each datastore. A ``get()``
76 operation is sent to each datastore in turn and the first datastore
77 to return a valid dataset is used.
79 Parameters
80 ----------
81 config : `DatastoreConfig` or `str`
82 Configuration. This configuration must include a ``datastores`` field
83 as a sequence of datastore configurations. The order in this sequence
84 indicates the order to use for read operations.
85 bridgeManager : `DatastoreRegistryBridgeManager`
86 Object that manages the interface between `Registry` and datastores.
87 butlerRoot : `str`, optional
88 New datastore root to use to override the configuration value. This
89 root is sent to each child datastore.
91 Notes
92 -----
93 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
94 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
95 and `"hardlink"` if and only if all its child datastores do.
96 """
98 defaultConfigFile = "datastores/chainedDatastore.yaml"
99 """Path to configuration defaults. Accessed within the ``configs`` resource
100 or relative to a search path. Can be None if no defaults specified.
101 """
103 containerKey = "datastores"
104 """Key to specify where child datastores are configured."""
106 datastores: list[Datastore]
107 """All the child datastores known to this datastore."""
109 datastoreConstraints: Sequence[Constraints | None]
110 """Constraints to be applied to each of the child datastores."""
112 @classmethod
113 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
114 """Set any filesystem-dependent config options for child Datastores to
115 be appropriate for a new empty repository with the given root.
117 Parameters
118 ----------
119 root : `str`
120 Filesystem path to the root of the data repository.
121 config : `Config`
122 A `Config` to update. Only the subset understood by
123 this component will be updated. Will not expand
124 defaults.
125 full : `Config`
126 A complete config with all defaults expanded that can be
127 converted to a `DatastoreConfig`. Read-only and will not be
128 modified by this method.
129 Repository-specific options that should not be obtained
130 from defaults when Butler instances are constructed
131 should be copied from ``full`` to ``config``.
132 overwrite : `bool`, optional
133 If `False`, do not modify a value in ``config`` if the value
134 already exists. Default is always to overwrite with the provided
135 ``root``.
137 Notes
138 -----
139 If a keyword is explicitly defined in the supplied ``config`` it
140 will not be overridden by this method if ``overwrite`` is `False`.
141 This allows explicit values set in external configs to be retained.
142 """
144 # Extract the part of the config we care about updating
145 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
147 # And the subset of the full config that we can use for reference.
148 # Do not bother with defaults because we are told this already has
149 # them.
150 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
152 # Loop over each datastore config and pass the subsets to the
153 # child datastores to process.
155 containerKey = cls.containerKey
156 for idx, (child, fullChild) in enumerate(
157 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey])
158 ):
159 childConfig = DatastoreConfig(child, mergeDefaults=False)
160 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
161 datastoreClass = doImportType(fullChildConfig["cls"])
162 if not issubclass(datastoreClass, Datastore): 162 ↛ 163 (line 162 didn't jump to line 163, because the condition on line 162 was never true)
163 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
164 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}"
165 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
167 # Reattach to parent
168 datastoreConfig[containerKey, idx] = childConfig
170 # Reattach modified datastore config to parent
171 # If this has a datastore key we attach there, otherwise we assume
172 # this information goes at the top of the config hierarchy.
173 if DatastoreConfig.component in config:
174 config[DatastoreConfig.component] = datastoreConfig
175 else:
176 config.update(datastoreConfig)
178 return
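# Worked example of the child-root naming above (illustrative; the child
# class names are assumptions): with ``root="/repo"`` and two children, the
# roots passed down become "/repo/InMemoryDatastore_0" and
# "/repo/FileDatastore_1".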
180 def __init__(
181 self,
182 config: Config | ResourcePathExpression,
183 bridgeManager: DatastoreRegistryBridgeManager,
184 butlerRoot: str | None = None,
185 ):
186 super().__init__(config, bridgeManager)
188 # Scan for child datastores and instantiate them with the same registry
189 self.datastores = []
190 for c in self.config["datastores"]:
191 c = DatastoreConfig(c)
192 datastoreType = doImportType(c["cls"])
193 if not issubclass(datastoreType, Datastore): 193 ↛ 194 (line 193 didn't jump to line 194, because the condition on line 193 was never true)
194 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
195 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
196 log.debug("Creating child datastore %s", datastore.name)
197 self.datastores.append(datastore)
199 # Name ourself based on our children
200 if self.datastores: 200 ↛ 205 (line 200 didn't jump to line 205, because the condition on line 200 was never false)
201 # We must set the names explicitly
202 self._names = [d.name for d in self.datastores]
203 childNames = ",".join(self.names)
204 else:
205 childNames = f"(empty@{time.time()})"
206 self._names = [childNames]
207 self.name = f"{type(self).__qualname__}[{childNames}]"
209 # We declare we are ephemeral if all our child datastores declare
210 # they are ephemeral
211 isEphemeral = True
212 for d in self.datastores:
213 if not d.isEphemeral:
214 isEphemeral = False
215 break
216 self.isEphemeral = isEphemeral
218 # per-datastore override constraints
219 if "datastore_constraints" in self.config:
220 overrides = self.config["datastore_constraints"]
222 if len(overrides) != len(self.datastores): 222 ↛ 223 (line 222 didn't jump to line 223, because the condition on line 222 was never true)
223 raise DatastoreValidationError(
224 f"Number of registered datastores ({len(self.datastores)})"
225 " differs from number of constraints overrides"
226 f" {len(overrides)}"
227 )
229 self.datastoreConstraints = [
230 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
231 ]
233 else:
234 self.datastoreConstraints = (None,) * len(self.datastores)
236 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
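# Added note: after construction the chain's name embeds the child names,
# e.g. something like "ChainedDatastore[datastoreA,datastoreB]" (child names
# here are hypothetical), and the chain reports itself as ephemeral only if
# every child datastore is ephemeral.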
238 @property
239 def names(self) -> tuple[str, ...]:
240 return tuple(self._names)
242 def __str__(self) -> str:
243 chainName = ", ".join(str(ds) for ds in self.datastores)
244 return chainName
246 def knows(self, ref: DatasetRef) -> bool:
247 """Check if the dataset is known to any of the datastores.
249 Does not check for existence of any artifact.
251 Parameters
252 ----------
253 ref : `DatasetRef`
254 Reference to the required dataset.
256 Returns
257 -------
258 exists : `bool`
259 `True` if the dataset is known to the datastore.
260 """
261 for datastore in self.datastores:
262 if datastore.knows(ref):
263 log.debug("%s known to datastore %s", ref, datastore.name)
264 return True
265 return False
267 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
268 # Docstring inherited from the base class.
269 refs_known: dict[DatasetRef, bool] = {}
270 for datastore in self.datastores:
271 refs_known.update(datastore.knows_these(refs))
273 # No need to check in next datastore for refs that are known.
274 # We only update entries that were initially False.
275 refs = [ref for ref, known in refs_known.items() if not known]
277 return refs_known
279 def mexists(
280 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
281 ) -> dict[DatasetRef, bool]:
282 """Check the existence of multiple datasets at once.
284 Parameters
285 ----------
286 refs : iterable of `DatasetRef`
287 The datasets to be checked.
288 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
289 Optional mapping of datastore artifact to existence. Updated by
290 this method with details of all artifacts tested. Can be `None`
291 if the caller is not interested.
293 Returns
294 -------
295 existence : `dict` of [`DatasetRef`, `bool`]
296 Mapping from dataset to boolean indicating existence in any
297 of the child datastores.
298 """
299 dataset_existence: dict[DatasetRef, bool] = {}
300 for datastore in self.datastores:
301 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
303 # For next datastore no point asking about ones we know
304 # exist already. No special exemption for ephemeral datastores.
305 refs = [ref for ref, exists in dataset_existence.items() if not exists]
307 return dataset_existence
309 def exists(self, ref: DatasetRef) -> bool:
310 """Check if the dataset exists in one of the datastores.
312 Parameters
313 ----------
314 ref : `DatasetRef`
315 Reference to the required dataset.
317 Returns
318 -------
319 exists : `bool`
320 `True` if the entity exists in one of the child datastores.
321 """
322 for datastore in self.datastores:
323 if datastore.exists(ref):
324 log.debug("Found %s in datastore %s", ref, datastore.name)
325 return True
326 return False
328 def get(
329 self,
330 ref: DatasetRef,
331 parameters: Mapping[str, Any] | None = None,
332 storageClass: StorageClass | str | None = None,
333 ) -> Any:
334 """Load an InMemoryDataset from the store.
336 The dataset is returned from the first datastore that has
337 the dataset.
339 Parameters
340 ----------
341 ref : `DatasetRef`
342 Reference to the required Dataset.
343 parameters : `dict`
344 `StorageClass`-specific parameters that specify, for example,
345 a slice of the dataset to be loaded.
346 storageClass : `StorageClass` or `str`, optional
347 The storage class to be used to override the Python type
348 returned by this method. By default the returned type matches
349 the dataset type definition for this dataset. Specifying a
350 read `StorageClass` can force a different type to be returned.
351 This type must be compatible with the original type.
353 Returns
354 -------
355 inMemoryDataset : `object`
356 Requested dataset or slice thereof as an InMemoryDataset.
358 Raises
359 ------
360 FileNotFoundError
361 Requested dataset can not be retrieved.
362 TypeError
363 Return value from formatter has unexpected type.
364 ValueError
365 Formatter failed to process the dataset.
366 """
368 for datastore in self.datastores:
369 try:
370 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
371 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
372 return inMemoryObject
373 except FileNotFoundError:
374 pass
376 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores")
378 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
379 """Write a InMemoryDataset with a given `DatasetRef` to each
380 datastore.
382 The put() to child datastores can fail with
383 `DatasetTypeNotSupportedError`. The put() for this datastore will be
384 deemed to have succeeded so long as at least one child datastore
385 accepted the inMemoryDataset.
387 Parameters
388 ----------
389 inMemoryDataset : `object`
390 The dataset to store.
391 ref : `DatasetRef`
392 Reference to the associated Dataset.
394 Raises
395 ------
396 TypeError
397 Supplied object and storage class are inconsistent.
398 DatasetTypeNotSupportedError
399 All datastores reported `DatasetTypeNotSupportedError`.
400 """
401 log.debug("Put %s", ref)
403 # Confirm that we can accept this dataset
404 if not self.constraints.isAcceptable(ref):
405 # Raise rather than use boolean return value.
406 raise DatasetTypeNotSupportedError(
407 f"Dataset {ref} has been rejected by this datastore via configuration."
408 )
410 isPermanent = False
411 nsuccess = 0
412 npermanent = 0
413 nephemeral = 0
414 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
415 if (
416 constraints is not None and not constraints.isAcceptable(ref)
417 ) or not datastore.constraints.isAcceptable(ref):
418 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
419 continue
421 if datastore.isEphemeral:
422 nephemeral += 1
423 else:
424 npermanent += 1
425 try:
426 datastore.put(inMemoryDataset, ref)
427 nsuccess += 1
428 if not datastore.isEphemeral:
429 isPermanent = True
430 except DatasetTypeNotSupportedError:
431 pass
433 if nsuccess == 0:
434 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
436 if not isPermanent and npermanent > 0: 436 ↛ 437 (line 436 didn't jump to line 437, because the condition on line 436 was never true)
437 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
439 if self._transaction is not None:
440 self._transaction.registerUndo("put", self.remove, ref)
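# Put semantics recap (added comment): every child whose constraints accept
# the ref is attempted; the overall put succeeds if at least one child
# stores the dataset, a warning is emitted if only ephemeral children
# succeeded while a permanent child was attempted, and a successful put
# registers a single undo that removes the dataset from the whole chain.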
442 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
443 # Docstring inherited from base class.
444 if transfer != "auto":
445 return transfer
446 # Ask each datastore what they think auto means
447 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
449 # Remove any untranslated "auto" values
450 transfers.discard(transfer)
452 if len(transfers) == 1: 452 ↛ 453 (line 452 didn't jump to line 453, because the condition on line 452 was never true)
453 return transfers.pop()
454 if not transfers: 454 ↛ 458 (line 454 didn't jump to line 458, because the condition on line 454 was never false)
455 # Everything reported "auto"
456 return transfer
458 raise RuntimeError(
459 "Chained datastore does not yet support different transfer modes"
460 f" from 'auto' in each child datastore (wanted {transfers})"
461 )
463 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
464 # Docstring inherited from Datastore._prepIngest.
465 if transfer is None:
466 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
468 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
469 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
470 if not acceptable:
471 log.debug(
472 "Datastore %s skipping ingest via configuration for refs %s",
473 name,
474 ", ".join(str(ref) for ref in dataset.refs),
475 )
476 return False
477 else:
478 return True
480 # Filter down to just datasets the chained datastore's own
481 # configuration accepts.
482 okForParent: list[FileDataset] = [
483 dataset
484 for dataset in datasets
485 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
486 ]
488 # Iterate over nested datastores and call _prepIngest on each.
489 # Save the results to a list:
490 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
491 # ...and remember whether all of the failures are due to
492 # NotImplementedError being raised.
493 allFailuresAreNotImplementedError = True
494 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
495 okForChild: list[FileDataset]
496 if constraints is not None:
497 okForChild = [
498 dataset
499 for dataset in okForParent
500 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
501 ]
502 else:
503 okForChild = okForParent
504 try:
505 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
506 except NotImplementedError:
507 log.debug(
508 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
509 datastore.name,
510 transfer,
511 )
512 continue
513 allFailuresAreNotImplementedError = False
514 if okForChild:
515 # Do not store for later if a datastore has rejected
516 # everything.
517 # Include the source paths if this is a "move". It's clearer
518 # to find the paths now rather than try to infer how
519 # each datastore has stored them in the internal prep class.
520 paths = (
521 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set()
522 )
523 children.append((datastore, prepDataForChild, paths))
524 if allFailuresAreNotImplementedError:
525 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
526 return _IngestPrepData(children=children)
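# Added note: a child datastore that raises NotImplementedError for the
# requested transfer mode is simply skipped here; ingest preparation fails
# only when every child rejects the transfer mode that way.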
528 def _finishIngest(
529 self,
530 prepData: _IngestPrepData,
531 *,
532 transfer: str | None = None,
533 record_validation_info: bool = True,
534 ) -> None:
535 # Docstring inherited from Datastore._finishIngest.
536 # For "move" we must use "copy" and then delete the input
537 # data at the end. This has no rollback option if the ingest
538 # subsequently fails. If there is only one active datastore
539 # accepting any files we can leave it as "move"
540 actual_transfer: str | None
541 if transfer == "move" and len(prepData.children) > 1:
542 actual_transfer = "copy"
543 else:
544 actual_transfer = transfer
545 to_be_deleted: set[ResourcePath] = set()
546 for datastore, prepDataForChild, paths in prepData.children:
547 datastore._finishIngest(
548 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
549 )
550 to_be_deleted.update(paths)
551 if actual_transfer != transfer:
552 # These datasets were copied but now need to be deleted.
553 # This can not be rolled back.
554 for uri in to_be_deleted:
555 uri.remove()
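# Added note: with transfer="move" and more than one participating child,
# the files are copied into each child and the original source files are
# deleted afterwards; that final deletion cannot be rolled back if a later
# step fails.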
557 def getManyURIs(
558 self,
559 refs: Iterable[DatasetRef],
560 predict: bool = False,
561 allow_missing: bool = False,
562 ) -> dict[DatasetRef, DatasetRefURIs]:
563 # Docstring inherited
565 uris: dict[DatasetRef, DatasetRefURIs] = {}
566 missing_refs = set(refs)
568 # If predict is True we don't want to predict a dataset in the first
569 # datastore if it actually exists in a later datastore, so in that
570 # case check all datastores with predict=False first, and then try
571 # again with predict=True.
572 for p in (False, True) if predict else (False,):
573 if not missing_refs:
574 break
575 for datastore in self.datastores:
576 try:
577 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
578 except NotImplementedError:
579 # some datastores may not implement generating URIs
580 continue
581 missing_refs -= got_uris.keys()
582 uris.update(got_uris)
583 if not missing_refs:
584 break
586 if missing_refs and not allow_missing:
587 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
589 return uris
591 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
592 """Return URIs associated with dataset.
594 Parameters
595 ----------
596 ref : `DatasetRef`
597 Reference to the required dataset.
598 predict : `bool`, optional
599 If the datastore does not know about the dataset, should it
600 return a predicted URI or not?
602 Returns
603 -------
604 uris : `DatasetRefURIs`
605 The URI to the primary artifact associated with this dataset (if
606 the dataset was disassembled within the datastore this may be
607 `None`), and the URIs to any components associated with the dataset
608 artifact (can be empty if there are no components).
610 Notes
611 -----
612 The returned URI is from the first datastore in the list that has
613 the dataset with preference given to the first dataset coming from
614 a permanent datastore. If no datastores have the dataset and prediction
615 is allowed, the predicted URI for the first datastore in the list will
616 be returned.
617 """
618 log.debug("Requesting URIs for %s", ref)
619 predictedUri: DatasetRefURIs | None = None
620 predictedEphemeralUri: DatasetRefURIs | None = None
621 firstEphemeralUri: DatasetRefURIs | None = None
622 for datastore in self.datastores:
623 if datastore.exists(ref):
624 if not datastore.isEphemeral:
625 uri = datastore.getURIs(ref)
626 log.debug("Retrieved non-ephemeral URI: %s", uri)
627 return uri
628 elif not firstEphemeralUri:
629 firstEphemeralUri = datastore.getURIs(ref)
630 elif predict:
631 if not predictedUri and not datastore.isEphemeral:
632 predictedUri = datastore.getURIs(ref, predict)
633 elif not predictedEphemeralUri and datastore.isEphemeral:
634 predictedEphemeralUri = datastore.getURIs(ref, predict)
636 if firstEphemeralUri:
637 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
638 return firstEphemeralUri
640 if predictedUri:
641 log.debug("Retrieved predicted URI: %s", predictedUri)
642 return predictedUri
644 if predictedEphemeralUri:
645 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
646 return predictedEphemeralUri
648 raise FileNotFoundError(f"Dataset {ref} not in any datastore")
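# Resolution order recap (added comment): an existing artifact in a
# permanent child wins first, then an existing artifact in an ephemeral
# child, then (when predict=True) a predicted URI from a permanent child,
# and finally a predicted URI from an ephemeral child.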
650 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
651 """URI to the Dataset.
653 The returned URI is from the first datastore in the list that has
654 the dataset with preference given to the first dataset coming from
655 a permanent datastore. If no datastores have the dataset and prediction
656 is allowed, the predicted URI for the first datastore in the list will
657 be returned.
659 Parameters
660 ----------
661 ref : `DatasetRef`
662 Reference to the required Dataset.
663 predict : `bool`
664 If `True`, allow URIs to be returned of datasets that have not
665 been written.
667 Returns
668 -------
669 uri : `lsst.resources.ResourcePath`
670 URI pointing to the dataset within the datastore. If the
671 dataset does not exist in the datastore, and if ``predict`` is
672 `True`, the URI will be a prediction and will include a URI
673 fragment "#predicted".
675 Notes
676 -----
677 If the datastore does not have entities that relate well
678 to the concept of a URI the returned URI string will be
679 descriptive. The returned URI is not guaranteed to be obtainable.
681 Raises
682 ------
683 FileNotFoundError
684 A URI has been requested for a dataset that does not exist and
685 guessing is not allowed.
686 RuntimeError
687 Raised if a request is made for a single URI but multiple URIs
688 are associated with this dataset.
689 """
690 log.debug("Requesting URI for %s", ref)
691 primary, components = self.getURIs(ref, predict)
692 if primary is None or components: 692 ↛ 693 (line 692 didn't jump to line 693, because the condition on line 692 was never true)
693 raise RuntimeError(
694 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
695 )
696 return primary
698 def retrieveArtifacts(
699 self,
700 refs: Iterable[DatasetRef],
701 destination: ResourcePath,
702 transfer: str = "auto",
703 preserve_path: bool = True,
704 overwrite: bool = False,
705 ) -> list[ResourcePath]:
706 """Retrieve the file artifacts associated with the supplied refs.
708 Parameters
709 ----------
710 refs : iterable of `DatasetRef`
711 The datasets for which file artifacts are to be retrieved.
712 A single ref can result in multiple files. The refs must
713 be resolved.
714 destination : `lsst.resources.ResourcePath`
715 Location to write the file artifacts.
716 transfer : `str`, optional
717 Method to use to transfer the artifacts. Must be one of the options
718 supported by `lsst.resources.ResourcePath.transfer_from()`.
719 "move" is not allowed.
720 preserve_path : `bool`, optional
721 If `True` the full path of the file artifact within the datastore
722 is preserved. If `False` the final file component of the path
723 is used.
724 overwrite : `bool`, optional
725 If `True` allow transfers to overwrite existing files at the
726 destination.
728 Returns
729 -------
730 targets : `list` of `lsst.resources.ResourcePath`
731 URIs of file artifacts in destination location. Order is not
732 preserved.
733 """
734 if not destination.isdir(): 734 ↛ 735 (line 734 didn't jump to line 735, because the condition on line 734 was never true)
735 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
737 # Using getURIs is not feasible since it becomes difficult to
738 # determine the path within the datastore later on. For now
739 # follow the getURIs implementation approach.
741 pending = set(refs)
743 # There is a question as to whether an exception should be raised
744 # early if some of the refs are missing, or whether files should be
745 # transferred until a problem is hit. Prefer to complain up front.
746 # Use the datastore integer as primary key.
747 grouped_by_datastore: dict[int, set[DatasetRef]] = {}
749 for number, datastore in enumerate(self.datastores):
750 if datastore.isEphemeral:
751 # In the future we will want to distinguish in-memory from
752 # caching datastore since using an on-disk local
753 # cache is exactly what we should be doing.
754 continue
755 try:
756 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
757 except NotImplementedError:
758 # Some datastores may not support retrieving artifacts
759 continue
761 if datastore_refs:
762 grouped_by_datastore[number] = datastore_refs
764 # Remove these from the pending list so that we do not bother
765 # looking for them any more.
766 pending = pending - datastore_refs
768 if pending: 768 ↛ 769 (line 768 didn't jump to line 769, because the condition on line 768 was never true)
769 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
771 # Now do the transfer.
772 targets: list[ResourcePath] = []
773 for number, datastore_refs in grouped_by_datastore.items():
774 targets.extend(
775 self.datastores[number].retrieveArtifacts(
776 datastore_refs,
777 destination,
778 transfer=transfer,
779 preserve_path=preserve_path,
780 overwrite=overwrite,
781 )
782 )
784 return targets
786 def remove(self, ref: DatasetRef) -> None:
787 """Indicate to the datastore that a dataset can be removed.
789 The dataset will be removed from each datastore. The dataset is
790 not required to exist in every child datastore.
792 Parameters
793 ----------
794 ref : `DatasetRef`
795 Reference to the required dataset.
797 Raises
798 ------
799 FileNotFoundError
800 Attempt to remove a dataset that does not exist. Raised if none
801 of the child datastores removed the dataset.
802 """
803 log.debug("Removing %s", ref)
804 self.trash(ref, ignore_errors=False)
805 self.emptyTrash(ignore_errors=False)
807 def forget(self, refs: Iterable[DatasetRef]) -> None:
808 for datastore in tuple(self.datastores):
809 datastore.forget(refs)
811 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
812 if isinstance(ref, DatasetRef):
813 ref_label = str(ref)
814 else:
815 ref_label = "bulk datasets"
817 log.debug("Trashing %s", ref_label)
819 counter = 0
820 for datastore in self.datastores:
821 try:
822 datastore.trash(ref, ignore_errors=ignore_errors)
823 counter += 1
824 except FileNotFoundError:
825 pass
827 if counter == 0:
828 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
829 if ignore_errors: 829 ↛ 830 (line 829 didn't jump to line 830, because the condition on line 829 was never true)
830 log.warning(err_msg)
831 else:
832 raise FileNotFoundError(err_msg)
834 def emptyTrash(self, ignore_errors: bool = True) -> None:
835 for datastore in self.datastores:
836 datastore.emptyTrash(ignore_errors=ignore_errors)
838 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
839 """Retrieve a dataset from an input `Datastore`,
840 and store the result in this `Datastore`.
842 Parameters
843 ----------
844 inputDatastore : `Datastore`
845 The external `Datastore` from which to retrieve the Dataset.
846 ref : `DatasetRef`
847 Reference to the required dataset in the input data store.
849 Notes
850 -----
851 The dataset is retrieved once from ``inputDatastore`` and the
852 result is stored via ``put()``, which forwards it to every
853 accepting child datastore. Nothing is returned.
854 """
855 assert inputDatastore is not self # unless we want it for renames?
856 inMemoryDataset = inputDatastore.get(ref)
857 self.put(inMemoryDataset, ref)
859 def validateConfiguration(
860 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
861 ) -> None:
862 """Validate some of the configuration for this datastore.
864 Parameters
865 ----------
866 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
867 Entities to test against this configuration. Can be differing
868 types.
869 logFailures : `bool`, optional
870 If `True`, output a log message for every validation error
871 detected.
873 Raises
874 ------
875 DatastoreValidationError
876 Raised if there is a validation problem with a configuration.
877 All the problems are reported in a single exception.
879 Notes
880 -----
881 This method checks each datastore in turn.
882 """
884 # Need to catch each of the datastore outputs and ensure that
885 # all are tested.
886 failures = []
887 for datastore in self.datastores:
888 try:
889 datastore.validateConfiguration(entities, logFailures=logFailures)
890 except DatastoreValidationError as e:
891 if logFailures: 891 ↛ 893 (line 891 didn't jump to line 893, because the condition on line 891 was never false)
892 log.critical("Datastore %s failed validation", datastore.name)
893 failures.append(f"Datastore {self.name}: {e}")
895 if failures:
896 msg = ";\n".join(failures)
897 raise DatastoreValidationError(msg)
899 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
900 # Docstring is inherited from base class
901 failures = []
902 for datastore in self.datastores:
903 try:
904 datastore.validateKey(lookupKey, entity)
905 except DatastoreValidationError as e:
906 failures.append(f"Datastore {self.name}: {e}")
908 if failures:
909 msg = ";\n".join(failures)
910 raise DatastoreValidationError(msg)
912 def getLookupKeys(self) -> set[LookupKey]:
913 # Docstring is inherited from base class
914 keys = set()
915 for datastore in self.datastores:
916 keys.update(datastore.getLookupKeys())
918 keys.update(self.constraints.getLookupKeys())
919 for p in self.datastoreConstraints:
920 if p is not None: 920 ↛ 919 (line 920 didn't jump to line 919, because the condition on line 920 was never false)
921 keys.update(p.getLookupKeys())
923 return keys
925 def needs_expanded_data_ids(
926 self,
927 transfer: str | None,
928 entity: DatasetRef | DatasetType | StorageClass | None = None,
929 ) -> bool:
930 # Docstring inherited.
931 # We can't safely use `self.datastoreConstraints` with `entity` to
932 # check whether a child datastore would even want to ingest this
933 # dataset, because we don't want to filter out datastores that might
934 # need an expanded data ID based on incomplete information (e.g. we
935 # pass a StorageClass, but the constraint dispatches on DatasetType).
936 # So we pessimistically check if any datastore would need an expanded
937 # data ID for this transfer mode.
938 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores)
940 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
941 # Docstring inherited from the base class.
943 for datastore in self.datastores:
944 datastore.import_records(data)
946 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
947 # Docstring inherited from the base class.
949 all_records: dict[str, DatastoreRecordData] = {}
951 # Merge all sub-datastore records into one structure
952 for datastore in self.datastores:
953 sub_records = datastore.export_records(refs)
954 for name, record_data in sub_records.items():
955 # All datastore names must be unique in a chain.
956 if name in all_records: 956 ↛ 957 (line 956 didn't jump to line 957, because the condition on line 956 was never true)
957 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
958 all_records[name] = record_data
960 return all_records
962 def export(
963 self,
964 refs: Iterable[DatasetRef],
965 *,
966 directory: ResourcePathExpression | None = None,
967 transfer: str | None = "auto",
968 ) -> Iterable[FileDataset]:
969 # Docstring inherited from Datastore.export.
970 if transfer == "auto" and directory is None:
971 transfer = None
973 if transfer is not None and directory is None:
974 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
976 if transfer == "move":
977 raise TypeError("Can not export by moving files out of datastore.")
979 # Exporting from a chain has the potential for a dataset to be
980 # in one or more of the datastores in the chain. We only need one
981 # of them since we assume the datasets are the same in all (but
982 # the file format could be different of course since that is a
983 # per-datastore configuration).
984 # We also do not know whether any of the datastores in the chain
985 # support file export.
987 # Ensure we have an ordered sequence that is not an iterator or set.
988 if not isinstance(refs, Sequence):
989 refs = list(refs)
991 # If any of the datasets are missing entirely we need to raise early
992 # before we try to run the export. This can be a little messy but is
993 # better than exporting files from the first datastore and then
994 # discovering that a dataset is missing from every datastore.
995 known = [datastore.knows_these(refs) for datastore in self.datastores]
996 refs_known: set[DatasetRef] = set()
997 for known_to_this in known:
998 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
999 missing_count = len(refs) - len(refs_known)
1000 if missing_count:
1001 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")
1003 # To allow us to slot each result into the right place after
1004 # asking each datastore, create a dict with the index.
1005 ref_positions = {ref: i for i, ref in enumerate(refs)}
1007 # Presize the final export list.
1008 exported: list[FileDataset | None] = [None] * len(refs)
1010 # The order of the returned dataset has to match the order of the
1011 # given refs, even if they are all from different datastores.
1012 for i, datastore in enumerate(self.datastores):
1013 known_to_this = known[i]
1014 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]
1016 try:
1017 this_export = datastore.export(filtered, directory=directory, transfer=transfer)
1018 except NotImplementedError:
1019 # Try the next datastore.
1020 continue
1022 for ref, export in zip(filtered, this_export):
1023 # Get the position and also delete it from the list.
1024 exported[ref_positions.pop(ref)] = export
1026 # Every dataset should be accounted for because of the earlier checks
1027 # but make sure that we did fill all the slots to appease mypy.
1028 for i, dataset in enumerate(exported):
1029 if dataset is None: 1029 ↛ 1030 (line 1029 didn't jump to line 1030, because the condition on line 1029 was never true)
1030 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
1031 yield dataset
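# Added note: the yielded FileDataset entries follow the order of the input
# refs even when consecutive entries are produced by different child
# datastores.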
1033 def transfer_from(
1034 self,
1035 source_datastore: Datastore,
1036 refs: Iterable[DatasetRef],
1037 transfer: str = "auto",
1038 artifact_existence: dict[ResourcePath, bool] | None = None,
1039 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1040 # Docstring inherited
1041 # mypy does not understand "type(self) is not type(source)"
1042 if isinstance(source_datastore, ChainedDatastore):
1043 # Both the source and destination are chained datastores.
1044 source_datastores = tuple(source_datastore.datastores)
1045 else:
1046 # The source datastore is different, forward everything to the
1047 # child datastores.
1048 source_datastores = tuple([source_datastore])
1050 # Need to know the set of all possible refs that could be transferred.
1051 remaining_refs = set(refs)
1053 missing_from_source: set[DatasetRef] | None = None
1054 all_accepted = set()
1055 nsuccess = 0
1056 for source_child in source_datastores:
1057 # If we are reading from a chained datastore, it's possible that
1058 # only a subset of the datastores know about the dataset. We can't
1059 # ask the receiving datastore to copy it when it doesn't exist
1060 # so we have to filter again based on what the source datastore
1061 # understands.
1062 known_to_source = source_child.knows_these([ref for ref in refs])
1064 # If trust is enabled there is a possibility that some of these
1065 # datasets exist even though they are unknown to the source
1066 # datastore, so check artifact existence for the unknown refs.
1067 if getattr(source_child, "trustGetRequest", False):
1068 unknown = [ref for ref, known in known_to_source.items() if not known]
1069 existence = source_child.mexists(unknown, artifact_existence)
1070 for ref, exists in existence.items():
1071 known_to_source[ref] = exists
1073 missing = {ref for ref, known in known_to_source.items() if not known}
1074 if missing:
1075 if missing_from_source is None:
1076 missing_from_source = missing
1077 else:
1078 missing_from_source &= missing
1080 # Try to transfer from each source datastore to each child
1081 # datastore. Have to make sure we don't transfer something
1082 # we've already transferred to this destination on later passes.
1084 # Filter the initial list based on the datasets we have
1085 # not yet transferred.
1086 these_refs = []
1087 for ref in refs:
1088 if ref in remaining_refs and known_to_source[ref]:
1089 these_refs.append(ref)
1091 if not these_refs:
1092 # Already transferred all datasets known to this datastore.
1093 continue
1095 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
1096 if constraints is not None: 1096 ↛ 1104 (line 1096 didn't jump to line 1104, because the condition on line 1096 was never false)
1097 filtered_refs = []
1098 for ref in these_refs:
1099 if constraints.isAcceptable(ref):
1100 filtered_refs.append(ref)
1101 else:
1102 log.debug("Rejecting ref by constraints: %s", ref)
1103 else:
1104 filtered_refs = [ref for ref in these_refs]
1105 try:
1106 accepted, _ = datastore.transfer_from(
1107 source_child, filtered_refs, transfer, artifact_existence
1108 )
1109 except (TypeError, NotImplementedError):
1110 # The datastores were incompatible.
1111 continue
1112 else:
1113 nsuccess += 1
1115 # Remove the accepted datasets from those remaining.
1116 remaining_refs = remaining_refs - accepted
1118 # Keep track of everything we have accepted.
1119 all_accepted.update(accepted)
1121 if missing_from_source:
1122 for ref in missing_from_source:
1123 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref)
1125 if nsuccess == 0: 1125 ↛ 1126 (line 1125 didn't jump to line 1126, because the condition on line 1125 was never true)
1126 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}")
1128 return all_accepted, remaining_refs
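# Illustrative usage sketch (not in the original source; the datastore and
# ref objects are assumed to exist elsewhere):
#
#   accepted, remaining = chained_datastore.transfer_from(
#       source_datastore, refs, transfer="copy"
#   )
#   # ``accepted`` holds refs stored by at least one child datastore;
#   # ``remaining`` holds refs that were not transferred anywhere.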