Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 91%
329 statements
coverage.py v6.5.0, created at 2022-10-26 02:01 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24"""Chained datastore."""
26__all__ = ("ChainedDatastore",)
28import itertools
29import logging
30import time
31import warnings
32from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union
34from lsst.daf.butler import (
35 Constraints,
36 DatasetRef,
37 DatasetRefURIs,
38 DatasetTypeNotSupportedError,
39 Datastore,
40 DatastoreConfig,
41 DatastoreRecordData,
42 DatastoreValidationError,
43 FileDataset,
44)
45from lsst.resources import ResourcePath
46from lsst.utils import doImportType
48if TYPE_CHECKING: 48 ↛ 49: line 48 didn't jump to line 49, because the condition on line 48 was never true
49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
52log = logging.getLogger(__name__)
55class _IngestPrepData(Datastore.IngestPrepData):
56 """Helper class for ChainedDatastore ingest implementation.
58 Parameters
59 ----------
60 children : `list` of `tuple`
61 Triples of `Datastore`, `IngestPrepData`, and the set of source paths needed for "move" cleanup, for all child datastores.
62 """
64 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
65 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
66 self.children = children
69class ChainedDatastore(Datastore):
70 """Chained Datastores to allow read and writes from multiple datastores.
72 A ChainedDatastore is configured with multiple datastore configurations.
73 A ``put()`` is sent to each child datastore that accepts the dataset. A ``get()``
74 operation is sent to each datastore in turn and the first datastore
75 to return a valid dataset is used.
77 Parameters
78 ----------
79 config : `DatastoreConfig` or `str`
80 Configuration. This configuration must include a ``datastores`` field
81 as a sequence of datastore configurations. The order in this sequence
82 indicates the order to use for read operations.
83 bridgeManager : `DatastoreRegistryBridgeManager`
84 Object that manages the interface between `Registry` and datastores.
85 butlerRoot : `str`, optional
86 New datastore root to use to override the configuration value. This
87 root is sent to each child datastore.
89 Notes
90 -----
91 ChainedDatastore never supports `None` as an `ingest` transfer mode. A
92 `"move"` is accepted: with more than one accepting child it is performed as
93 per-child copies with the source files removed afterwards. It supports `"copy"`, `"symlink"`, `"relsymlink"` and `"hardlink"` if and only if all its child datastores do.
94 """
96 defaultConfigFile = "datastores/chainedDatastore.yaml"
97 """Path to configuration defaults. Accessed within the ``configs`` resource
98 or relative to a search path. Can be None if no defaults specified.
99 """
101 containerKey = "datastores"
102 """Key to specify where child datastores are configured."""
104 datastores: List[Datastore]
105 """All the child datastores known to this datastore."""
107 datastoreConstraints: Sequence[Optional[Constraints]]
108 """Constraints to be applied to each of the child datastores."""
110 @classmethod
111 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
112 """Set any filesystem-dependent config options for child Datastores to
113 be appropriate for a new empty repository with the given root.
115 Parameters
116 ----------
117 root : `str`
118 Filesystem path to the root of the data repository.
119 config : `Config`
120 A `Config` to update. Only the subset understood by
121 this component will be updated. Will not expand
122 defaults.
123 full : `Config`
124 A complete config with all defaults expanded that can be
125 converted to a `DatastoreConfig`. Read-only and will not be
126 modified by this method.
127 Repository-specific options that should not be obtained
128 from defaults when Butler instances are constructed
129 should be copied from ``full`` to ``config``.
130 overwrite : `bool`, optional
131 If `False`, do not modify a value in ``config`` if the value
132 already exists. Default is always to overwrite with the provided
133 ``root``.
135 Notes
136 -----
137 If a keyword is explicitly defined in the supplied ``config`` it
138 will not be overridden by this method if ``overwrite`` is `False`.
139 This allows explicit values set in external configs to be retained.
140 """
142 # Extract the part of the config we care about updating
143 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
145 # And the subset of the full config that we can use for reference.
146 # Do not bother with defaults because we are told this already has
147 # them.
148 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
150 # Loop over each datastore config and pass the subsets to the
151 # child datastores to process.
153 containerKey = cls.containerKey
154 for idx, (child, fullChild) in enumerate(
155 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey])
156 ):
157 childConfig = DatastoreConfig(child, mergeDefaults=False)
158 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
159 datastoreClass = doImportType(fullChildConfig["cls"])
160 if not issubclass(datastoreClass, Datastore): 160 ↛ 161: line 160 didn't jump to line 161, because the condition on line 160 was never true
161 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
162 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
163 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
165 # Reattach to parent
166 datastoreConfig[containerKey, idx] = childConfig
168 # Reattach modified datastore config to parent
169 # If this has a datastore key we attach there, otherwise we assume
170 # this information goes at the top of the config hierarchy.
171 if DatastoreConfig.component in config:
172 config[DatastoreConfig.component] = datastoreConfig
173 else:
174 config.update(datastoreConfig)
176 return
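    # Standalone sketch (hypothetical helper, not part of the class) of the
    # per-child root naming used by setConfigRoot above: each child gets
    # "{root}/{ClassName}_{idx}" so the children never share a root.
    def _example_child_roots(root, child_class_names):
        """Return the per-child roots derived from ``root`` and the child classes."""
        return [f"{root}/{name}_{idx}" for idx, name in enumerate(child_class_names)]

    # _example_child_roots("/repo", ["FileDatastore", "InMemoryDatastore"])
    # -> ['/repo/FileDatastore_0', '/repo/InMemoryDatastore_1']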
178 def __init__(
179 self,
180 config: Union[Config, str],
181 bridgeManager: DatastoreRegistryBridgeManager,
182 butlerRoot: Optional[str] = None,
183 ):
184 super().__init__(config, bridgeManager)
186 # Scan for child datastores and instantiate them with the same registry
187 self.datastores = []
188 for c in self.config["datastores"]:
189 c = DatastoreConfig(c)
190 datastoreType = doImportType(c["cls"])
191 if not issubclass(datastoreType, Datastore): 191 ↛ 192: line 191 didn't jump to line 192, because the condition on line 191 was never true
192 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
193 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
194 log.debug("Creating child datastore %s", datastore.name)
195 self.datastores.append(datastore)
197 # Name ourself based on our children
198 if self.datastores: 198 ↛ 203: line 198 didn't jump to line 203, because the condition on line 198 was never false
199 # We must set the names explicitly
200 self._names = [d.name for d in self.datastores]
201 childNames = ",".join(self.names)
202 else:
203 childNames = "(empty@{})".format(time.time())
204 self._names = [childNames]
205 self.name = "{}[{}]".format(type(self).__qualname__, childNames)
207 # We declare we are ephemeral if all our child datastores declare
208 # they are ephemeral
209 isEphemeral = True
210 for d in self.datastores:
211 if not d.isEphemeral:
212 isEphemeral = False
213 break
214 self.isEphemeral = isEphemeral
216 # per-datastore override constraints
217 if "datastore_constraints" in self.config:
218 overrides = self.config["datastore_constraints"]
220 if len(overrides) != len(self.datastores): 220 ↛ 221: line 220 didn't jump to line 221, because the condition on line 220 was never true
221 raise DatastoreValidationError(
222 f"Number of registered datastores ({len(self.datastores)})"
223 " differs from number of constraints overrides"
224 f" {len(overrides)}"
225 )
227 self.datastoreConstraints = [
228 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
229 ]
231 else:
232 self.datastoreConstraints = (None,) * len(self.datastores)
234 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
236 @property
237 def names(self) -> Tuple[str, ...]:
238 return tuple(self._names)
240 def __str__(self) -> str:
241 chainName = ", ".join(str(ds) for ds in self.datastores)
242 return chainName
244 def knows(self, ref: DatasetRef) -> bool:
245 """Check if the dataset is known to any of the datastores.
247 Does not check for existence of any artifact.
249 Parameters
250 ----------
251 ref : `DatasetRef`
252 Reference to the required dataset.
254 Returns
255 -------
256 exists : `bool`
257 `True` if the dataset is known to the datastore.
258 """
259 for datastore in self.datastores:
260 if datastore.knows(ref):
261 log.debug("%s known to datastore %s", ref, datastore.name)
262 return True
263 return False
265 def mexists(
266 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None
267 ) -> Dict[DatasetRef, bool]:
268 """Check the existence of multiple datasets at once.
270 Parameters
271 ----------
272 refs : iterable of `DatasetRef`
273 The datasets to be checked.
274 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
275 Optional mapping of datastore artifact to existence. Updated by
276 this method with details of all artifacts tested. Can be `None`
277 if the caller is not interested.
279 Returns
280 -------
281 existence : `dict` of [`DatasetRef`, `bool`]
282 Mapping from dataset to boolean indicating existence in any
283 of the child datastores.
284 """
285 dataset_existence: Dict[DatasetRef, bool] = {}
286 for datastore in self.datastores:
287 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
289 # For the next datastore there is no point asking about refs we
290 # already know exist. No special exemption for ephemeral datastores.
291 refs = [ref for ref, exists in dataset_existence.items() if not exists]
293 return dataset_existence
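    # Sketch of the narrowing pattern used by mexists() above (hypothetical
    # helper; each child datastore is modelled as a plain set of the dataset
    # names it holds, not the real Datastore API).
    def _example_mexists(children, refs):
        """Ask each child in turn, dropping refs already known to exist."""
        existence = {}
        remaining = list(refs)
        for held in children:
            existence.update({ref: ref in held for ref in remaining})
            remaining = [ref for ref, found in existence.items() if not found]
        return existence

    # _example_mexists([{"a"}, {"b"}], ["a", "b", "c"])
    # -> {'a': True, 'b': True, 'c': False}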
295 def exists(self, ref: DatasetRef) -> bool:
296 """Check if the dataset exists in one of the datastores.
298 Parameters
299 ----------
300 ref : `DatasetRef`
301 Reference to the required dataset.
303 Returns
304 -------
305 exists : `bool`
306 `True` if the entity exists in one of the child datastores.
307 """
308 for datastore in self.datastores:
309 if datastore.exists(ref):
310 log.debug("Found %s in datastore %s", ref, datastore.name)
311 return True
312 return False
314 def get(
315 self,
316 ref: DatasetRef,
317 parameters: Optional[Mapping[str, Any]] = None,
318 storageClass: Optional[Union[StorageClass, str]] = None,
319 ) -> Any:
320 """Load an InMemoryDataset from the store.
322 The dataset is returned from the first datastore that has
323 the dataset.
325 Parameters
326 ----------
327 ref : `DatasetRef`
328 Reference to the required Dataset.
329 parameters : `dict`
330 `StorageClass`-specific parameters that specify, for example,
331 a slice of the dataset to be loaded.
332 storageClass : `StorageClass` or `str`, optional
333 The storage class to be used to override the Python type
334 returned by this method. By default the returned type matches
335 the dataset type definition for this dataset. Specifying a
336 read `StorageClass` can force a different type to be returned.
337 This type must be compatible with the original type.
339 Returns
340 -------
341 inMemoryDataset : `object`
342 Requested dataset or slice thereof as an InMemoryDataset.
344 Raises
345 ------
346 FileNotFoundError
347 Requested dataset can not be retrieved.
348 TypeError
349 Return value from formatter has unexpected type.
350 ValueError
351 Formatter failed to process the dataset.
352 """
354 for datastore in self.datastores:
355 try:
356 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
357 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
358 return inMemoryObject
359 except FileNotFoundError:
360 pass
362 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))
364 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
365 """Write a InMemoryDataset with a given `DatasetRef` to each
366 datastore.
368 The put() to child datastores can fail with
369 `DatasetTypeNotSupportedError`. The put() for this datastore will be
370 deemed to have succeeded so long as at least one child datastore
371 accepted the inMemoryDataset.
373 Parameters
374 ----------
375 inMemoryDataset : `object`
376 The dataset to store.
377 ref : `DatasetRef`
378 Reference to the associated Dataset.
380 Raises
381 ------
382 TypeError
383 Supplied object and storage class are inconsistent.
384 DatasetTypeNotSupportedError
385 All datastores reported `DatasetTypeNotSupportedError`.
386 """
387 log.debug("Put %s", ref)
389 # Confirm that we can accept this dataset
390 if not self.constraints.isAcceptable(ref):
391 # Raise rather than use boolean return value.
392 raise DatasetTypeNotSupportedError(
393 f"Dataset {ref} has been rejected by this datastore via configuration."
394 )
396 isPermanent = False
397 nsuccess = 0
398 npermanent = 0
399 nephemeral = 0
400 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
401 if constraints is not None and not constraints.isAcceptable(ref):
402 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
403 continue
405 if datastore.isEphemeral:
406 nephemeral += 1
407 else:
408 npermanent += 1
409 try:
410 datastore.put(inMemoryDataset, ref)
411 nsuccess += 1
412 if not datastore.isEphemeral:
413 isPermanent = True
414 except DatasetTypeNotSupportedError:
415 pass
417 if nsuccess == 0:
418 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
420 if not isPermanent and npermanent > 0: 420 ↛ 421: line 420 didn't jump to line 421, because the condition on line 420 was never true
421 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
423 if self._transaction is not None:
424 self._transaction.registerUndo("put", self.remove, ref)
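    # Sketch of the broadcast-with-constraints behaviour of put() (hypothetical
    # helper; RuntimeError stands in for DatasetTypeNotSupportedError). Each
    # child is a (storage_dict, accepts_callable, is_ephemeral) triple.
    def _example_chained_put(children, ref, value):
        """Store in every accepting child; fail only if no child accepts."""
        nsuccess = 0
        stored_permanently = False
        for storage, accepts, is_ephemeral in children:
            if not accepts(ref):
                continue
            storage[ref] = value
            nsuccess += 1
            if not is_ephemeral:
                stored_permanently = True
        if nsuccess == 0:
            raise RuntimeError(f"None of the chained datastores supported ref {ref}")
        return stored_permanently

    # _example_chained_put([({}, lambda r: True, True), ({}, lambda r: False, False)], "bias", 1)
    # -> False  (stored, but only in an ephemeral child)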
426 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> Optional[str]:
427 # Docstring inherited from base class.
428 if transfer != "auto":
429 return transfer
430 # Ask each datastore what they think auto means
431 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
433 # Remove any untranslated "auto" values
434 transfers.discard(transfer)
436 if len(transfers) == 1: 436 ↛ 437: line 436 didn't jump to line 437, because the condition on line 436 was never true
437 return transfers.pop()
438 if not transfers: 438 ↛ 442: line 438 didn't jump to line 442, because the condition on line 438 was never false
439 # Everything reported "auto"
440 return transfer
442 raise RuntimeError(
443 "Chained datastore does not yet support different transfer modes"
444 f" from 'auto' in each child datastore (wanted {transfers})"
445 )
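    # Sketch of the "auto" resolution above (hypothetical helper): collect each
    # child's proposal, ignore "auto", and require the remainder to agree.
    def _example_resolve_auto(child_answers, requested="auto"):
        """Return the single agreed transfer mode, or ``requested`` if all children said "auto"."""
        modes = set(child_answers)
        modes.discard(requested)
        if len(modes) == 1:
            return modes.pop()
        if not modes:
            return requested
        raise RuntimeError(f"Children want different transfer modes: {modes}")

    # _example_resolve_auto(["auto", "copy", "copy"]) -> 'copy'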
447 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
448 # Docstring inherited from Datastore._prepIngest.
449 if transfer is None:
450 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
452 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
453 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
454 if not acceptable:
455 log.debug(
456 "Datastore %s skipping ingest via configuration for refs %s",
457 name,
458 ", ".join(str(ref) for ref in dataset.refs),
459 )
460 return False
461 else:
462 return True
464 # Filter down to just datasets the chained datastore's own
465 # configuration accepts.
466 okForParent: List[FileDataset] = [
467 dataset
468 for dataset in datasets
469 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
470 ]
472 # Iterate over nested datastores and call _prepIngest on each.
473 # Save the results to a list:
474 children: List[Tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
475 # ...and remember whether all of the failures are due to
476 # NotImplementedError being raised.
477 allFailuresAreNotImplementedError = True
478 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
479 okForChild: List[FileDataset]
480 if constraints is not None:
481 okForChild = [
482 dataset
483 for dataset in okForParent
484 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
485 ]
486 else:
487 okForChild = okForParent
488 try:
489 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
490 except NotImplementedError:
491 log.debug(
492 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
493 datastore.name,
494 transfer,
495 )
496 continue
497 allFailuresAreNotImplementedError = False
498 if okForChild:
499 # Do not store for later if a datastore has rejected
500 # everything.
501 # Include the source paths if this is a "move". It's clearer
502 # to find the paths now rather than try to infer how
503 # each datastore has stored them in the internal prep class.
504 paths = (
505 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set()
506 )
507 children.append((datastore, prepDataForChild, paths))
508 if allFailuresAreNotImplementedError:
509 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
510 return _IngestPrepData(children=children)
512 def _finishIngest(
513 self,
514 prepData: _IngestPrepData,
515 *,
516 transfer: Optional[str] = None,
517 record_validation_info: bool = True,
518 ) -> None:
519 # Docstring inherited from Datastore._finishIngest.
520 # For "move" we must use "copy" and then delete the input
521 # data at the end. This has no rollback option if the ingest
522 # subsequently fails. If there is only one active datastore
523 # accepting any files we can leave it as "move"
524 actual_transfer: str | None
525 if transfer == "move" and len(prepData.children) > 1:
526 actual_transfer = "copy"
527 else:
528 actual_transfer = transfer
529 to_be_deleted: set[ResourcePath] = set()
530 for datastore, prepDataForChild, paths in prepData.children:
531 datastore._finishIngest(
532 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
533 )
534 to_be_deleted.update(paths)
535 if actual_transfer != transfer:
536 # These datasets were copied but now need to be deleted.
537 # This can not be rolled back.
538 for uri in to_be_deleted:
539 uri.remove()
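    # Sketch of the transfer-mode adjustment made for ingest (hypothetical
    # helper): a "move" into more than one child is done as per-child "copy",
    # and the chain itself deletes the source files afterwards.
    def _example_ingest_transfer(transfer, n_children):
        """Return (per-child transfer mode, whether the chain must delete sources)."""
        if transfer == "move" and n_children > 1:
            return "copy", True
        return transfer, False

    # _example_ingest_transfer("move", 2) -> ('copy', True)
    # _example_ingest_transfer("move", 1) -> ('move', False)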
541 def getManyURIs(
542 self,
543 refs: Iterable[DatasetRef],
544 predict: bool = False,
545 allow_missing: bool = False,
546 ) -> Dict[DatasetRef, DatasetRefURIs]:
547 # Docstring inherited
549 uris: Dict[DatasetRef, DatasetRefURIs] = {}
550 missing_refs = set(refs)
552 # If predict is True we don't want to predict a dataset in the first
553 # datastore if it actually exists in a later datastore, so in that
554 # case check all datastores with predict=False first, and then try
555 # again with predict=True.
556 for p in (False, True) if predict else (False,):
557 if not missing_refs:
558 break
559 for datastore in self.datastores:
560 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
561 missing_refs -= got_uris.keys()
562 uris.update(got_uris)
563 if not missing_refs:
564 break
566 if missing_refs and not allow_missing:
567 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
569 return uris
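    # Sketch of the two-pass lookup in getManyURIs() (hypothetical helper;
    # children are dicts mapping ref -> URI string, and a predicted URI is
    # simply tagged with "#predicted").
    def _example_many_uris(children, refs, predict=False):
        """Resolve real URIs across all children before predicting any."""
        uris = {}
        missing = set(refs)
        for use_predict in ((False, True) if predict else (False,)):
            for child in children:
                for ref in list(missing):
                    if ref in child:
                        uris[ref] = child[ref]
                        missing.discard(ref)
                    elif use_predict:
                        uris[ref] = f"{ref}#predicted"
                        missing.discard(ref)
        return uris, missing

    # _example_many_uris([{"a": "file:///a"}], ["a", "b"], predict=True)
    # -> ({'a': 'file:///a', 'b': 'b#predicted'}, set())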
571 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
572 """Return URIs associated with dataset.
574 Parameters
575 ----------
576 ref : `DatasetRef`
577 Reference to the required dataset.
578 predict : `bool`, optional
579 If the datastore does not know about the dataset, should it
580 return a predicted URI or not?
582 Returns
583 -------
584 uris : `DatasetRefURIs`
585 The URI to the primary artifact associated with this dataset (if
586 the dataset was disassembled within the datastore this may be
587 `None`), and the URIs to any components associated with the dataset
588 artifact (can be empty if there are no components).
590 Notes
591 -----
592 The returned URI is from the first datastore in the list that has
593 the dataset with preference given to the first dataset coming from
594 a permanent datastore. If no datastores have the dataset and prediction
595 is allowed, the predicted URI for the first datastore in the list will
596 be returned.
597 """
598 log.debug("Requesting URIs for %s", ref)
599 predictedUri: Optional[DatasetRefURIs] = None
600 predictedEphemeralUri: Optional[DatasetRefURIs] = None
601 firstEphemeralUri: Optional[DatasetRefURIs] = None
602 for datastore in self.datastores:
603 if datastore.exists(ref):
604 if not datastore.isEphemeral:
605 uri = datastore.getURIs(ref)
606 log.debug("Retrieved non-ephemeral URI: %s", uri)
607 return uri
608 elif not firstEphemeralUri:
609 firstEphemeralUri = datastore.getURIs(ref)
610 elif predict:
611 if not predictedUri and not datastore.isEphemeral:
612 predictedUri = datastore.getURIs(ref, predict)
613 elif not predictedEphemeralUri and datastore.isEphemeral:
614 predictedEphemeralUri = datastore.getURIs(ref, predict)
616 if firstEphemeralUri:
617 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
618 return firstEphemeralUri
620 if predictedUri:
621 log.debug("Retrieved predicted URI: %s", predictedUri)
622 return predictedUri
624 if predictedEphemeralUri:
625 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
626 return predictedEphemeralUri
628 raise FileNotFoundError("Dataset {} not in any datastore".format(ref))
630 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
631 """URI to the Dataset.
633 The returned URI is from the first datastore in the list that has
634 the dataset with preference given to the first dataset coming from
635 a permanent datastore. If no datastores have the dataset and prediction
636 is allowed, the predicted URI for the first datastore in the list will
637 be returned.
639 Parameters
640 ----------
641 ref : `DatasetRef`
642 Reference to the required Dataset.
643 predict : `bool`
644 If `True`, allow URIs to be returned of datasets that have not
645 been written.
647 Returns
648 -------
649 uri : `lsst.resources.ResourcePath`
650 URI pointing to the dataset within the datastore. If the
651 dataset does not exist in the datastore, and if ``predict`` is
652 `True`, the URI will be a prediction and will include a URI
653 fragment "#predicted".
655 Notes
656 -----
657 If the datastore does not have entities that relate well
658 to the concept of a URI the returned URI string will be
659 descriptive. The returned URI is not guaranteed to be obtainable.
661 Raises
662 ------
663 FileNotFoundError
664 A URI has been requested for a dataset that does not exist and
665 guessing is not allowed.
666 RuntimeError
667 Raised if a request is made for a single URI but multiple URIs
668 are associated with this dataset.
669 """
670 log.debug("Requesting URI for %s", ref)
671 primary, components = self.getURIs(ref, predict)
672 if primary is None or components: 672 ↛ 673: line 672 didn't jump to line 673, because the condition on line 672 was never true
673 raise RuntimeError(
674 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
675 )
676 return primary
678 def retrieveArtifacts(
679 self,
680 refs: Iterable[DatasetRef],
681 destination: ResourcePath,
682 transfer: str = "auto",
683 preserve_path: bool = True,
684 overwrite: bool = False,
685 ) -> List[ResourcePath]:
686 """Retrieve the file artifacts associated with the supplied refs.
688 Parameters
689 ----------
690 refs : iterable of `DatasetRef`
691 The datasets for which file artifacts are to be retrieved.
692 A single ref can result in multiple files. The refs must
693 be resolved.
694 destination : `lsst.resources.ResourcePath`
695 Location to write the file artifacts.
696 transfer : `str`, optional
697 Method to use to transfer the artifacts. Must be one of the options
698 supported by `lsst.resources.ResourcePath.transfer_from()`.
699 "move" is not allowed.
700 preserve_path : `bool`, optional
701 If `True` the full path of the file artifact within the datastore
702 is preserved. If `False` the final file component of the path
703 is used.
704 overwrite : `bool`, optional
705 If `True` allow transfers to overwrite existing files at the
706 destination.
708 Returns
709 -------
710 targets : `list` of `lsst.resources.ResourcePath`
711 URIs of file artifacts in destination location. Order is not
712 preserved.
713 """
714 if not destination.isdir(): 714 ↛ 715: line 714 didn't jump to line 715, because the condition on line 714 was never true
715 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
717 # Using getURIs is not feasible since it becomes difficult to
718 # determine the path within the datastore later on. For now
719 # follow getURIs implementation approach.
721 pending = set(refs)
723 # There is a question as to whether an exception should be raised
724 # early if some of the refs are missing, or whether files should be
725 # transferred until a problem is hit. Prefer to complain up front.
726 # Use the datastore index as the grouping key.
727 grouped_by_datastore: Dict[int, Set[DatasetRef]] = {}
729 for number, datastore in enumerate(self.datastores):
730 if datastore.isEphemeral:
731 # In the future we will want to distinguish in-memory from
732 # caching datastore since using an on-disk local
733 # cache is exactly what we should be doing.
734 continue
735 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
737 if datastore_refs:
738 grouped_by_datastore[number] = datastore_refs
740 # Remove these from the pending list so that we do not bother
741 # looking for them any more.
742 pending = pending - datastore_refs
744 if pending: 744 ↛ 745: line 744 didn't jump to line 745, because the condition on line 744 was never true
745 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
747 # Now do the transfer.
748 targets: List[ResourcePath] = []
749 for number, datastore_refs in grouped_by_datastore.items():
750 targets.extend(
751 self.datastores[number].retrieveArtifacts(
752 datastore_refs,
753 destination,
754 transfer=transfer,
755 preserve_path=preserve_path,
756 overwrite=overwrite,
757 )
758 )
760 return targets
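    # Sketch of the grouping step in retrieveArtifacts() (hypothetical helper):
    # each ref is assigned to the first non-ephemeral child that has it, and any
    # ref left unassigned is an error. Children are (is_ephemeral, held_refs) pairs.
    def _example_group_refs(children, refs):
        """Group refs by the index of the first non-ephemeral child holding them."""
        pending = set(refs)
        grouped = {}
        for index, (is_ephemeral, held) in enumerate(children):
            if is_ephemeral:
                continue
            mine = {ref for ref in pending if ref in held}
            if mine:
                grouped[index] = mine
                pending -= mine
        if pending:
            raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
        return grouped

    # _example_group_refs([(True, {"a"}), (False, {"a", "b"})], ["a", "b"]) -> {1: {'a', 'b'}}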
762 def remove(self, ref: DatasetRef) -> None:
763 """Indicate to the datastore that a dataset can be removed.
765 The dataset will be removed from each datastore. The dataset is
766 not required to exist in every child datastore.
768 Parameters
769 ----------
770 ref : `DatasetRef`
771 Reference to the required dataset.
773 Raises
774 ------
775 FileNotFoundError
776 Attempt to remove a dataset that does not exist. Raised if none
777 of the child datastores removed the dataset.
778 """
779 log.debug("Removing %s", ref)
780 self.trash(ref, ignore_errors=False)
781 self.emptyTrash(ignore_errors=False)
783 def forget(self, refs: Iterable[DatasetRef]) -> None:
784 for datastore in tuple(self.datastores):
785 datastore.forget(refs)
787 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None:
788 if isinstance(ref, DatasetRef):
789 ref_label = str(ref)
790 else:
791 ref_label = "bulk datasets"
793 log.debug("Trashing %s", ref_label)
795 counter = 0
796 for datastore in self.datastores:
797 try:
798 datastore.trash(ref, ignore_errors=ignore_errors)
799 counter += 1
800 except FileNotFoundError:
801 pass
803 if counter == 0:
804 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
805 if ignore_errors: 805 ↛ 806: line 805 didn't jump to line 806, because the condition on line 805 was never true
806 log.warning(err_msg)
807 else:
808 raise FileNotFoundError(err_msg)
810 def emptyTrash(self, ignore_errors: bool = True) -> None:
811 for datastore in self.datastores:
812 datastore.emptyTrash(ignore_errors=ignore_errors)
814 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
815 """Retrieve a dataset from an input `Datastore`,
816 and store the result in this `Datastore`.
818 Parameters
819 ----------
820 inputDatastore : `Datastore`
821 The external `Datastore` from which to retrieve the Dataset.
822 ref : `DatasetRef`
823 Reference to the required dataset in the input data store.
830 """
831 assert inputDatastore is not self # unless we want it for renames?
832 inMemoryDataset = inputDatastore.get(ref)
833 self.put(inMemoryDataset, ref)
835 def validateConfiguration(
836 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False
837 ) -> None:
838 """Validate some of the configuration for this datastore.
840 Parameters
841 ----------
842 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
843 Entities to test against this configuration. Can be differing
844 types.
845 logFailures : `bool`, optional
846 If `True`, output a log message for every validation error
847 detected.
849 Raises
850 ------
851 DatastoreValidationError
852 Raised if there is a validation problem with a configuration.
853 All the problems are reported in a single exception.
855 Notes
856 -----
857 This method checks each datastore in turn.
858 """
860 # Need to catch each of the datastore outputs and ensure that
861 # all are tested.
862 failures = []
863 for datastore in self.datastores:
864 try:
865 datastore.validateConfiguration(entities, logFailures=logFailures)
866 except DatastoreValidationError as e:
867 if logFailures: 867 ↛ 869: line 867 didn't jump to line 869, because the condition on line 867 was never false
868 log.critical("Datastore %s failed validation", datastore.name)
869 failures.append(f"Datastore {self.name}: {e}")
871 if failures:
872 msg = ";\n".join(failures)
873 raise DatastoreValidationError(msg)
875 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None:
876 # Docstring is inherited from base class
877 failures = []
878 for datastore in self.datastores:
879 try:
880 datastore.validateKey(lookupKey, entity)
881 except DatastoreValidationError as e:
882 failures.append(f"Datastore {self.name}: {e}")
884 if failures:
885 msg = ";\n".join(failures)
886 raise DatastoreValidationError(msg)
888 def getLookupKeys(self) -> Set[LookupKey]:
889 # Docstring is inherited from base class
890 keys = set()
891 for datastore in self.datastores:
892 keys.update(datastore.getLookupKeys())
894 keys.update(self.constraints.getLookupKeys())
895 for p in self.datastoreConstraints:
896 if p is not None: 896 ↛ 897: line 896 didn't jump to line 897, because the condition on line 896 was never true
897 keys.update(p.getLookupKeys())
899 return keys
901 def needs_expanded_data_ids(
902 self,
903 transfer: Optional[str],
904 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None,
905 ) -> bool:
906 # Docstring inherited.
907 # We can't safely use `self.datastoreConstraints` with `entity` to
908 # check whether a child datastore would even want to ingest this
909 # dataset, because we don't want to filter out datastores that might
910 # need an expanded data ID based on incomplete information (e.g. we
911 # pass a StorageClass, but the constraint dispatches on DatasetType).
912 # So we pessimistically check if any datastore would need an expanded
913 # data ID for this transfer mode.
914 return any(datastore.needs_expanded_data_ids(transfer) for datastore in self.datastores) 914 ↛ exit: line 914 didn't finish the generator expression on line 914
916 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
917 # Docstring inherited from the base class.
919 for datastore in self.datastores:
920 datastore.import_records(data)
922 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
923 # Docstring inherited from the base class.
925 all_records: Dict[str, DatastoreRecordData] = {}
927 # Merge all sub-datastore records into one structure
928 for datastore in self.datastores:
929 sub_records = datastore.export_records(refs)
930 for name, record_data in sub_records.items():
931 # All datastore names must be unique in a chain.
932 if name in all_records: 932 ↛ 933: line 932 didn't jump to line 933, because the condition on line 932 was never true
933 raise ValueError("Non-unique datastore name found in datastore {datastore}")
934 all_records[name] = record_data
936 return all_records
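    # Sketch of the merge performed by export_records() (hypothetical helper):
    # per-child mappings of datastore name -> records are combined, and duplicate
    # names are an error because names must be unique within a chain.
    def _example_merge_records(per_child_records):
        """Merge per-child record mappings, rejecting duplicate datastore names."""
        merged = {}
        for records in per_child_records:
            for name, data in records.items():
                if name in merged:
                    raise ValueError(f"Non-unique datastore name found: {name}")
                merged[name] = data
        return merged

    # _example_merge_records([{"store1": [1]}, {"store2": [2]}])
    # -> {'store1': [1], 'store2': [2]}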