Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py: 92%
424 statements
coverage.py v7.3.2, created at 2023-10-12 09:43 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Chained datastore."""
30from __future__ import annotations
32__all__ = ("ChainedDatastore",)
34import itertools
35import logging
36import time
37import warnings
38from collections.abc import Iterable, Mapping, Sequence
39from typing import TYPE_CHECKING, Any
41from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, Datastore, FileDataset
42from lsst.daf.butler.datastore import DatasetRefURIs, DatastoreConfig, DatastoreValidationError
43from lsst.daf.butler.datastore.constraints import Constraints
44from lsst.daf.butler.datastore.record_data import DatastoreRecordData
45from lsst.resources import ResourcePath
46from lsst.utils import doImportType
48if TYPE_CHECKING:
49 from lsst.daf.butler import Config, DatasetType, LookupKey, StorageClass
50 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
51 from lsst.resources import ResourcePathExpression
53log = logging.getLogger(__name__)
56class _IngestPrepData(Datastore.IngestPrepData):
57 """Helper class for ChainedDatastore ingest implementation.
59 Parameters
60 ----------
61 children : `list` of `tuple`
62 Triples of `Datastore`, `IngestPrepData`, and the set of source paths (populated for ``"move"`` ingests) for all child datastores.
63 """
65 def __init__(self, children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]]):
66 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data, _ in children))
67 self.children = children
70class ChainedDatastore(Datastore):
71 """Chained Datastores to allow reads from and writes to multiple datastores.
73 A ChainedDatastore is configured with multiple datastore configurations.
74 A ``put()`` is attempted on every child datastore whose constraints
75 accept the dataset. A ``get()`` is tried on each datastore in turn and
76 the first datastore to return a valid dataset is used.
78 Parameters
79 ----------
80 config : `DatastoreConfig` or `str`
81 Configuration. This configuration must include a ``datastores`` field
82 as a sequence of datastore configurations. The order in this sequence
83 indicates the order to use for read operations.
84 bridgeManager : `DatastoreRegistryBridgeManager`
85 Object that manages the interface between `Registry` and datastores.
86 butlerRoot : `str`, optional
87 New datastore root to use to override the configuration value. This
88 root is sent to each child datastore.
90 Notes
91 -----
92 ChainedDatastore never supports `None` as an `ingest` transfer mode. A
93 `"move"` ingest is implemented as a copy into each accepting child followed by
94 deletion of the source files. Modes such as `"copy"`, `"symlink"`, `"relsymlink"` and `"hardlink"` are supported as long as at least one child datastore supports them.
95 """
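    # A ChainedDatastore is normally constructed from configuration rather
    # than instantiated directly. A minimal, illustrative YAML sketch of the
    # ``datastores`` container described above; the child class paths and the
    # exact nesting are examples only, and the canonical defaults live in
    # ``datastores/chainedDatastore.yaml``:
    #
    #     datastore:
    #       cls: lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore
    #       datastores:
    #         - cls: lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore
    #         - cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
    #           root: <butlerRoot>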
97 defaultConfigFile = "datastores/chainedDatastore.yaml"
98 """Path to configuration defaults. Accessed within the ``configs`` resource
99 or relative to a search path. Can be None if no defaults specified.
100 """
102 containerKey = "datastores"
103 """Key to specify where child datastores are configured."""
105 datastores: list[Datastore]
106 """All the child datastores known to this datastore."""
108 datastoreConstraints: Sequence[Constraints | None]
109 """Constraints to be applied to each of the child datastores."""
111 @classmethod
112 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
113 """Set any filesystem-dependent config options for child Datastores to
114 be appropriate for a new empty repository with the given root.
116 Parameters
117 ----------
118 root : `str`
119 Filesystem path to the root of the data repository.
120 config : `Config`
121 A `Config` to update. Only the subset understood by
122 this component will be updated. Will not expand
123 defaults.
124 full : `Config`
125 A complete config with all defaults expanded that can be
126 converted to a `DatastoreConfig`. Read-only and will not be
127 modified by this method.
128 Repository-specific options that should not be obtained
129 from defaults when Butler instances are constructed
130 should be copied from ``full`` to ``config``.
131 overwrite : `bool`, optional
132 If `False`, do not modify a value in ``config`` if the value
133 already exists. Default is always to overwrite with the provided
134 ``root``.
136 Notes
137 -----
138 If a keyword is explicitly defined in the supplied ``config`` it
139 will not be overridden by this method if ``overwrite`` is `False`.
140 This allows explicit values set in external configs to be retained.
141 """
142 # Extract the part of the config we care about updating
143 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
145 # And the subset of the full config that we can use for reference.
146 # Do not bother with defaults because we are told this already has
147 # them.
148 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
150 # Loop over each datastore config and pass the subsets to the
151 # child datastores to process.
153 containerKey = cls.containerKey
154 for idx, (child, fullChild) in enumerate(
155 zip(datastoreConfig[containerKey], fullDatastoreConfig[containerKey], strict=True)
156 ):
157 childConfig = DatastoreConfig(child, mergeDefaults=False)
158 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
159 datastoreClass = doImportType(fullChildConfig["cls"])
160 if not issubclass(datastoreClass, Datastore): 160 ↛ 161
161 raise TypeError(f"Imported child class {fullChildConfig['cls']} is not a Datastore")
162 newroot = f"{root}/{datastoreClass.__qualname__}_{idx}"
163 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
165 # Reattach to parent
166 datastoreConfig[containerKey, idx] = childConfig
168 # Reattach modified datastore config to parent
169 # If this has a datastore key we attach there, otherwise we assume
170 # this information goes at the top of the config hierarchy.
171 if DatastoreConfig.component in config:
172 config[DatastoreConfig.component] = datastoreConfig
173 else:
174 config.update(datastoreConfig)
176 return
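    # A sketch of the root derivation performed in the loop above, assuming a
    # child FileDatastore at index 0 (class name shown for illustration only):
    #
    #     newroot = f"{root}/{datastoreClass.__qualname__}_{idx}"
    #     # e.g. "/repo/FileDatastore_0" for root="/repo" and idx=0;
    #     # each child then applies its own setConfigRoot to that path.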
178 def __init__(
179 self,
180 config: Config | ResourcePathExpression,
181 bridgeManager: DatastoreRegistryBridgeManager,
182 butlerRoot: str | None = None,
183 ):
184 super().__init__(config, bridgeManager)
186 # Scan for child datastores and instantiate them with the same registry
187 self.datastores = []
188 for c in self.config["datastores"]:
189 c = DatastoreConfig(c)
190 datastoreType = doImportType(c["cls"])
191 if not issubclass(datastoreType, Datastore): 191 ↛ 192
192 raise TypeError(f"Imported child class {c['cls']} is not a Datastore")
193 datastore = datastoreType(c, bridgeManager, butlerRoot=butlerRoot)
194 log.debug("Creating child datastore %s", datastore.name)
195 self.datastores.append(datastore)
197 # Name ourself based on our children
198 if self.datastores: 198 ↛ 203
199 # We must set the names explicitly
200 self._names = [d.name for d in self.datastores]
201 childNames = ",".join(self.names)
202 else:
203 childNames = f"(empty@{time.time()})"
204 self._names = [childNames]
205 self.name = f"{type(self).__qualname__}[{childNames}]"
207 # We declare we are ephemeral if all our child datastores declare
208 # they are ephemeral
209 isEphemeral = True
210 for d in self.datastores:
211 if not d.isEphemeral:
212 isEphemeral = False
213 break
214 self.isEphemeral = isEphemeral
216 # per-datastore override constraints
217 if "datastore_constraints" in self.config:
218 overrides = self.config["datastore_constraints"]
220 if len(overrides) != len(self.datastores): 220 ↛ 221
221 raise DatastoreValidationError(
222 f"Number of registered datastores ({len(self.datastores)})"
223 " differs from number of constraints overrides"
224 f" {len(overrides)}"
225 )
227 self.datastoreConstraints = [
228 Constraints(c.get("constraints"), universe=bridgeManager.universe) for c in overrides
229 ]
231 else:
232 self.datastoreConstraints = (None,) * len(self.datastores)
234 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
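    # An illustrative sketch of the optional ``datastore_constraints`` list
    # consumed above; it must match ``datastores`` in length and order. The
    # layout of each ``constraints`` entry is indicative only and the dataset
    # type names are hypothetical:
    #
    #     datastore_constraints:
    #       - constraints:
    #           accept:
    #             - raw
    #       - constraints:
    #           reject:
    #             - all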
236 @property
237 def names(self) -> tuple[str, ...]:
238 return tuple(self._names)
240 @property
241 def roots(self) -> dict[str, ResourcePath | None]:
242 # Docstring inherited.
243 roots = {}
244 for datastore in self.datastores:
245 roots.update(datastore.roots)
246 return roots
248 def __str__(self) -> str:
249 chainName = ", ".join(str(ds) for ds in self.datastores)
250 return chainName
252 def knows(self, ref: DatasetRef) -> bool:
253 """Check if the dataset is known to any of the datastores.
255 Does not check for existence of any artifact.
257 Parameters
258 ----------
259 ref : `DatasetRef`
260 Reference to the required dataset.
262 Returns
263 -------
264 exists : `bool`
265 `True` if the dataset is known to the datastore.
266 """
267 for datastore in self.datastores:
268 if datastore.knows(ref):
269 log.debug("%s known to datastore %s", ref, datastore.name)
270 return True
271 return False
273 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]:
274 # Docstring inherited from the base class.
275 refs_known: dict[DatasetRef, bool] = {}
276 for datastore in self.datastores:
277 refs_known.update(datastore.knows_these(refs))
279 # No need to check in next datastore for refs that are known.
280 # We only update entries that were initially False.
281 refs = [ref for ref, known in refs_known.items() if not known]
283 return refs_known
285 def mexists(
286 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None
287 ) -> dict[DatasetRef, bool]:
288 """Check the existence of multiple datasets at once.
290 Parameters
291 ----------
292 refs : iterable of `DatasetRef`
293 The datasets to be checked.
294 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
295 Optional mapping of datastore artifact to existence. Updated by
296 this method with details of all artifacts tested. Can be `None`
297 if the caller is not interested.
299 Returns
300 -------
301 existence : `dict` of [`DatasetRef`, `bool`]
302 Mapping from dataset to boolean indicating existence in any
303 of the child datastores.
304 """
305 dataset_existence: dict[DatasetRef, bool] = {}
306 for datastore in self.datastores:
307 dataset_existence.update(datastore.mexists(refs, artifact_existence=artifact_existence))
309 # For next datastore no point asking about ones we know
310 # exist already. No special exemption for ephemeral datastores.
311 refs = [ref for ref, exists in dataset_existence.items() if not exists]
313 return dataset_existence
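    # A usage sketch for bulk existence checks, sharing one artifact-existence
    # cache across calls so URIs already tested are not probed again
    # (``chained`` and ``refs`` are assumed to exist elsewhere):
    #
    #     artifact_cache: dict[ResourcePath, bool] = {}
    #     existence = chained.mexists(refs, artifact_existence=artifact_cache)
    #     missing = [ref for ref, found in existence.items() if not found]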
315 def exists(self, ref: DatasetRef) -> bool:
316 """Check if the dataset exists in one of the datastores.
318 Parameters
319 ----------
320 ref : `DatasetRef`
321 Reference to the required dataset.
323 Returns
324 -------
325 exists : `bool`
326 `True` if the entity exists in one of the child datastores.
327 """
328 for datastore in self.datastores:
329 if datastore.exists(ref):
330 log.debug("Found %s in datastore %s", ref, datastore.name)
331 return True
332 return False
334 def get(
335 self,
336 ref: DatasetRef,
337 parameters: Mapping[str, Any] | None = None,
338 storageClass: StorageClass | str | None = None,
339 ) -> Any:
340 """Load an InMemoryDataset from the store.
342 The dataset is returned from the first datastore that has
343 the dataset.
345 Parameters
346 ----------
347 ref : `DatasetRef`
348 Reference to the required Dataset.
349 parameters : `dict`
350 `StorageClass`-specific parameters that specify, for example,
351 a slice of the dataset to be loaded.
352 storageClass : `StorageClass` or `str`, optional
353 The storage class to be used to override the Python type
354 returned by this method. By default the returned type matches
355 the dataset type definition for this dataset. Specifying a
356 read `StorageClass` can force a different type to be returned.
357 This type must be compatible with the original type.
359 Returns
360 -------
361 inMemoryDataset : `object`
362 Requested dataset or slice thereof as an InMemoryDataset.
364 Raises
365 ------
366 FileNotFoundError
367 Requested dataset can not be retrieved.
368 TypeError
369 Return value from formatter has unexpected type.
370 ValueError
371 Formatter failed to process the dataset.
372 """
373 for datastore in self.datastores:
374 try:
375 inMemoryObject = datastore.get(ref, parameters, storageClass=storageClass)
376 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
377 return inMemoryObject
378 except FileNotFoundError:
379 pass
381 raise FileNotFoundError(f"Dataset {ref} could not be found in any of the datastores")
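    # A usage sketch of a read falling through the chain; ``chained``, ``ref``
    # and the ``bbox`` parameter value are assumed to exist elsewhere:
    #
    #     try:
    #         data = chained.get(ref, parameters={"bbox": bbox})
    #     except FileNotFoundError:
    #         ...  # not present in any child datastore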
383 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
384 """Write an InMemoryDataset with a given `DatasetRef` to each
385 datastore.
387 The put() to child datastores can fail with
388 `DatasetTypeNotSupportedError`. The put() for this datastore will be
389 deemed to have succeeded so long as at least one child datastore
390 accepted the inMemoryDataset.
392 Parameters
393 ----------
394 inMemoryDataset : `object`
395 The dataset to store.
396 ref : `DatasetRef`
397 Reference to the associated Dataset.
399 Raises
400 ------
401 TypeError
402 Supplied object and storage class are inconsistent.
403 DatasetTypeNotSupportedError
404 All datastores reported `DatasetTypeNotSupportedError`.
405 """
406 log.debug("Put %s", ref)
408 # Confirm that we can accept this dataset
409 if not self.constraints.isAcceptable(ref):
410 # Raise rather than use boolean return value.
411 raise DatasetTypeNotSupportedError(
412 f"Dataset {ref} has been rejected by this datastore via configuration."
413 )
415 isPermanent = False
416 nsuccess = 0
417 npermanent = 0
418 nephemeral = 0
419 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
420 if (
421 constraints is not None and not constraints.isAcceptable(ref)
422 ) or not datastore.constraints.isAcceptable(ref):
423 log.debug("Datastore %s skipping put via configuration for ref %s", datastore.name, ref)
424 continue
426 if datastore.isEphemeral:
427 nephemeral += 1
428 else:
429 npermanent += 1
430 try:
431 datastore.put(inMemoryDataset, ref)
432 nsuccess += 1
433 if not datastore.isEphemeral:
434 isPermanent = True
435 except DatasetTypeNotSupportedError:
436 pass
438 if nsuccess == 0:
439 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
441 if not isPermanent and npermanent > 0: 441 ↛ 442
442 warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)
444 if self._transaction is not None:
445 self._transaction.registerUndo("put", self.remove, ref)
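    # A usage sketch of a write: the put is considered successful if at least
    # one child accepts the dataset, and raises only when every child rejects
    # it (``chained``, ``dataset`` and ``ref`` are assumed to exist elsewhere):
    #
    #     try:
    #         chained.put(dataset, ref)
    #     except DatasetTypeNotSupportedError:
    #         ...  # rejected by configuration in every child datastore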
447 def _overrideTransferMode(self, *datasets: Any, transfer: str | None = None) -> str | None:
448 # Docstring inherited from base class.
449 if transfer != "auto":
450 return transfer
451 # Ask each datastore what they think auto means
452 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
454 # Remove any untranslated "auto" values
455 transfers.discard(transfer)
457 if len(transfers) == 1: 457 ↛ 458
458 return transfers.pop()
459 if not transfers: 459 ↛ 463
460 # Everything reported "auto"
461 return transfer
463 raise RuntimeError(
464 "Chained datastore does not yet support different transfer modes"
465 f" from 'auto' in each child datastore (wanted {transfers})"
466 )
468 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData:
469 # Docstring inherited from Datastore._prepIngest.
470 if transfer is None:
471 raise NotImplementedError("ChainedDatastore does not support transfer=None.")
473 def isDatasetAcceptable(dataset: FileDataset, *, name: str, constraints: Constraints) -> bool:
474 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
475 if not acceptable:
476 log.debug(
477 "Datastore %s skipping ingest via configuration for refs %s",
478 name,
479 ", ".join(str(ref) for ref in dataset.refs),
480 )
481 return False
482 else:
483 return True
485 # Filter down to just datasets the chained datastore's own
486 # configuration accepts.
487 okForParent: list[FileDataset] = [
488 dataset
489 for dataset in datasets
490 if isDatasetAcceptable(dataset, name=self.name, constraints=self.constraints)
491 ]
493 # Iterate over nested datastores and call _prepIngest on each.
494 # Save the results to a list:
495 children: list[tuple[Datastore, Datastore.IngestPrepData, set[ResourcePath]]] = []
496 # ...and remember whether all of the failures are due to
497 # NotImplementedError being raised.
498 allFailuresAreNotImplementedError = True
499 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
500 okForChild: list[FileDataset]
501 if constraints is not None:
502 okForChild = [
503 dataset
504 for dataset in okForParent
505 if isDatasetAcceptable(dataset, name=datastore.name, constraints=constraints)
506 ]
507 else:
508 okForChild = okForParent
509 try:
510 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
511 except NotImplementedError:
512 log.debug(
513 "Skipping ingest for datastore %s because transfer mode %s is not supported.",
514 datastore.name,
515 transfer,
516 )
517 continue
518 allFailuresAreNotImplementedError = False
519 if okForChild:
520 # Do not store for later if a datastore has rejected
521 # everything.
522 # Include the source paths if this is a "move". It's clearer
523 # to find the paths now rather than try to infer how
524 # each datastore has stored them in the internal prep class.
525 paths = (
526 {ResourcePath(dataset.path) for dataset in okForChild} if transfer == "move" else set()
527 )
528 children.append((datastore, prepDataForChild, paths))
529 if allFailuresAreNotImplementedError:
530 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
531 return _IngestPrepData(children=children)
533 def _finishIngest(
534 self,
535 prepData: _IngestPrepData,
536 *,
537 transfer: str | None = None,
538 record_validation_info: bool = True,
539 ) -> None:
540 # Docstring inherited from Datastore._finishIngest.
541 # For "move" we must use "copy" and then delete the input
542 # data at the end. This has no rollback option if the ingest
543 # subsequently fails. If there is only one active datastore
544 # accepting any files we can leave it as "move"
545 actual_transfer: str | None
546 if transfer == "move" and len(prepData.children) > 1:
547 actual_transfer = "copy"
548 else:
549 actual_transfer = transfer
550 to_be_deleted: set[ResourcePath] = set()
551 for datastore, prepDataForChild, paths in prepData.children:
552 datastore._finishIngest(
553 prepDataForChild, transfer=actual_transfer, record_validation_info=record_validation_info
554 )
555 to_be_deleted.update(paths)
556 if actual_transfer != transfer:
557 # These datasets were copied but now need to be deleted.
558 # This can not be rolled back.
559 for uri in to_be_deleted:
560 uri.remove()
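    # A usage sketch of a "move" ingest through the public ``ingest`` entry
    # point inherited from `Datastore`; with more than one accepting child the
    # files are copied into each child and the source files deleted afterwards
    # (the path and ``ref`` below are hypothetical):
    #
    #     datasets = [FileDataset(path="/tmp/example.fits", refs=[ref])]
    #     chained.ingest(*datasets, transfer="move")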
562 def getManyURIs(
563 self,
564 refs: Iterable[DatasetRef],
565 predict: bool = False,
566 allow_missing: bool = False,
567 ) -> dict[DatasetRef, DatasetRefURIs]:
568 # Docstring inherited
570 uris: dict[DatasetRef, DatasetRefURIs] = {}
571 missing_refs = set(refs)
573 # If predict is True we don't want to predict a dataset in the first
574 # datastore if it actually exists in a later datastore, so in that
575 # case check all datastores with predict=False first, and then try
576 # again with predict=True.
577 for p in (False, True) if predict else (False,):
578 if not missing_refs:
579 break
580 for datastore in self.datastores:
581 try:
582 got_uris = datastore.getManyURIs(missing_refs, p, allow_missing=True)
583 except NotImplementedError:
584 # some datastores may not implement generating URIs
585 continue
586 missing_refs -= got_uris.keys()
587 uris.update(got_uris)
588 if not missing_refs:
589 break
591 if missing_refs and not allow_missing:
592 raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
594 return uris
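    # A usage sketch of the two-pass lookup above: existing locations in every
    # child are resolved before any prediction happens, so a dataset present
    # in a later child is never reported as predicted (``chained`` and
    # ``refs`` are assumed to exist elsewhere):
    #
    #     uris = chained.getManyURIs(refs, predict=True)
    #     for ref, location in uris.items():
    #         primary, components = location
    #         ...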
596 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs:
597 """Return URIs associated with dataset.
599 Parameters
600 ----------
601 ref : `DatasetRef`
602 Reference to the required dataset.
603 predict : `bool`, optional
604 If the datastore does not know about the dataset, should it
605 return a predicted URI or not?
607 Returns
608 -------
609 uris : `DatasetRefURIs`
610 The URI to the primary artifact associated with this dataset (if
611 the dataset was disassembled within the datastore this may be
612 `None`), and the URIs to any components associated with the dataset
613 artifact (can be empty if there are no components).
615 Notes
616 -----
617 The returned URI comes from the first datastore in the chain that
618 has the dataset, with permanent datastores preferred over ephemeral
619 ones. If no datastore has the dataset and prediction is allowed, a
620 predicted URI is returned, again preferring the first permanent
621 datastore in the chain.
622 """
623 log.debug("Requesting URIs for %s", ref)
624 predictedUri: DatasetRefURIs | None = None
625 predictedEphemeralUri: DatasetRefURIs | None = None
626 firstEphemeralUri: DatasetRefURIs | None = None
627 for datastore in self.datastores:
628 if datastore.exists(ref):
629 if not datastore.isEphemeral:
630 uri = datastore.getURIs(ref)
631 log.debug("Retrieved non-ephemeral URI: %s", uri)
632 return uri
633 elif not firstEphemeralUri:
634 firstEphemeralUri = datastore.getURIs(ref)
635 elif predict:
636 if not predictedUri and not datastore.isEphemeral:
637 predictedUri = datastore.getURIs(ref, predict)
638 elif not predictedEphemeralUri and datastore.isEphemeral:
639 predictedEphemeralUri = datastore.getURIs(ref, predict)
641 if firstEphemeralUri:
642 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
643 return firstEphemeralUri
645 if predictedUri:
646 log.debug("Retrieved predicted URI: %s", predictedUri)
647 return predictedUri
649 if predictedEphemeralUri:
650 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
651 return predictedEphemeralUri
653 raise FileNotFoundError(f"Dataset {ref} not in any datastore")
655 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath:
656 """URI to the Dataset.
658 The returned URI comes from the first datastore in the chain that
659 has the dataset, with permanent datastores preferred over ephemeral
660 ones. If no datastore has the dataset and prediction is allowed, a
661 predicted URI is returned, again preferring the first permanent
662 datastore in the chain.
664 Parameters
665 ----------
666 ref : `DatasetRef`
667 Reference to the required Dataset.
668 predict : `bool`
669 If `True`, allow URIs to be returned of datasets that have not
670 been written.
672 Returns
673 -------
674 uri : `lsst.resources.ResourcePath`
675 URI pointing to the dataset within the datastore. If the
676 dataset does not exist in the datastore, and if ``predict`` is
677 `True`, the URI will be a prediction and will include a URI
678 fragment "#predicted".
680 Notes
681 -----
682 If the datastore does not have entities that relate well
683 to the concept of a URI the returned URI string will be
684 descriptive. The returned URI is not guaranteed to be obtainable.
686 Raises
687 ------
688 FileNotFoundError
689 A URI has been requested for a dataset that does not exist and
690 guessing is not allowed.
691 RuntimeError
692 Raised if a request is made for a single URI but multiple URIs
693 are associated with this dataset.
694 """
695 log.debug("Requesting URI for %s", ref)
696 primary, components = self.getURIs(ref, predict)
697 if primary is None or components: 697 ↛ 698
698 raise RuntimeError(
699 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead."
700 )
701 return primary
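    # A usage sketch: with ``predict=True`` a URI may be returned for a
    # dataset that has not been written; predicted URIs carry a "#predicted"
    # fragment (``chained`` and ``ref`` are assumed to exist elsewhere):
    #
    #     uri = chained.getURI(ref, predict=True)
    #     if str(uri).endswith("#predicted"):
    #         ...  # artifact does not exist yet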
703 def retrieveArtifacts(
704 self,
705 refs: Iterable[DatasetRef],
706 destination: ResourcePath,
707 transfer: str = "auto",
708 preserve_path: bool = True,
709 overwrite: bool = False,
710 ) -> list[ResourcePath]:
711 """Retrieve the file artifacts associated with the supplied refs.
713 Parameters
714 ----------
715 refs : iterable of `DatasetRef`
716 The datasets for which file artifacts are to be retrieved.
717 A single ref can result in multiple files. The refs must
718 be resolved.
719 destination : `lsst.resources.ResourcePath`
720 Location to write the file artifacts.
721 transfer : `str`, optional
722 Method to use to transfer the artifacts. Must be one of the options
723 supported by `lsst.resources.ResourcePath.transfer_from()`.
724 "move" is not allowed.
725 preserve_path : `bool`, optional
726 If `True` the full path of the file artifact within the datastore
727 is preserved. If `False` the final file component of the path
728 is used.
729 overwrite : `bool`, optional
730 If `True` allow transfers to overwrite existing files at the
731 destination.
733 Returns
734 -------
735 targets : `list` of `lsst.resources.ResourcePath`
736 URIs of file artifacts in destination location. Order is not
737 preserved.
738 """
739 if not destination.isdir(): 739 ↛ 740
740 raise ValueError(f"Destination location must refer to a directory. Given {destination}")
742 # Using getURIs is not feasible since it becomes difficult to
743 # determine the path within the datastore later on. For now
744 # follow getURIs implementation approach.
746 pending = set(refs)
748 # There is a question as to whether an exception should be raised
749 # early if some of the refs are missing, or whether files should be
750 # transferred until a problem is hit. Prefer to complain up front.
751 # Use the index of the datastore in the chain as the key.
752 grouped_by_datastore: dict[int, set[DatasetRef]] = {}
754 for number, datastore in enumerate(self.datastores):
755 if datastore.isEphemeral:
756 # In the future we will want to distinguish in-memory from
757 # caching datastore since using an on-disk local
758 # cache is exactly what we should be doing.
759 continue
760 try:
761 datastore_refs = {ref for ref in pending if datastore.exists(ref)}
762 except NotImplementedError:
763 # Some datastores may not support retrieving artifacts
764 continue
766 if datastore_refs:
767 grouped_by_datastore[number] = datastore_refs
769 # Remove these from the pending list so that we do not bother
770 # looking for them any more.
771 pending = pending - datastore_refs
773 if pending: 773 ↛ 774
774 raise RuntimeError(f"Some datasets were not found in any datastores: {pending}")
776 # Now do the transfer.
777 targets: list[ResourcePath] = []
778 for number, datastore_refs in grouped_by_datastore.items():
779 targets.extend(
780 self.datastores[number].retrieveArtifacts(
781 datastore_refs,
782 destination,
783 transfer=transfer,
784 preserve_path=preserve_path,
785 overwrite=overwrite,
786 )
787 )
789 return targets
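    # A usage sketch of copying file artifacts out of the chain into a local
    # directory (the destination path is an example):
    #
    #     destination = ResourcePath("/tmp/artifact_dump/", forceDirectory=True)
    #     copied = chained.retrieveArtifacts(
    #         refs, destination, transfer="copy", preserve_path=True
    #     )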
791 def remove(self, ref: DatasetRef) -> None:
792 """Indicate to the datastore that a dataset can be removed.
794 The dataset will be removed from each datastore. The dataset is
795 not required to exist in every child datastore.
797 Parameters
798 ----------
799 ref : `DatasetRef`
800 Reference to the required dataset.
802 Raises
803 ------
804 FileNotFoundError
805 Attempt to remove a dataset that does not exist. Raised if none
806 of the child datastores removed the dataset.
807 """
808 log.debug("Removing %s", ref)
809 self.trash(ref, ignore_errors=False)
810 self.emptyTrash(ignore_errors=False)
812 def forget(self, refs: Iterable[DatasetRef]) -> None:
813 for datastore in tuple(self.datastores):
814 datastore.forget(refs)
816 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
817 if isinstance(ref, DatasetRef):
818 ref_label = str(ref)
819 else:
820 ref_label = "bulk datasets"
822 log.debug("Trashing %s", ref_label)
824 counter = 0
825 for datastore in self.datastores:
826 try:
827 datastore.trash(ref, ignore_errors=ignore_errors)
828 counter += 1
829 except FileNotFoundError:
830 pass
832 if counter == 0:
833 err_msg = f"Could not mark for removal from any child datastore: {ref_label}"
834 if ignore_errors: 834 ↛ 835
835 log.warning(err_msg)
836 else:
837 raise FileNotFoundError(err_msg)
839 def emptyTrash(self, ignore_errors: bool = True) -> None:
840 for datastore in self.datastores:
841 datastore.emptyTrash(ignore_errors=ignore_errors)
843 def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None:
844 """Retrieve a dataset from an input `Datastore`,
845 and store the result in this `Datastore`.
847 Parameters
848 ----------
849 inputDatastore : `Datastore`
850 The external `Datastore` from which to retrieve the Dataset.
851 ref : `DatasetRef`
852 Reference to the required dataset in the input data store.
859 """
860 assert inputDatastore is not self # unless we want it for renames?
861 inMemoryDataset = inputDatastore.get(ref)
862 self.put(inMemoryDataset, ref)
864 def validateConfiguration(
865 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
866 ) -> None:
867 """Validate some of the configuration for this datastore.
869 Parameters
870 ----------
871 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
872 Entities to test against this configuration. Can be differing
873 types.
874 logFailures : `bool`, optional
875 If `True`, output a log message for every validation error
876 detected.
878 Raises
879 ------
880 DatastoreValidationError
881 Raised if there is a validation problem with a configuration.
882 All the problems are reported in a single exception.
884 Notes
885 -----
886 This method checks each datastore in turn.
887 """
888 # Need to catch each of the datastore outputs and ensure that
889 # all are tested.
890 failures = []
891 for datastore in self.datastores:
892 try:
893 datastore.validateConfiguration(entities, logFailures=logFailures)
894 except DatastoreValidationError as e:
895 if logFailures: 895 ↛ 897
896 log.critical("Datastore %s failed validation", datastore.name)
897 failures.append(f"Datastore {self.name}: {e}")
899 if failures:
900 msg = ";\n".join(failures)
901 raise DatastoreValidationError(msg)
903 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
904 # Docstring is inherited from base class
905 failures = []
906 for datastore in self.datastores:
907 try:
908 datastore.validateKey(lookupKey, entity)
909 except DatastoreValidationError as e:
910 failures.append(f"Datastore {self.name}: {e}")
912 if failures:
913 msg = ";\n".join(failures)
914 raise DatastoreValidationError(msg)
916 def getLookupKeys(self) -> set[LookupKey]:
917 # Docstring is inherited from base class
918 keys = set()
919 for datastore in self.datastores:
920 keys.update(datastore.getLookupKeys())
922 keys.update(self.constraints.getLookupKeys())
923 for p in self.datastoreConstraints:
924 if p is not None: 924 ↛ 923
925 keys.update(p.getLookupKeys())
927 return keys
929 def needs_expanded_data_ids(
930 self,
931 transfer: str | None,
932 entity: DatasetRef | DatasetType | StorageClass | None = None,
933 ) -> bool:
934 # Docstring inherited.
935 # We can't safely use `self.datastoreConstraints` with `entity` to
936 # check whether a child datastore would even want to ingest this
937 # dataset, because we don't want to filter out datastores that might
938 need an expanded data ID based on incomplete information (e.g. we
939 # pass a StorageClass, but the constraint dispatches on DatasetType).
940 # So we pessimistically check if any datastore would need an expanded
941 # data ID for this transfer mode.
942 return any(datastore.needs_expanded_data_ids(transfer, entity) for datastore in self.datastores)
944 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None:
945 # Docstring inherited from the base class.
947 for datastore in self.datastores:
948 datastore.import_records(data)
950 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
951 # Docstring inherited from the base class.
953 all_records: dict[str, DatastoreRecordData] = {}
955 # Merge all sub-datastore records into one structure
956 for datastore in self.datastores:
957 sub_records = datastore.export_records(refs)
958 for name, record_data in sub_records.items():
959 # All datastore names must be unique in a chain.
960 if name in all_records: 960 ↛ 961
961 raise ValueError(f"Non-unique datastore name found in datastore {datastore}")
962 all_records[name] = record_data
964 return all_records
966 def export(
967 self,
968 refs: Iterable[DatasetRef],
969 *,
970 directory: ResourcePathExpression | None = None,
971 transfer: str | None = "auto",
972 ) -> Iterable[FileDataset]:
973 # Docstring inherited from Datastore.export.
974 if transfer == "auto" and directory is None:
975 transfer = None
977 if transfer is not None and directory is None:
978 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
980 if transfer == "move":
981 raise TypeError("Can not export by moving files out of datastore.")
983 # Exporting from a chain has the potential for a dataset to be
984 # in one or more of the datastores in the chain. We only need one
985 # of them since we assume the datasets are the same in all (but
986 # the file format could be different of course since that is a
987 # per-datastore configuration).
988 # We also do not know whether any of the datastores in the chain
989 # support file export.
991 # Ensure we have an ordered sequence that is not an iterator or set.
992 if not isinstance(refs, Sequence):
993 refs = list(refs)
995 # If any of the datasets are missing entirely we need to raise early
996 # before we try to run the export. This can be a little messy but is
997 better than exporting files from the first datastore only to find
998 that a missing dataset is not present in any later datastore either.
999 known = [datastore.knows_these(refs) for datastore in self.datastores]
1000 refs_known: set[DatasetRef] = set()
1001 for known_to_this in known:
1002 refs_known.update({ref for ref, knows_this in known_to_this.items() if knows_this})
1003 missing_count = len(refs) - len(refs_known)
1004 if missing_count:
1005 raise FileNotFoundError(f"Not all datasets known to this datastore. Missing {missing_count}")
1007 # To allow us to slot each result into the right place after
1008 # asking each datastore, create a dict with the index.
1009 ref_positions = {ref: i for i, ref in enumerate(refs)}
1011 # Presize the final export list.
1012 exported: list[FileDataset | None] = [None] * len(refs)
1014 # The order of the returned dataset has to match the order of the
1015 # given refs, even if they are all from different datastores.
1016 for i, datastore in enumerate(self.datastores):
1017 known_to_this = known[i]
1018 filtered = [ref for ref, knows in known_to_this.items() if knows and ref in ref_positions]
1020 try:
1021 this_export = datastore.export(filtered, directory=directory, transfer=transfer)
1022 except NotImplementedError:
1023 # Try the next datastore.
1024 continue
1026 for ref, export in zip(filtered, this_export, strict=True):
1027 # Get the position and also delete it from the list.
1028 exported[ref_positions.pop(ref)] = export
1030 # Every dataset should be accounted for because of the earlier checks
1031 # but make sure that we did fill all the slots to appease mypy.
1032 for i, dataset in enumerate(exported):
1033 if dataset is None: 1033 ↛ 1034
1034 raise FileNotFoundError(f"Failed to export dataset {refs[i]}.")
1035 yield dataset
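    # A usage sketch of exporting from the chain: every ref must be known to
    # at least one child, only one copy per dataset is exported, and the
    # output order matches the input order (the directory path is an example):
    #
    #     file_datasets = list(
    #         chained.export(refs, directory="/tmp/export", transfer="copy")
    #     )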
1037 def transfer_from(
1038 self,
1039 source_datastore: Datastore,
1040 refs: Iterable[DatasetRef],
1041 transfer: str = "auto",
1042 artifact_existence: dict[ResourcePath, bool] | None = None,
1043 ) -> tuple[set[DatasetRef], set[DatasetRef]]:
1044 # Docstring inherited
1045 # mypy does not understand "type(self) is not type(source)"
1046 if isinstance(source_datastore, ChainedDatastore):
1047 # Both the source and destination are chained datastores.
1048 source_datastores = tuple(source_datastore.datastores)
1049 else:
1050 # The source datastore is different, forward everything to the
1051 # child datastores.
1052 source_datastores = (source_datastore,)
1054 # Need to know the set of all possible refs that could be transferred.
1055 remaining_refs = set(refs)
1057 missing_from_source: set[DatasetRef] | None = None
1058 all_accepted = set()
1059 nsuccess = 0
1060 for source_child in source_datastores:
1061 # If we are reading from a chained datastore, it's possible that
1062 # only a subset of the datastores know about the dataset. We can't
1063 # ask the receiving datastore to copy it when it doesn't exist
1064 # so we have to filter again based on what the source datastore
1065 # understands.
1066 known_to_source = source_child.knows_these(list(refs))
1068 # Need to know that there is a possibility that some of these
1069 # datasets exist but are unknown to the source datastore if
1070 # trust is enabled.
1071 if getattr(source_child, "trustGetRequest", False):
1072 unknown = [ref for ref, known in known_to_source.items() if not known]
1073 existence = source_child.mexists(unknown, artifact_existence)
1074 for ref, exists in existence.items():
1075 known_to_source[ref] = exists
1077 missing = {ref for ref, known in known_to_source.items() if not known}
1078 if missing:
1079 if missing_from_source is None:
1080 missing_from_source = missing
1081 else:
1082 missing_from_source &= missing
1084 # Try to transfer from each source datastore to each child
1085 # datastore. Have to make sure we don't transfer something
1086 # we've already transferred to this destination on later passes.
1088 # Filter the initial list based on the datasets we have
1089 # not yet transferred.
1090 these_refs = []
1091 for ref in refs:
1092 if ref in remaining_refs and known_to_source[ref]:
1093 these_refs.append(ref)
1095 if not these_refs:
1096 # Already transferred all datasets known to this datastore.
1097 continue
1099 for datastore, constraints in zip(self.datastores, self.datastoreConstraints, strict=True):
1100 if constraints is not None: 1100 ↛ 1108
1101 filtered_refs = []
1102 for ref in these_refs:
1103 if constraints.isAcceptable(ref):
1104 filtered_refs.append(ref)
1105 else:
1106 log.debug("Rejecting ref by constraints: %s", ref)
1107 else:
1108 filtered_refs = list(these_refs)
1109 try:
1110 accepted, _ = datastore.transfer_from(
1111 source_child, filtered_refs, transfer, artifact_existence
1112 )
1113 except (TypeError, NotImplementedError):
1114 # The datastores were incompatible.
1115 continue
1116 else:
1117 nsuccess += 1
1119 # Remove the accepted datasets from those remaining.
1120 remaining_refs = remaining_refs - accepted
1122 # Keep track of everything we have accepted.
1123 all_accepted.update(accepted)
1125 if missing_from_source:
1126 for ref in missing_from_source:
1127 log.warning("Asked to transfer dataset %s but no file artifacts exist for it", ref)
1129 if nsuccess == 0: 1129 ↛ 1130
1130 raise TypeError(f"None of the child datastores could accept transfers from {source_datastore!r}")
1132 return all_accepted, remaining_refs
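    # A usage sketch of transferring datasets from another datastore into the
    # chain; ``source`` may itself be a ChainedDatastore, in which case each
    # of its children is consulted in turn (``source`` and ``refs`` are
    # assumed to exist elsewhere):
    #
    #     accepted, not_transferred = chained.transfer_from(
    #         source, refs, transfer="auto"
    #     )
    #     if not_transferred:
    #         ...  # no child datastore accepted these refs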