Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py : 92%

1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
22 """Chained datastore."""
24 __all__ = ("ChainedDatastore",)
26 import time
27 import logging
28 import warnings
29 import itertools
30 from typing import List, Sequence, Optional, Tuple, Any
32 from lsst.utils import doImport
33 from lsst.daf.butler import Datastore, DatastoreConfig, DatasetTypeNotSupportedError, \
34 DatastoreValidationError, Constraints, FileDataset
36 log = logging.getLogger(__name__)
39 class _IngestPrepData(Datastore.IngestPrepData):
40 """Helper class for ChainedDatastore ingest implementation.
42 Parameters
43 ----------
44 children : `list` of `tuple`
45 Pairs of `Datastore`, `IngestPrepData` for all child datastores.
46 """
47 def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
48 super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
49 self.children = children
52 class ChainedDatastore(Datastore):
53 """Chained Datastores to allow reads and writes from multiple datastores.
55 A ChainedDatastore is configured with multiple datastore configurations.
56 A ``put()`` is sent to each child datastore that accepts the dataset. A ``get()``
57 operation is sent to each datastore in turn and the first datastore
58 to return a valid dataset is used.
60 Parameters
61 ----------
62 config : `DatastoreConfig` or `str`
63 Configuration. This configuration must include a ``datastores`` field
64 as a sequence of datastore configurations. The order in this sequence
65 indicates the order to use for read operations.
66 registry : `Registry`
67 Registry to use for storing internal information about the datasets.
68 butlerRoot : `str`, optional
69 New datastore root to use to override the configuration value. This
70 root is sent to each child datastore.
72 Notes
73 -----
74 ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
75 mode. It supports `"copy"`, `"symlink"`, `"relsymlink"`
76 and `"hardlink"` if and only if all its child datastores do.
77 """
79 defaultConfigFile = "datastores/chainedDatastore.yaml"
80 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
81 absolute path. Can be None if no defaults specified.
82 """
84 containerKey = "datastores"
85 """Key to specify where child datastores are configured."""
87 datastores: List[Datastore]
88 """All the child datastores known to this datastore."""
90 datastoreConstraints: Sequence[Optional[Constraints]]
91 """Constraints to be applied to each of the child datastores."""
93 @classmethod
94 def setConfigRoot(cls, root, config, full, overwrite=True):
95 """Set any filesystem-dependent config options for child Datastores to
96 be appropriate for a new empty repository with the given root.
98 Parameters
99 ----------
100 root : `str`
101 Filesystem path to the root of the data repository.
102 config : `Config`
103 A `Config` to update. Only the subset understood by
104 this component will be updated. Will not expand
105 defaults.
106 full : `Config`
107 A complete config with all defaults expanded that can be
108 converted to a `DatastoreConfig`. Read-only and will not be
109 modified by this method.
110 Repository-specific options that should not be obtained
111 from defaults when Butler instances are constructed
112 should be copied from ``full`` to ``config``.
113 overwrite : `bool`, optional
114 If `False`, do not modify a value in ``config`` if the value
115 already exists. Default is always to overwrite with the provided
116 ``root``.
118 Notes
119 -----
120 If a keyword is explicitly defined in the supplied ``config`` it
121 will not be overridden by this method if ``overwrite`` is `False`.
122 This allows explicit values set in external configs to be retained.
123 """
125 # Extract the part of the config we care about updating
126 datastoreConfig = DatastoreConfig(config, mergeDefaults=False)
128 # And the subset of the full config that we can use for reference.
129 # Do not bother with defaults because we are told this already has
130 # them.
131 fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)
133 # Loop over each datastore config and pass the subsets to the
134 # child datastores to process.
136 containerKey = cls.containerKey
137 for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey],
138 fullDatastoreConfig[containerKey])):
139 childConfig = DatastoreConfig(child, mergeDefaults=False)
140 fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
141 datastoreClass = doImport(fullChildConfig["cls"])
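            # Each child gets its own subdirectory beneath the new root,
            # named after the child class and its position in the chain,
            # e.g. "<root>/PosixDatastore_0" for a first child that happens
            # to be a PosixDatastore (the class name is only illustrative).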
142 newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
143 datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)
145 # Reattach to parent
146 datastoreConfig[containerKey, idx] = childConfig
148 # Reattach modified datastore config to parent
149 # If this has a datastore key we attach there, otherwise we assume
150 # this information goes at the top of the config hierarchy.
151 if DatastoreConfig.component in config:
152 config[DatastoreConfig.component] = datastoreConfig
153 else:
154 config.update(datastoreConfig)
156 return
158 def __init__(self, config, registry=None, butlerRoot=None):
159 super().__init__(config, registry)
161 # Scan for child datastores and instantiate them with the same registry
162 self.datastores = []
163 for c in self.config["datastores"]:
164 c = DatastoreConfig(c)
165 datastoreType = doImport(c["cls"])
166 datastore = datastoreType(c, registry, butlerRoot=butlerRoot)
167 log.debug("Creating child datastore %s", datastore.name)
168 self.datastores.append(datastore)
170 # Name ourself based on our children
171 if self.datastores:  # coverage: 171 ↛ 176, the condition on line 171 was never false
172 # We must set the names explicitly
173 self._names = [d.name for d in self.datastores]
174 childNames = ",".join(self.names)
175 else:
176 childNames = "(empty@{})".format(time.time())
177 self._names = [childNames]
178 self.name = "{}[{}]".format(type(self).__qualname__, childNames)
180 # We declare we are ephemeral if all our child datastores declare
181 # they are ephemeral
182 isEphemeral = True
183 for d in self.datastores:
184 if not d.isEphemeral:
185 isEphemeral = False
186 break
187 self.isEphemeral = isEphemeral
189 # per-datastore override constraints
190 if "datastore_constraints" in self.config:
191 overrides = self.config["datastore_constraints"]
193 if len(overrides) != len(self.datastores):  # coverage: 193 ↛ 194, the condition on line 193 was never true
194 raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})"
195 " differs from number of constraints overrides"
196 f" {len(overrides)}")
198 self.datastoreConstraints = [Constraints(c.get("constraints"), universe=self.registry.dimensions)
199 for c in overrides]
201 else:
202 self.datastoreConstraints = (None,) * len(self.datastores)
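        # A hedged sketch of the configuration read above; the key names
        # ("datastore_constraints", "constraints") are the ones used in this
        # method, but the exact YAML nesting and the "accept"/"reject"/"all"
        # values are assumptions, not copied from a shipped config file:
        #
        #   datastore_constraints:
        #     - constraints:
        #         accept:
        #           - all
        #     - constraints:
        #         reject:
        #           - all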
204 log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))
206 @property
207 def names(self):
208 return self._names
210 def __str__(self):
211 chainName = ", ".join(str(ds) for ds in self.datastores)
212 return chainName
214 def exists(self, ref):
215 """Check if the dataset exists in one of the datastores.
217 Parameters
218 ----------
219 ref : `DatasetRef`
220 Reference to the required dataset.
222 Returns
223 -------
224 exists : `bool`
225 `True` if the entity exists in one of the child datastores.
226 """
227 for datastore in self.datastores:
228 if datastore.exists(ref):
229 log.debug("Found %s in datastore %s", ref, datastore.name)
230 return True
231 return False
233 def get(self, ref, parameters=None):
234 """Load an InMemoryDataset from the store.
236 The dataset is returned from the first datastore that has
237 the dataset.
239 Parameters
240 ----------
241 ref : `DatasetRef`
242 Reference to the required Dataset.
243 parameters : `dict`
244 `StorageClass`-specific parameters that specify, for example,
245 a slice of the dataset to be loaded.
247 Returns
248 -------
249 inMemoryDataset : `object`
250 Requested dataset or slice thereof as an InMemoryDataset.
252 Raises
253 ------
254 FileNotFoundError
255 Requested dataset cannot be retrieved.
256 TypeError
257 Return value from formatter has unexpected type.
258 ValueError
259 Formatter failed to process the dataset.
260 """
262 for datastore in self.datastores:
263 try:
264 inMemoryObject = datastore.get(ref, parameters)
265 log.debug("Found dataset %s in datastore %s", ref, datastore.name)
266 return inMemoryObject
267 except FileNotFoundError:
268 pass
270 raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))
272 def put(self, inMemoryDataset, ref):
273 """Write a InMemoryDataset with a given `DatasetRef` to each
274 datastore.
276 The put() to child datastores can fail with
277 `DatasetTypeNotSupportedError`. The put() for this datastore will be
278 deemed to have succeeded so long as at least one child datastore
279 accepted the inMemoryDataset.
281 Parameters
282 ----------
283 inMemoryDataset : `object`
284 The dataset to store.
285 ref : `DatasetRef`
286 Reference to the associated Dataset.
288 Raises
289 ------
290 TypeError
291 Supplied object and storage class are inconsistent.
292 DatasetTypeNotSupportedError
293 All datastores reported `DatasetTypeNotSupportedError`.
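
    Examples
    --------
    A minimal sketch of the partial-success behaviour; ``datastore``,
    ``exposure`` and ``ref`` are assumed to exist and are not constructed
    here::

        from lsst.daf.butler import DatasetTypeNotSupportedError

        try:
            datastore.put(exposure, ref)
        except DatasetTypeNotSupportedError:
            # Raised only when every child datastore rejected the dataset;
            # a put accepted by at least one child counts as success.
            ...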
294 """
295 log.debug("Put %s", ref)
297 # Confirm that we can accept this dataset
298 if not self.constraints.isAcceptable(ref):
299 # Raise rather than use boolean return value.
300 raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via"
301 " configuration.")
303 isPermanent = False
304 nsuccess = 0
305 npermanent = 0
306 nephemeral = 0
307 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
308 if constraints is not None and not constraints.isAcceptable(ref):
309 log.debug("Datastore %s skipping put via configuration for ref %s",
310 datastore.name, ref)
311 continue
313 if datastore.isEphemeral:
314 nephemeral += 1
315 else:
316 npermanent += 1
317 try:
318 datastore.put(inMemoryDataset, ref)
319 nsuccess += 1
320 if not datastore.isEphemeral:
321 isPermanent = True
322 except DatasetTypeNotSupportedError:
323 pass
325 if nsuccess == 0:
326 raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")
328 if not isPermanent and npermanent > 0:  # coverage: 328 ↛ 329, the condition on line 328 was never true
329 warnings.warn(f"Put of {ref} only succeeded in ephemeral databases", stacklevel=2)
331 if self._transaction is not None:
332 self._transaction.registerUndo('put', self.remove, ref)
334 def _overrideTransferMode(self, *datasets: Any, transfer: Optional[str] = None) -> str:
335 # Docstring inherited from base class.
336 if transfer != "auto":
337 return transfer
338 # Ask each datastore what they think auto means
339 transfers = {d._overrideTransferMode(*datasets, transfer=transfer) for d in self.datastores}
341 # Remove any untranslated "auto" values
342 transfers.discard(transfer)
344 if len(transfers) == 1:
345 return transfers.pop()
346 if not transfers:  # coverage: 346 ↛ 350, the condition on line 346 was never false
347 # Everything reported "auto"
348 return transfer
350 raise RuntimeError("Chained datastore does not yet support different transfer modes"
351 f" from 'auto' in each child datastore (wanted {transfers})")
353 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
354 # Docstring inherited from Datastore._prepIngest.
355 if transfer is None or transfer == "move":
356 raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")
358 def isDatasetAcceptable(dataset, *, name, constraints):
359 acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
360 if not acceptable:
361 log.debug("Datastore %s skipping ingest via configuration for refs %s",
362 name, ", ".join(str(ref) for ref in dataset.refs))
363 return False
364 else:
365 return True
367 # Filter down to just datasets the chained datastore's own
368 # configuration accepts.
369 okForParent: List[FileDataset] = [dataset for dataset in datasets
370 if isDatasetAcceptable(dataset, name=self.name,
371 constraints=self.constraints)]
373 # Iterate over nested datastores and call _prepIngest on each.
374 # Save the results to a list:
375 children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
376 # ...and remember whether all of the failures are due to
377 # NotImplementedError being raised.
378 allFailuresAreNotImplementedError = True
379 for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
380 if constraints is not None:
381 okForChild: List[FileDataset] = [dataset for dataset in okForParent
382 if isDatasetAcceptable(dataset, name=datastore.name,
383 constraints=constraints)]
384 else:
385 okForChild: List[FileDataset] = okForParent
386 try:
387 prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
388 except NotImplementedError:
389 log.debug("Skipping ingest for datastore %s because transfer "
390 "mode %s is not supported.", datastore.name, transfer)
391 continue
392 allFailuresAreNotImplementedError = False
393 children.append((datastore, prepDataForChild))
394 if allFailuresAreNotImplementedError:
395 raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
396 return _IngestPrepData(children=children)
398 def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None):
399 # Docstring inherited from Datastore._finishIngest.
400 for datastore, prepDataForChild in prepData.children:
401 datastore._finishIngest(prepDataForChild, transfer=transfer)
403 def getUri(self, ref, predict=False):
404 """URI to the Dataset.
406 The returned URI is from the first datastore in the list that has
407 the dataset, with preference given to a URI coming from a permanent
408 datastore. If no datastore has the dataset and prediction is allowed,
409 a predicted URI is returned, again preferring a permanent datastore
410 over an ephemeral one.
412 Parameters
413 ----------
414 ref : `DatasetRef`
415 Reference to the required Dataset.
416 predict : `bool`
417 If `True`, allow URIs to be returned for datasets that have not
418 been written.
420 Returns
421 -------
422 uri : `str`
423 URI string pointing to the dataset within the datastore. If the
424 dataset does not exist in the datastore, and if ``predict`` is
425 `True`, the URI will be a prediction and will include a URI
426 fragment "#predicted".
428 Notes
429 -----
430 If the datastore does not have entities that relate well
431 to the concept of a URI the returned URI string will be
432 descriptive. The returned URI is not guaranteed to be obtainable.
434 Raises
435 ------
436 FileNotFoundError
437 A URI has been requested for a dataset that does not exist and
438 guessing is not allowed.
439 """
440 log.debug("Requesting URI for %s", ref)
441 predictedUri = None
442 predictedEphemeralUri = None
443 firstEphemeralUri = None
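        # Preference order worked out by the loop below: an existing URI
        # from a permanent child, then an existing URI from an ephemeral
        # child, then a predicted URI from a permanent child, then a
        # predicted ephemeral URI.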
444 for datastore in self.datastores:
445 if datastore.exists(ref):
446 if not datastore.isEphemeral:
447 uri = datastore.getUri(ref)
448 log.debug("Retrieved ephemeral URI: %s", uri)
449 return uri
450 elif firstEphemeralUri is None:
451 firstEphemeralUri = datastore.getUri(ref)
452 elif predict:
453 if predictedUri is None and not datastore.isEphemeral:
454 predictedUri = datastore.getUri(ref, predict)
455 elif predictedEphemeralUri is None and datastore.isEphemeral:
456 predictedEphemeralUri = datastore.getUri(ref, predict)
458 if firstEphemeralUri is not None:
459 log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
460 return firstEphemeralUri
462 if predictedUri is not None:
463 log.debug("Retrieved predicted URI: %s", predictedUri)
464 return predictedUri
466 if predictedEphemeralUri is not None:
467 log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
468 return predictedEphemeralUri
470 raise FileNotFoundError("Dataset {} not in any datastore".format(ref))
472 def remove(self, ref):
473 """Indicate to the datastore that a dataset can be removed.
475 The dataset will be removed from each datastore. The dataset is
476 not required to exist in every child datastore.
478 Parameters
479 ----------
480 ref : `DatasetRef`
481 Reference to the required dataset.
483 Raises
484 ------
485 FileNotFoundError
486 Attempt to remove a dataset that does not exist. Raised if none
487 of the child datastores removed the dataset.
488 """
489 log.debug(f"Removing {ref}")
490 self.trash(ref, ignore_errors=False)
491 self.emptyTrash(ignore_errors=False)
493 def trash(self, ref, ignore_errors=True):
494 log.debug("Trashing %s", ref)
496 counter = 0
497 for datastore in self.datastores:
498 try:
499 datastore.trash(ref, ignore_errors=ignore_errors)
500 counter += 1
501 except FileNotFoundError:
502 pass
504 if counter == 0:
505 err_msg = f"Could not mark for removal from any child datastore: {ref}"
506 if ignore_errors:  # coverage: 506 ↛ 507, the condition on line 506 was never true
507 log.warning(err_msg)
508 else:
509 raise FileNotFoundError(err_msg)
511 def emptyTrash(self, ignore_errors=True):
512 for datastore in self.datastores:
513 datastore.emptyTrash(ignore_errors=ignore_errors)
515 def transfer(self, inputDatastore, ref):
516 """Retrieve a dataset from an input `Datastore`,
517 and store the result in this `Datastore`.
519 Parameters
520 ----------
521 inputDatastore : `Datastore`
522 The external `Datastore` from which to retrieve the Dataset.
523 ref : `DatasetRef`
524 Reference to the required dataset in the input data store.
526 Returns
527 -------
528 results : `list`
529 List containing the return value from the ``put()`` to each
530 child datastore.
531 """
532 assert inputDatastore is not self # unless we want it for renames?
533 inMemoryDataset = inputDatastore.get(ref)
534 return [datastore.put(inMemoryDataset, ref) for datastore in self.datastores]
536 def validateConfiguration(self, entities, logFailures=False):
537 """Validate some of the configuration for this datastore.
539 Parameters
540 ----------
541 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
542 Entities to test against this configuration. Can be differing
543 types.
544 logFailures : `bool`, optional
545 If `True`, output a log message for every validation error
546 detected.
548 Raises
549 ------
550 DatastoreValidationError
551 Raised if there is a validation problem with a configuration.
552 All the problems are reported in a single exception.
554 Notes
555 -----
556 This method checks each datastore in turn.
557 """
559 # Need to catch each of the datastore outputs and ensure that
560 # all are tested.
561 failures = []
562 for datastore in self.datastores:
563 try:
564 datastore.validateConfiguration(entities, logFailures=logFailures)
565 except DatastoreValidationError as e:
566 if logFailures:  # coverage: 566 ↛ 568, the condition on line 566 was never false
567 log.fatal("Datastore %s failed validation", datastore.name)
568 failures.append(f"Datastore {self.name}: {e}")
570 if failures:
571 msg = ";\n".join(failures)
572 raise DatastoreValidationError(msg)
574 def validateKey(self, lookupKey, entity):
575 # Docstring is inherited from base class
576 failures = []
577 for datastore in self.datastores:
578 try:
579 datastore.validateKey(lookupKey, entity)
580 except DatastoreValidationError as e:
581 failures.append(f"Datastore {self.name}: {e}")
583 if failures:
584 msg = ";\n".join(failures)
585 raise DatastoreValidationError(msg)
587 def getLookupKeys(self):
588 # Docstring is inherited from base class
589 keys = set()
590 for datastore in self.datastores:
591 keys.update(datastore.getLookupKeys())
593 keys.update(self.constraints.getLookupKeys())
594 for p in self.datastoreConstraints:
595 if p is not None:  # coverage: 595 ↛ 596, the condition on line 595 was never true
596 keys.update(p.getLookupKeys())
598 return keys