Coverage for python/lsst/daf/butler/datastores/chainedDatastore.py : 93%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Chained datastore."""

__all__ = ("ChainedDatastore",)

import time
import logging
import warnings
import itertools
from typing import List, Sequence, Optional, Tuple

from lsst.utils import doImport
from lsst.daf.butler import Datastore, DatastoreConfig, DatasetTypeNotSupportedError, \
    DatastoreValidationError, Constraints, FileDataset

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for ChainedDatastore ingest implementation.

    Parameters
    ----------
    children : `list` of `tuple`
        Pairs of `Datastore`, `IngestPrepData` for all child datastores.
    """
    def __init__(self, children: List[Tuple[Datastore, Datastore.IngestPrepData]]):
        super().__init__(itertools.chain.from_iterable(data.refs.values() for _, data in children))
        self.children = children


class ChainedDatastore(Datastore):
    """Chained Datastores to allow reads and writes from multiple datastores.

    A ChainedDatastore is configured with multiple datastore configurations.
    A ``put()`` is sent to every child datastore whose constraints accept the
    dataset.  A ``get()`` operation is sent to each datastore in turn and the
    first datastore to return a valid dataset is used.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration.  This configuration must include a ``datastores`` field
        as a sequence of datastore configurations.  The order in this sequence
        indicates the order to use for read operations.
    registry : `Registry`
        Registry to use for storing internal information about the datasets.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.  This
        root is sent to each child datastore.

    Notes
    -----
    ChainedDatastore never supports `None` or `"move"` as an `ingest` transfer
    mode.  It supports `"copy"`, `"symlink"`, and `"hardlink"` if and only if
    its child datastores do.
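
    Examples
    --------
    An illustrative sketch only; the configuration file name below is a
    placeholder rather than a default shipped with this package::

        config = DatastoreConfig("myChainedDatastore.yaml")
        datastore = ChainedDatastore(config, registry)

        datastore.put(inMemoryDataset, ref)  # sent to every accepting child
        obj = datastore.get(ref)             # first child holding it answers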
77 """

    defaultConfigFile = "datastores/chainedDatastore.yaml"
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    containerKey = "datastores"
    """Key to specify where child datastores are configured."""

    datastores: List[Datastore]
    """All the child datastores known to this datastore."""

    datastoreConstraints: Sequence[Optional[Constraints]]
    """Constraints to be applied to each of the child datastores."""

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for child Datastores to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists.  Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
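
        Examples
        --------
        Illustrative sketch only; this classmethod is normally invoked on
        your behalf when a new repository is created (for example by
        ``Butler.makeRepo``)::

            ChainedDatastore.setConfigRoot("/path/to/new/repo", config, full)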
123 """

        # Extract the part of the config we care about updating
        datastoreConfig = DatastoreConfig(config, mergeDefaults=False)

        # And the subset of the full config that we can use for reference.
        # Do not bother with defaults because we are told this already has
        # them.
        fullDatastoreConfig = DatastoreConfig(full, mergeDefaults=False)

        # Loop over each datastore config and pass the subsets to the
        # child datastores to process.

        containerKey = cls.containerKey
        for idx, (child, fullChild) in enumerate(zip(datastoreConfig[containerKey],
                                                     fullDatastoreConfig[containerKey])):
            childConfig = DatastoreConfig(child, mergeDefaults=False)
            fullChildConfig = DatastoreConfig(fullChild, mergeDefaults=False)
            datastoreClass = doImport(fullChildConfig["cls"])
            newroot = "{}/{}_{}".format(root, datastoreClass.__qualname__, idx)
            datastoreClass.setConfigRoot(newroot, childConfig, fullChildConfig, overwrite=overwrite)

            # Reattach to parent
            datastoreConfig[containerKey, idx] = childConfig

        # Reattach modified datastore config to parent
        # If this has a datastore key we attach there, otherwise we assume
        # this information goes at the top of the config hierarchy.
        if DatastoreConfig.component in config:
            config[DatastoreConfig.component] = datastoreConfig
        else:
            config.update(datastoreConfig)

        return

    def __init__(self, config, registry=None, butlerRoot=None):
        super().__init__(config, registry)

        # Scan for child datastores and instantiate them with the same registry
        self.datastores = []
        for c in self.config["datastores"]:
            c = DatastoreConfig(c)
            datastoreType = doImport(c["cls"])
            datastore = datastoreType(c, registry, butlerRoot=butlerRoot)
            log.debug("Creating child datastore %s", datastore.name)
            self.datastores.append(datastore)

        # Name ourself based on our children
        if self.datastores:  # coverage: the empty-chain branch is never exercised
            # We must set the names explicitly
            self._names = [d.name for d in self.datastores]
            childNames = ",".join(self.names)
        else:
            childNames = "(empty@{})".format(time.time())
            self._names = [childNames]
        self.name = "{}[{}]".format(type(self).__qualname__, childNames)

        # We declare we are ephemeral if all our child datastores declare
        # they are ephemeral
        isEphemeral = True
        for d in self.datastores:
            if not d.isEphemeral:
                isEphemeral = False
                break
        self.isEphemeral = isEphemeral

        # per-datastore override constraints
        if "datastore_constraints" in self.config:
            overrides = self.config["datastore_constraints"]

            if len(overrides) != len(self.datastores):  # coverage: mismatch branch never exercised
                raise DatastoreValidationError(f"Number of registered datastores ({len(self.datastores)})"
                                               " differs from number of constraints overrides"
                                               f" {len(overrides)}")

            self.datastoreConstraints = [Constraints(c.get("constraints"), universe=self.registry.dimensions)
                                         for c in overrides]

        else:
            self.datastoreConstraints = (None,) * len(self.datastores)

        log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent"))

    @property
    def names(self):
        return self._names

    def __str__(self):
        chainName = ", ".join(str(ds) for ds in self.datastores)
        return chainName

    def exists(self, ref):
        """Check if the dataset exists in one of the datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in one of the child datastores.
        """
        for datastore in self.datastores:
            if datastore.exists(ref):
                log.debug("Found %s in datastore %s", ref, datastore.name)
                return True
        return False

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        The dataset is returned from the first datastore that has
        the dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`, optional
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
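
        Examples
        --------
        Illustrative sketch only; the parameter name is a placeholder and
        depends on the dataset's `StorageClass`::

            try:
                subset = datastore.get(ref, parameters={"bbox": bbox})
            except FileNotFoundError:
                ...  # no child datastore holds this dataset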
260 """

        for datastore in self.datastores:
            try:
                inMemoryObject = datastore.get(ref, parameters)
                log.debug("Found Dataset %s in datastore %s", ref, datastore.name)
                return inMemoryObject
            except FileNotFoundError:
                pass

        raise FileNotFoundError("Dataset {} could not be found in any of the datastores".format(ref))

    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to each
        datastore.

        The put() to child datastores can fail with
        `DatasetTypeNotSupportedError`.  The put() for this datastore will be
        deemed to have succeeded so long as at least one child datastore
        accepted the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            All datastores reported `DatasetTypeNotSupportedError`.
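
        Examples
        --------
        Illustrative sketch only::

            try:
                datastore.put(inMemoryDataset, ref)
            except DatasetTypeNotSupportedError:
                ...  # every child datastore rejected this dataset type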
294 """
        log.debug("Put %s", ref)

        # Confirm that we can accept this dataset
        if not self.constraints.isAcceptable(ref):
            # Raise rather than use boolean return value.
            raise DatasetTypeNotSupportedError(f"Dataset {ref} has been rejected by this datastore via"
                                               " configuration.")

        isPermanent = False
        nsuccess = 0
        npermanent = 0
        nephemeral = 0
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if constraints is not None and not constraints.isAcceptable(ref):
                log.debug("Datastore %s skipping put via configuration for ref %s",
                          datastore.name, ref)
                continue

            if datastore.isEphemeral:
                nephemeral += 1
            else:
                npermanent += 1
            try:
                datastore.put(inMemoryDataset, ref)
                nsuccess += 1
                if not datastore.isEphemeral:
                    isPermanent = True
            except DatasetTypeNotSupportedError:
                pass

        if nsuccess == 0:
            raise DatasetTypeNotSupportedError(f"None of the chained datastores supported ref {ref}")

        if not isPermanent and npermanent > 0:  # coverage: warning branch never exercised
            warnings.warn(f"Put of {ref} only succeeded in ephemeral datastores", stacklevel=2)

        if self._transaction is not None:
            self._transaction.registerUndo('put', self.remove, ref)

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        if transfer is None or transfer == "move":
            raise NotImplementedError("ChainedDatastore does not support transfer=None or transfer='move'.")

        def isDatasetAcceptable(dataset, *, name, constraints):
            acceptable = [ref for ref in dataset.refs if constraints.isAcceptable(ref)]
            if not acceptable:
                log.debug("Datastore %s skipping ingest via configuration for refs %s",
                          name, ", ".join(str(ref) for ref in dataset.refs))
                return False
            else:
                return True

        # Filter down to just datasets the chained datastore's own
        # configuration accepts.
        okForParent: List[FileDataset] = [dataset for dataset in datasets
                                          if isDatasetAcceptable(dataset, name=self.name,
                                                                 constraints=self.constraints)]

        # Iterate over nested datastores and call _prepIngest on each.
        # Save the results to a list:
        children: List[Tuple[Datastore, Datastore.IngestPrepData]] = []
        # ...and remember whether all of the failures are due to
        # NotImplementedError being raised.
        allFailuresAreNotImplementedError = True
        for datastore, constraints in zip(self.datastores, self.datastoreConstraints):
            if constraints is not None:
                okForChild: List[FileDataset] = [dataset for dataset in okForParent
                                                 if isDatasetAcceptable(dataset, name=datastore.name,
                                                                        constraints=constraints)]
            else:
                okForChild: List[FileDataset] = okForParent
            try:
                prepDataForChild = datastore._prepIngest(*okForChild, transfer=transfer)
            except NotImplementedError:
                log.debug("Skipping ingest for datastore %s because transfer "
                          "mode %s is not supported.", datastore.name, transfer)
                continue
            allFailuresAreNotImplementedError = False
            children.append((datastore, prepDataForChild))
        if allFailuresAreNotImplementedError:
            raise NotImplementedError(f"No child datastore supports transfer mode {transfer}.")
        return _IngestPrepData(children=children)

    def _finishIngest(self, prepData: _IngestPrepData, *, transfer: Optional[str] = None):
        # Docstring inherited from Datastore._finishIngest.
        for datastore, prepDataForChild in prepData.children:
            datastore._finishIngest(prepDataForChild, transfer=transfer)
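
    # Ingest on a ChainedDatastore is driven through the inherited
    # ``Datastore.ingest`` entry point, which invokes the ``_prepIngest`` /
    # ``_finishIngest`` pair above for each child datastore.  Illustrative
    # sketch only (the file path is a placeholder)::
    #
    #     datastore.ingest(FileDataset(path="/data/raw.fits", refs=[ref]),
    #                      transfer="copy")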

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        The returned URI is from the first datastore in the list that has
        the dataset, with preference given to permanent (non-ephemeral)
        datastores.  If no datastore has the dataset and prediction is
        allowed, the predicted URI for the first datastore in the list
        will be returned.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".

        Notes
        -----
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
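
        Examples
        --------
        Illustrative sketch only; the exact URI depends entirely on the
        child datastore that answers::

            uri = datastore.getUri(ref, predict=True)
            # e.g. "file:///repo/raw/raw_v1.fits#predicted" if the dataset
            # has not been written yet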
420 """
        log.debug("Requesting URI for %s", ref)
        predictedUri = None
        predictedEphemeralUri = None
        firstEphemeralUri = None
        for datastore in self.datastores:
            if datastore.exists(ref):
                if not datastore.isEphemeral:
                    uri = datastore.getUri(ref)
                    log.debug("Retrieved non-ephemeral URI: %s", uri)
                    return uri
                elif firstEphemeralUri is None:
                    firstEphemeralUri = datastore.getUri(ref)
            elif predict:
                if predictedUri is None and not datastore.isEphemeral:
                    predictedUri = datastore.getUri(ref, predict)
                elif predictedEphemeralUri is None and datastore.isEphemeral:
                    predictedEphemeralUri = datastore.getUri(ref, predict)

        if firstEphemeralUri is not None:
            log.debug("Retrieved ephemeral URI: %s", firstEphemeralUri)
            return firstEphemeralUri

        if predictedUri is not None:
            log.debug("Retrieved predicted URI: %s", predictedUri)
            return predictedUri

        if predictedEphemeralUri is not None:
            log.debug("Retrieved predicted ephemeral URI: %s", predictedEphemeralUri)
            return predictedEphemeralUri

        raise FileNotFoundError("Dataset {} not in any datastore".format(ref))

    def remove(self, ref):
        """Indicate to the Datastore that a Dataset can be removed.

        The dataset will be removed from each datastore.  The dataset is
        not required to exist in every child datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist. Raised if none
            of the child datastores removed the dataset.
        """
        log.debug("Removing %s", ref)

        counter = 0
        for datastore in self.datastores:
            try:
                datastore.remove(ref)
                counter += 1
            except FileNotFoundError:
                pass

        if counter == 0:
            raise FileNotFoundError(f"Could not remove from any child datastore: {ref}")

    def transfer(self, inputDatastore, ref):
        """Retrieve a Dataset from an input `Datastore`,
        and store the result in this `Datastore`.

        Parameters
        ----------
        inputDatastore : `Datastore`
            The external `Datastore` from which to retrieve the Dataset.
        ref : `DatasetRef`
            Reference to the required Dataset in the input data store.

        Returns
        -------
        results : `list`
            List containing the return value from the ``put()`` to each
            child datastore.
        """
        assert inputDatastore is not self  # unless we want it for renames?
        inMemoryDataset = inputDatastore.get(ref)
        return [datastore.put(inMemoryDataset, ref) for datastore in self.datastores]

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration.  Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks each datastore in turn.
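
        Examples
        --------
        Illustrative sketch only::

            try:
                datastore.validateConfiguration([datasetType], logFailures=True)
            except DatastoreValidationError as e:
                log.error("Validation failed: %s", e)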
525 """

        # Need to catch each of the datastore outputs and ensure that
        # all are tested.
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateConfiguration(entities, logFailures=logFailures)
            except DatastoreValidationError as e:
                if logFailures:  # coverage: the False case is never exercised
                    log.fatal("Datastore %s failed validation", datastore.name)
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        failures = []
        for datastore in self.datastores:
            try:
                datastore.validateKey(lookupKey, entity)
            except DatastoreValidationError as e:
                failures.append(f"Datastore {self.name}: {e}")

        if failures:
            msg = ";\n".join(failures)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self):
        # Docstring is inherited from base class
        keys = set()
        for datastore in self.datastores:
            keys.update(datastore.getLookupKeys())

        keys.update(self.constraints.getLookupKeys())
        for p in self.datastoreConstraints:
            if p is not None:  # coverage: the True case is never exercised
                keys.update(p.getLookupKeys())

        return keys