Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 90%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Generic file-based datastore code."""
24__all__ = ("FileLikeDatastore", )
26import logging
27import itertools
28from abc import abstractmethod
30from sqlalchemy import Integer, String
32from dataclasses import dataclass
33from typing import Optional, List, Type
35from lsst.daf.butler import (
36 Config,
37 FileDataset,
38 DatasetRef,
39 DatasetTypeNotSupportedError,
40 Datastore,
41 DatastoreConfig,
42 DatastoreValidationError,
43 FileDescriptor,
44 FileTemplates,
45 FileTemplateValidationError,
46 Formatter,
47 FormatterFactory,
48 Location,
49 LocationFactory,
50 StorageClass,
51 StoredFileInfo,
52)
54from lsst.daf.butler import ddl
55from lsst.daf.butler.registry.interfaces import ReadOnlyDatabaseError
57from lsst.daf.butler.core.repoRelocation import replaceRoot
58from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
59from .genericDatastore import GenericBaseDatastore
61log = logging.getLogger(__name__)
64class _IngestPrepData(Datastore.IngestPrepData):
65 """Helper class for FileLikeDatastore ingest implementation.
67 Parameters
68 ----------
69 datasets : `list` of `FileDataset`
70 Files to be ingested by this datastore.
71 """
72 def __init__(self, datasets: List[FileDataset]):
73 super().__init__(ref for dataset in datasets for ref in dataset.refs)
74 self.datasets = datasets
77@dataclass(frozen=True)
78class DatastoreFileGetInformation:
79 """Collection of useful parameters needed to retrieve a file from
80 a Datastore.
81 """
83 location: Location
84 """The location from which to read the dataset."""
86 formatter: Formatter
87 """The `Formatter` to use to deserialize the dataset."""
89 info: StoredFileInfo
90 """Stored information about this file and its formatter."""
92 assemblerParams: dict
93 """Parameters to use for post-processing the retrieved dataset."""
95 component: Optional[str]
96 """The component to be retrieved (can be `None`)."""
98 readStorageClass: StorageClass
99 """The `StorageClass` of the dataset being read."""
102class FileLikeDatastore(GenericBaseDatastore):
103 """Generic Datastore for file-based implementations.
105 Should always be sub-classed since key abstract methods are missing.
107 Parameters
108 ----------
109 config : `DatastoreConfig` or `str`
110 Configuration as either a `Config` object or URI to file.
112 Raises
113 ------
114 ValueError
115 If root location does not exist and ``create`` is `False` in the
116 configuration.
117 """
119 defaultConfigFile = None
120 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
121 absolute path. Can be None if no defaults specified.
122 """
124 root: str
125 """Root directory or URI of this `Datastore`."""
127 locationFactory: LocationFactory
128 """Factory for creating locations relative to the datastore root."""
130 formatterFactory: FormatterFactory
131 """Factory for creating instances of formatters."""
133 templates: FileTemplates
134 """File templates that can be used by this `Datastore`."""
136 @classmethod
137 def setConfigRoot(cls, root, config, full, overwrite=True):
138 """Set any filesystem-dependent config options for this Datastore to
139 be appropriate for a new empty repository with the given root.
141 Parameters
142 ----------
143 root : `str`
144 URI to the root of the data repository.
145 config : `Config`
146 A `Config` to update. Only the subset understood by
147 this component will be updated. Will not expand
148 defaults.
149 full : `Config`
150 A complete config with all defaults expanded that can be
151 converted to a `DatastoreConfig`. Read-only and will not be
152 modified by this method.
153 Repository-specific options that should not be obtained
154 from defaults when Butler instances are constructed
155 should be copied from ``full`` to ``config``.
156 overwrite : `bool`, optional
157 If `False`, do not modify a value in ``config`` if the value
158 already exists. Default is always to overwrite with the provided
159 ``root``.
161 Notes
162 -----
163 If a keyword is explicitly defined in the supplied ``config`` it
164 will not be overridden by this method if ``overwrite`` is `False`.
165 This allows explicit values set in external configs to be retained.
166 """
167 Config.updateParameters(DatastoreConfig, config, full,
168 toUpdate={"root": root},
169 toCopy=("cls", ("records", "table")), overwrite=overwrite)
171 @classmethod
172 def makeTableSpec(cls):
173 return ddl.TableSpec(
174 fields=NamedValueSet([
175 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
176 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
177 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
178 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
179 # TODO: should checksum be Base64Bytes instead?
180 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
181 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
182 ]),
183 unique=frozenset(),
184 foreignKeys=[ddl.ForeignKeySpec(table="dataset", source=("dataset_id",), target=("dataset_id",),
185 onDelete="CASCADE")]
186 )
188 def __init__(self, config, registry, butlerRoot=None):
189 super().__init__(config, registry)
190 if "root" not in self.config: 190 ↛ 191line 190 didn't jump to line 191, because the condition on line 190 was never true
191 raise ValueError("No root directory specified in configuration")
193 # Name ourselves either using an explicit name or a name
194 # derived from the (unexpanded) root
195 if "name" in self.config:
196 self.name = self.config["name"]
197 else:
198 # We use the unexpanded root in the name to indicate that this
199 # datastore can be moved without having to update registry.
200 self.name = "{}@{}".format(type(self).__name__,
201 self.config["root"])
203 # Support repository relocation in config
204 # Existence of self.root is checked in subclass
205 self.root = replaceRoot(self.config["root"], butlerRoot)
207 self.locationFactory = LocationFactory(self.root)
208 self.formatterFactory = FormatterFactory()
210 # Now associate formatters with storage classes
211 self.formatterFactory.registerFormatters(self.config["formatters"],
212 universe=self.registry.dimensions)
214 # Read the file naming templates
215 self.templates = FileTemplates(self.config["templates"],
216 universe=self.registry.dimensions)
218 # Storage of paths and formatters, keyed by dataset_id
219 self._tableName = self.config["records", "table"]
220 try:
221 registry.registerOpaqueTable(self._tableName, self.makeTableSpec())
222 except ReadOnlyDatabaseError:
223 # If the database is read only and we just tried and failed to
224 # create a table, it means someone is trying to create a read-only
225 # butler client for an empty repo. That should be okay, as long
226 # as they then try to get any datasets before some other client
227 # creates the table. Chances are they're just validating
228 # configuration.
229 pass
231 # Determine whether checksums should be used
232 self.useChecksum = self.config.get("checksum", True)
234 def __str__(self):
235 return self.root
237 def addStoredItemInfo(self, refs, infos):
238 # Docstring inherited from GenericBaseDatastore
239 records = []
240 for ref, info in zip(refs, infos):
241 records.append(
242 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
243 storage_class=info.storageClass.name,
244 checksum=info.checksum, file_size=info.file_size)
245 )
246 self.registry.insertOpaqueData(self._tableName, *records)
248 def getStoredItemInfo(self, ref):
249 # Docstring inherited from GenericBaseDatastore
250 records = list(self.registry.fetchOpaqueData(self._tableName, dataset_id=ref.id))
251 if len(records) == 0:
252 raise KeyError(f"Unable to retrieve location associated with Dataset {ref}.")
253 assert len(records) == 1, "Primary key constraint should make more than one result impossible."
254 record = records[0]
255 # Convert name of StorageClass to instance
256 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
257 return StoredFileInfo(formatter=record["formatter"],
258 path=record["path"],
259 storageClass=storageClass,
260 checksum=record["checksum"],
261 file_size=record["file_size"])
263 def _registered_refs_per_artifact(self, pathInStore):
264 """Return all dataset refs associated with the supplied path.
266 Parameters
267 ----------
268 pathInStore : `str`
269 Path of interest in the data store.
271 Returns
272 -------
273 ids : `set` of `int`
274 All `DatasetRef` IDs associated with this path.
275 """
276 records = list(self.registry.fetchOpaqueData(self._tableName, path=pathInStore))
277 ids = {r["dataset_id"] for r in records}
278 return ids
280 def removeStoredItemInfo(self, ref):
281 # Docstring inherited from GenericBaseDatastore
282 self.registry.deleteOpaqueData(self._tableName, dataset_id=ref.id)
284 def _get_dataset_location_info(self, ref):
285 """Find the `Location` of the requested dataset in the
286 `Datastore` and the associated stored file information.
288 Parameters
289 ----------
290 ref : `DatasetRef`
291 Reference to the required `Dataset`.
293 Returns
294 -------
295 location : `Location`
296 Location of the dataset within the datastore.
297 Returns `None` if the dataset can not be located.
298 info : `StoredFileInfo`
299 Stored information about this file and its formatter.
300 """
301 # Get the file information (this will fail if no file)
302 try:
303 storedFileInfo = self.getStoredItemInfo(ref)
304 except KeyError:
305 return None, None
307 # Use the path to determine the location
308 location = self.locationFactory.fromPath(storedFileInfo.path)
310 return location, storedFileInfo
312 def _can_remove_dataset_artifact(self, ref):
313 """Check that there is only one dataset associated with the
314 specified artifact.
316 Parameters
317 ----------
318 ref : `DatasetRef`
319 Dataset to be removed.
321 Returns
322 -------
323 can_remove : `bool`
324 `True` if the artifact can be safely removed.
325 """
326 storedFileInfo = self.getStoredItemInfo(ref)
328 # Get all entries associated with this path
329 allRefs = self._registered_refs_per_artifact(storedFileInfo.path)
330 if not allRefs:  # coverage: 330 ↛ 331 (condition on line 330 was never true)
331 raise RuntimeError(f"Datastore inconsistency error. {storedFileInfo.path} not in registry")
333 # Get all the refs associated with this dataset if it is a composite
334 theseRefs = {r.id for r in itertools.chain([ref], ref.components.values())}
336 # Remove these refs from all the refs and if there is nothing left
337 # then we can delete
338 remainingRefs = allRefs - theseRefs
340 if remainingRefs:
341 return False
342 return True
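# A brief worked example of the bookkeeping above (hypothetical IDs, not taken
# from any real repository): suppose composite dataset 42 and its components
# 43 and 44 all resolve to the artifact "a/b/composite.fits", so the opaque
# table lists dataset_ids {42, 43, 44} for that path.  Removing ref 42 gives
# theseRefs = {42, 43, 44}, remainingRefs is empty, and the artifact can be
# deleted.  If an independent dataset 57 had also been ingested against the
# same file, remainingRefs would be {57} and the file would be kept.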
344 def _prepare_for_get(self, ref, parameters=None):
345 """Check parameters for ``get`` and obtain formatter and
346 location.
348 Parameters
349 ----------
350 ref : `DatasetRef`
351 Reference to the required Dataset.
352 parameters : `dict`
353 `StorageClass`-specific parameters that specify, for example,
354 a slice of the Dataset to be loaded.
356 Returns
357 -------
358 getInfo : `DatastoreFileGetInformation`
359 Parameters needed to retrieve the file.
360 """
361 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
363 # Get file metadata and internal metadata
364 location, storedFileInfo = self._get_dataset_location_info(ref)
365 if location is None:
366 raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")
368 # We have a write storage class and a read storage class and they
369 # can be different for concrete composites.
370 readStorageClass = ref.datasetType.storageClass
371 writeStorageClass = storedFileInfo.storageClass
373 # Check that the supplied parameters are suitable for the type read
374 readStorageClass.validateParameters(parameters)
376 # Is this a component request?
377 component = ref.datasetType.component()
379 formatter = getInstanceOf(storedFileInfo.formatter,
380 FileDescriptor(location, readStorageClass=readStorageClass,
381 storageClass=writeStorageClass, parameters=parameters),
382 ref.dataId)
383 formatterParams, assemblerParams = formatter.segregateParameters()
385 return DatastoreFileGetInformation(location, formatter, storedFileInfo,
386 assemblerParams, component, readStorageClass)
388 def _prepare_for_put(self, inMemoryDataset, ref):
389 """Check the arguments for ``put`` and obtain formatter and
390 location.
392 Parameters
393 ----------
394 inMemoryDataset : `object`
395 The Dataset to store.
396 ref : `DatasetRef`
397 Reference to the associated Dataset.
399 Returns
400 -------
401 location : `Location`
402 The location to write the dataset.
403 formatter : `Formatter`
404 The `Formatter` to use to write the dataset.
406 Raises
407 ------
408 TypeError
409 Supplied object and storage class are inconsistent.
410 DatasetTypeNotSupportedError
411 The associated `DatasetType` is not handled by this datastore.
412 """
413 self._validate_put_parameters(inMemoryDataset, ref)
415 # Work out output file name
416 try:
417 template = self.templates.getTemplate(ref)
418 except KeyError as e:
419 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
421 location = self.locationFactory.fromPath(template.format(ref))
423 # Get the formatter based on the storage class
424 storageClass = ref.datasetType.storageClass
425 try:
426 formatter = self.formatterFactory.getFormatter(ref,
427 FileDescriptor(location,
428 storageClass=storageClass),
429 ref.dataId)
430 except KeyError as e:
431 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e
433 return location, formatter
435 @abstractmethod
436 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
437 """Standardize the path of a to-be-ingested file.
439 Parameters
440 ----------
441 path : `str`
442 Path of a file to be ingested.
443 transfer : `str`, optional
444 How (and whether) the dataset should be added to the datastore.
445 See `ingest` for details of transfer modes.
446 This implementation is provided only so
447 `NotImplementedError` can be raised if the mode is not supported;
448 actual transfers are deferred to `_extractIngestInfo`.
450 Returns
451 -------
452 path : `str`
453 New path in what the datastore considers standard form.
455 Notes
456 -----
457 Subclasses of `FileLikeDatastore` should implement this method instead
458 of `_prepIngest`. It should not modify the data repository or given
459 file in any way.
461 Raises
462 ------
463 NotImplementedError
464 Raised if the datastore does not support the given transfer mode
465 (including the case where ingest is not supported at all).
466 FileNotFoundError
467 Raised if one of the given files does not exist.
468 """
469 raise NotImplementedError("Must be implemented by subclasses.")
471 @abstractmethod
472 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
473 transfer: Optional[str] = None) -> StoredFileInfo:
474 """Relocate (if necessary) and extract `StoredFileInfo` from a
475 to-be-ingested file.
477 Parameters
478 ----------
479 path : `str`
480 Path of a file to be ingested.
481 ref : `DatasetRef`
482 Reference for the dataset being ingested. Guaranteed to have
483 ``dataset_id`` not `None`.
484 formatter : `type`
485 `Formatter` subclass to use for this dataset.
486 transfer : `str`, optional
487 How (and whether) the dataset should be added to the datastore.
488 See `ingest` for details of transfer modes.
490 Returns
491 -------
492 info : `StoredFileInfo`
493 Internal datastore record for this file. This will be inserted by
494 the caller; `_extractIngestInfo` is only responsible for
495 creating and populating the struct.
497 Raises
498 ------
499 FileNotFoundError
500 Raised if one of the given files does not exist.
501 FileExistsError
502 Raised if transfer is not `None` but the (internal) location the
503 file would be moved to is already occupied.
504 """
505 raise NotImplementedError("Must be implemented by subclasses.")
507 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
508 # Docstring inherited from Datastore._prepIngest.
509 filtered = []
510 for dataset in datasets:
511 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
512 if not acceptable:
513 continue
514 else:
515 dataset.refs = acceptable
516 if dataset.formatter is None:
517 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
518 else:
519 dataset.formatter = getClassOf(dataset.formatter)
520 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
521 filtered.append(dataset)
522 return _IngestPrepData(filtered)
524 @transactional
525 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
526 # Docstring inherited from Datastore._finishIngest.
527 refsAndInfos = []
528 for dataset in prepData.datasets:
529 # Do ingest as if the first dataset ref is associated with the file
530 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
531 transfer=transfer)
532 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
533 self._register_datasets(refsAndInfos)
535 def getUri(self, ref, predict=False):
536 """URI to the Dataset.
538 Parameters
539 ----------
540 ref : `DatasetRef`
541 Reference to the required Dataset.
542 predict : `bool`
543 If `True`, allow URIs to be returned of datasets that have not
544 been written.
546 Returns
547 -------
548 uri : `str`
549 URI string pointing to the Dataset within the datastore. If the
550 Dataset does not exist in the datastore, and if ``predict`` is
551 `True`, the URI will be a prediction and will include a URI
552 fragment "#predicted".
553 If the datastore does not have entities that relate well
554 to the concept of a URI, the returned URI string will be
555 descriptive. The returned URI is not guaranteed to be obtainable.
557 Raises
558 ------
559 FileNotFoundError
560 A URI has been requested for a dataset that does not exist and
561 guessing is not allowed.
563 Notes
564 -----
565 When a predicted URI is requested an attempt will be made to form
566 a reasonable URI based on file templates and the expected formatter.
567 """
568 # if this has never been written then we have to guess
569 if not self.exists(ref):
570 if not predict:
571 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
573 template = self.templates.getTemplate(ref)
574 location = self.locationFactory.fromPath(template.format(ref))
575 storageClass = ref.datasetType.storageClass
576 formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
577 storageClass=storageClass))
578 # Try to use the extension attribute but ignore problems if the
579 # formatter does not define one.
580 try:
581 location = formatter.makeUpdatedLocation(location)
582 except Exception:
583 # Use the default extension
584 pass
586 # Add a URI fragment to indicate this is a guess
587 return location.uri + "#predicted"
589 # If this is a ref that we have written we can get the path.
590 # Get file metadata and internal metadata
591 storedFileInfo = self.getStoredItemInfo(ref)
593 # Use the path to determine the location
594 location = self.locationFactory.fromPath(storedFileInfo.path)
596 return location.uri
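# For example (hypothetical repository root and file template, not values from
# any real configuration), a predicted URI for an unwritten dataset might look
# like "file:///repo/root/calib/flat/flat_r_v1.fits#predicted", whereas an
# existing dataset returns the same style of URI without the "#predicted"
# fragment.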
598 def validateConfiguration(self, entities, logFailures=False):
599 """Validate some of the configuration for this datastore.
601 Parameters
602 ----------
603 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
604 Entities to test against this configuration. Can be differing
605 types.
606 logFailures : `bool`, optional
607 If `True`, output a log message for every validation error
608 detected.
610 Raises
611 ------
612 DatastoreValidationError
613 Raised if there is a validation problem with a configuration.
614 All the problems are reported in a single exception.
616 Notes
617 -----
618 This method checks that all the supplied entities have valid file
619 templates and also have formatters defined.
620 """
622 templateFailed = None
623 try:
624 self.templates.validateTemplates(entities, logFailures=logFailures)
625 except FileTemplateValidationError as e:
626 templateFailed = str(e)
628 formatterFailed = []
629 for entity in entities:
630 try:
631 self.formatterFactory.getFormatterClass(entity)
632 except KeyError as e:
633 formatterFailed.append(str(e))
634 if logFailures:  # coverage: 634 ↛ 629 (condition on line 634 was never false)
635 log.fatal("Formatter failure: %s", e)
637 if templateFailed or formatterFailed:
638 messages = []
639 if templateFailed:  # coverage: 639 ↛ 640 (condition on line 639 was never true)
640 messages.append(templateFailed)
641 if formatterFailed:  # coverage: 641 ↛ 643 (condition on line 641 was never false)
642 messages.append(",".join(formatterFailed))
643 msg = ";\n".join(messages)
644 raise DatastoreValidationError(msg)
646 def getLookupKeys(self):
647 # Docstring is inherited from base class
648 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
649 self.constraints.getLookupKeys()
651 def validateKey(self, lookupKey, entity):
652 # Docstring is inherited from base class
653 # The key can be valid in either formatters or templates so we can
654 # only check the template if it exists
655 if lookupKey in self.templates:
656 try:
657 self.templates[lookupKey].validateTemplate(entity)
658 except FileTemplateValidationError as e:
659 raise DatastoreValidationError(e) from e
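# ---------------------------------------------------------------------------
# A minimal sketch (editor's illustration, not part of this module) of how a
# concrete subclass might fill in the two abstract ingest hooks above.  It
# assumes in-place ingest only (transfer=None), uses os.path for existence and
# size checks, and omits the other abstract Datastore methods (exists, get,
# put, remove, ...) that a real implementation such as PosixDatastore
# provides.
#
# import os.path
#
# class DirectIngestDatastore(FileLikeDatastore):
#     """Hypothetical datastore that ingests files where they already are."""
#
#     def _standardizeIngestPath(self, path, *, transfer=None):
#         # Only in-place ingest is supported in this sketch.
#         if transfer is not None:
#             raise NotImplementedError(f"Transfer mode {transfer} is not supported.")
#         if not os.path.exists(path):
#             raise FileNotFoundError(f"File at '{path}' does not exist.")
#         return path
#
#     def _extractIngestInfo(self, path, ref, *, formatter, transfer=None):
#         # Record the file as-is, keyed by the write storage class of the ref.
#         return StoredFileInfo(formatter=formatter, path=path,
#                               storageClass=ref.datasetType.storageClass,
#                               checksum=None,
#                               file_size=os.path.getsize(path))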