Coverage for python/lsst/daf/butler/datastores/fileLikeDatastore.py : 90%

1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Generic file-based datastore code."""
24__all__ = ("FileLikeDatastore", )
26import logging
27import itertools
28from abc import abstractmethod
30from sqlalchemy import Integer, String
32from dataclasses import dataclass
33from typing import Optional, List, Type
35from lsst.daf.butler import (
36 Config,
37 FileDataset,
38 DatasetRef,
39 DatasetTypeNotSupportedError,
40 Datastore,
41 DatastoreConfig,
42 DatastoreValidationError,
43 FileDescriptor,
44 FileTemplates,
45 FileTemplateValidationError,
46 Formatter,
47 FormatterFactory,
48 Location,
49 LocationFactory,
50 StorageClass,
51 StoredFileInfo,
52)
54 from lsst.daf.butler import ddl
55 from lsst.daf.butler.registry.interfaces import ReadOnlyDatabaseError
57 from lsst.daf.butler.core.repoRelocation import replaceRoot
58 from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
59 from .genericDatastore import GenericBaseDatastore
61 log = logging.getLogger(__name__)
64 class _IngestPrepData(Datastore.IngestPrepData):
65 """Helper class for FileLikeDatastore ingest implementation.
67 Parameters
68 ----------
69 datasets : `list` of `FileDataset`
70 Files to be ingested by this datastore.
71 """
72 def __init__(self, datasets: List[FileDataset]):
73 super().__init__(ref for dataset in datasets for ref in dataset.refs)
74 self.datasets = datasets
77 @dataclass(frozen=True)
78 class DatastoreFileGetInformation:
79 """Collection of useful parameters needed to retrieve a file from
80 a Datastore.
81 """
83 location: Location
84 """The location from which to read the dataset."""
86 formatter: Formatter
87 """The `Formatter` to use to deserialize the dataset."""
89 info: StoredFileInfo
90 """Stored information about this file and its formatter."""
92 assemblerParams: dict
93 """Parameters to use for post-processing the retrieved dataset."""
95 component: Optional[str]
96 """The component to be retrieved (can be `None`)."""
98 readStorageClass: StorageClass
99 """The `StorageClass` of the dataset being read."""
102 class FileLikeDatastore(GenericBaseDatastore):
103 """Generic Datastore for file-based implementations.
105 Should always be sub-classed since key abstract methods are missing.
107 Parameters
108 ----------
109 config : `DatastoreConfig` or `str`
110 Configuration as either a `Config` object or URI to file.
112 Raises
113 ------
114 ValueError
115 If root location does not exist and ``create`` is `False` in the
116 configuration.
117 """
119 defaultConfigFile = None
120 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
121 absolute path. Can be None if no defaults specified.
122 """
124 root: str
125 """Root directory or URI of this `Datastore`."""
127 locationFactory: LocationFactory
128 """Factory for creating locations relative to the datastore root."""
130 formatterFactory: FormatterFactory
131 """Factory for creating instances of formatters."""
133 templates: FileTemplates
134 """File templates that can be used by this `Datastore`."""
136 @classmethod
137 def setConfigRoot(cls, root, config, full, overwrite=True):
138 """Set any filesystem-dependent config options for this Datastore to
139 be appropriate for a new empty repository with the given root.
141 Parameters
142 ----------
143 root : `str`
144 URI to the root of the data repository.
145 config : `Config`
146 A `Config` to update. Only the subset understood by
147 this component will be updated. Will not expand
148 defaults.
149 full : `Config`
150 A complete config with all defaults expanded that can be
151 converted to a `DatastoreConfig`. Read-only and will not be
152 modified by this method.
153 Repository-specific options that should not be obtained
154 from defaults when Butler instances are constructed
155 should be copied from ``full`` to ``config``.
156 overwrite : `bool`, optional
157 If `False`, do not modify a value in ``config`` if the value
158 already exists. Default is always to overwrite with the provided
159 ``root``.
161 Notes
162 -----
163 If a keyword is explicitly defined in the supplied ``config`` it
164 will not be overridden by this method if ``overwrite`` is `False`.
165 This allows explicit values set in external configs to be retained.
166 """
167 Config.updateParameters(DatastoreConfig, config, full,
168 toUpdate={"root": root},
169 toCopy=("cls", ("records", "table")), overwrite=overwrite)
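    # In effect this writes the new ``root`` into ``config`` and copies
    # ``cls`` and ``records.table`` over from ``full``, so a repository
    # retains those repository-specific values even if the shipped defaults
    # later change.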
171 @classmethod
172 def makeTableSpec(cls):
173 return ddl.TableSpec(
174 fields=NamedValueSet([
175 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
176 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
177 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
178 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
179 # TODO: should checksum be Base64Bytes instead?
180 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
181 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
182 ]),
183 unique=frozenset(),
184 foreignKeys=[ddl.ForeignKeySpec(table="dataset", source=("dataset_id",), target=("dataset_id",),
185 onDelete="CASCADE")]
186 )
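    # An illustrative sketch of a single record conforming to the spec above
    # (values are hypothetical; real records are written by addStoredItemInfo
    # below):
    #
    #     {"dataset_id": 42,
    #      "path": "calib/bias/bias_42.fits",
    #      "formatter": "some.package.SomeFormatter",
    #      "storage_class": "ExposureF",
    #      "checksum": None,
    #      "file_size": 1048576}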
188 def __init__(self, config, registry, butlerRoot=None):
189 super().__init__(config, registry)
190 if "root" not in self.config: 190 ↛ 191line 190 didn't jump to line 191, because the condition on line 190 was never true
191 raise ValueError("No root directory specified in configuration")
193 # Name ourselves either using an explicit name or a name
194 # derived from the (unexpanded) root
195 if "name" in self.config:
196 self.name = self.config["name"]
197 else:
198 # We use the unexpanded root in the name to indicate that this
199 # datastore can be moved without having to update registry.
200 self.name = "{}@{}".format(type(self).__name__,
201 self.config["root"])
203 # Support repository relocation in config
204 # Existence of self.root is checked in subclass
205 self.root = replaceRoot(self.config["root"], butlerRoot)
207 self.locationFactory = LocationFactory(self.root)
208 self.formatterFactory = FormatterFactory()
210 # Now associate formatters with storage classes
211 self.formatterFactory.registerFormatters(self.config["formatters"],
212 universe=self.registry.dimensions)
214 # Read the file naming templates
215 self.templates = FileTemplates(self.config["templates"],
216 universe=self.registry.dimensions)
218 # Storage of paths and formatters, keyed by dataset_id
219 self._tableName = self.config["records", "table"]
220 try:
221 registry.registerOpaqueTable(self._tableName, self.makeTableSpec())
222 except ReadOnlyDatabaseError:
223 # If the database is read only and we just tried and failed to
224 # create a table, it means someone is trying to create a read-only
225 # butler client for an empty repo. That should be okay, as long
226 # as they don't then try to get any datasets before some other
227 # client creates the table. Chances are they're just validating
228 # configuration.
229 pass
231 # Determine whether checksums should be used
232 self.useChecksum = self.config.get("checksum", True)
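    # A minimal sketch of the configuration keys consumed above (key names
    # follow the lookups in __init__; the values shown are hypothetical):
    #
    #     root: <butlerRoot>/datastore
    #     records:
    #       table: file_datastore_records
    #     checksum: true
    #     formatters: ...     # mapping from dataset type / storage class to Formatter
    #     templates: ...      # file name templates keyed by dataset type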
234 def __str__(self):
235 return self.root
237 def addStoredItemInfo(self, refs, infos):
238 # Docstring inherited from GenericBaseDatastore
239 records = []
240 for ref, info in zip(refs, infos):
241 records.append(
242 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
243 storage_class=info.storageClass.name,
244 checksum=info.checksum, file_size=info.file_size)
245 )
246 self.registry.insertOpaqueData(self._tableName, *records)
248 def getStoredItemInfo(self, ref):
249 # Docstring inherited from GenericBaseDatastore
250 records = list(self.registry.fetchOpaqueData(self._tableName, dataset_id=ref.id))
251 if len(records) == 0:
252 raise KeyError(f"Unable to retrieve location associated with Dataset {ref}.")
253 assert len(records) == 1, "Primary key constraint should make more than one result impossible."
254 record = records[0]
255 # Convert name of StorageClass to instance
256 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
257 return StoredFileInfo(formatter=record["formatter"],
258 path=record["path"],
259 storageClass=storageClass,
260 checksum=record["checksum"],
261 file_size=record["file_size"])
263 def _registered_refs_per_artifact(self, pathInStore):
264 """Return all dataset refs associated with the supplied path.
266 Parameters
267 ----------
268 pathInStore : `str`
269 Path of interest in the data store.
271 Returns
272 -------
273 ids : `set` of `int`
274 All `DatasetRef` IDs associated with this path.
275 """
276 records = list(self.registry.fetchOpaqueData(self._tableName, path=pathInStore))
277 ids = {r["dataset_id"] for r in records}
278 return ids
280 def removeStoredItemInfo(self, ref):
281 # Docstring inherited from GenericBaseDatastore
282 self.registry.deleteOpaqueData(self._tableName, dataset_id=ref.id)
284 def _get_dataset_location_info(self, ref):
285 """Find the `Location` of the requested dataset in the
286 `Datastore` and the associated stored file information.
288 Parameters
289 ----------
290 ref : `DatasetRef`
291 Reference to the required `Dataset`.
293 Returns
294 -------
295 location : `Location`
296 Location of the dataset within the datastore.
297 Returns `None` if the dataset can not be located.
298 info : `StoredFileInfo`
299 Stored information about this file and its formatter.
300 """
301 # Get the file information (this will fail if no file)
302 try:
303 storedFileInfo = self.getStoredItemInfo(ref)
304 except KeyError:
305 return None, None
307 # Use the path to determine the location
308 location = self.locationFactory.fromPath(storedFileInfo.path)
310 return location, storedFileInfo
312 def _can_remove_dataset_artifact(self, ref):
313 """Check that there is only one dataset associated with the
314 specified artifact.
316 Parameters
317 ----------
318 ref : `DatasetRef`
319 Dataset to be removed.
321 Returns
322 -------
323 can_remove : `bool`
324 `True` if the artifact can be safely removed.
325 """
326 storedFileInfo = self.getStoredItemInfo(ref)
328 # Get all entries associated with this path
329 allRefs = self._registered_refs_per_artifact(storedFileInfo.path)
330 if not allRefs: 330 ↛ 331 (line 330 didn't jump to line 331, because the condition on line 330 was never true)
331 raise RuntimeError(f"Datastore inconsistency error. {storedFileInfo.path} not in registry")
333 # Get all the refs associated with this dataset if it is a composite
334 theseRefs = {r.id for r in itertools.chain([ref], ref.components.values())}
336 # Remove these refs from all the refs and if there is nothing left
337 # then we can delete
338 remainingRefs = allRefs - theseRefs
340 if remainingRefs:
341 return False
342 return True
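    # Worked example (hypothetical IDs): if the artifact at ``path`` is shared
    # by dataset_ids {1, 2, 3} and the composite being removed contributes
    # refs {1, 2}, then remainingRefs == {3}, the artifact must be kept, and
    # this returns False.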
344 def _prepare_for_get(self, ref, parameters=None):
345 """Check parameters for ``get`` and obtain formatter and
346 location.
348 Parameters
349 ----------
350 ref : `DatasetRef`
351 Reference to the required Dataset.
352 parameters : `dict`
353 `StorageClass`-specific parameters that specify, for example,
354 a slice of the Dataset to be loaded.
356 Returns
357 -------
358 getInfo : `DatastoreFileGetInformation`
359 Parameters needed to retrieve the file.
360 """
361 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)
363 # Get file metadata and internal metadata
364 location, storedFileInfo = self._get_dataset_location_info(ref)
365 if location is None:
366 raise FileNotFoundError(f"Could not retrieve Dataset {ref}.")
368 # We have a write storage class and a read storage class and they
369 # can be different for concrete composites.
370 readStorageClass = ref.datasetType.storageClass
371 writeStorageClass = storedFileInfo.storageClass
373 # Check that the supplied parameters are suitable for the type read
374 readStorageClass.validateParameters(parameters)
376 # Is this a component request?
377 component = ref.datasetType.component()
379 formatter = getInstanceOf(storedFileInfo.formatter,
380 FileDescriptor(location, readStorageClass=readStorageClass,
381 storageClass=writeStorageClass, parameters=parameters),
382 ref.dataId)
383 formatterParams, assemblerParams = formatter.segregateParameters()
385 return DatastoreFileGetInformation(location, formatter, storedFileInfo,
386 assemblerParams, component, readStorageClass)
388 def _prepare_for_put(self, inMemoryDataset, ref):
389 """Check the arguments for ``put`` and obtain formatter and
390 location.
392 Parameters
393 ----------
394 inMemoryDataset : `object`
395 The Dataset to store.
396 ref : `DatasetRef`
397 Reference to the associated Dataset.
399 Returns
400 -------
401 location : `Location`
402 The location to write the dataset.
403 formatter : `Formatter`
404 The `Formatter` to use to write the dataset.
406 Raises
407 ------
408 TypeError
409 Supplied object and storage class are inconsistent.
410 DatasetTypeNotSupportedError
411 The associated `DatasetType` is not handled by this datastore.
412 """
413 self._validate_put_parameters(inMemoryDataset, ref)
415 # Work out output file name
416 try:
417 template = self.templates.getTemplate(ref)
418 except KeyError as e:
419 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e
421 location = self.locationFactory.fromPath(template.format(ref))
423 # Get the formatter based on the storage class
424 storageClass = ref.datasetType.storageClass
425 try:
426 formatter = self.formatterFactory.getFormatter(ref,
427 FileDescriptor(location,
428 storageClass=storageClass),
429 ref.dataId)
430 except KeyError as e:
431 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e
433 return location, formatter
435 @abstractmethod
436 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
437 """Standardize the path of a to-be-ingested file.
439 Parameters
440 ----------
441 path : `str`
442 Path of a file to be ingested.
443 transfer : `str`, optional
444 How (and whether) the dataset should be added to the datastore.
445 If `None` (default), the file must already be in a location
446 appropriate for the datastore (e.g. within its root directory),
447 and will not be moved. Other choices include "move", "copy",
448 "symlink", and "hardlink". This is provided only so
449 `NotImplementedError` can be raised if the mode is not supported;
450 actual transfers are deferred to `_extractIngestInfo`.
452 Returns
453 -------
454 path : `str`
455 New path in what the datastore considers standard form.
457 Notes
458 -----
459 Subclasses of `FileLikeDatastore` should implement this method instead
460 of `_prepIngest`. It should not modify the data repository or given
461 file in any way.
463 Raises
464 ------
465 NotImplementedError
466 Raised if the datastore does not support the given transfer mode
467 (including the case where ingest is not supported at all).
468 FileNotFoundError
469 Raised if one of the given files does not exist.
470 """
471 raise NotImplementedError("Must be implemented by subclasses.")
473 @abstractmethod
474 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
475 transfer: Optional[str] = None) -> StoredFileInfo:
476 """Relocate (if necessary) and extract `StoredFileInfo` from a
477 to-be-ingested file.
479 Parameters
480 ----------
481 path : `str`
482 Path of a file to be ingested.
483 ref : `DatasetRef`
484 Reference for the dataset being ingested. Guaranteed to have
485 ``dataset_id is not None``.
486 formatter : `type`
487 `Formatter` subclass to use for this dataset.
488 transfer : `str`, optional
489 How (and whether) the dataset should be added to the datastore.
490 If `None` (default), the file must already be in a location
491 appropriate for the datastore (e.g. within its root directory),
492 and will not be modified. Other choices include "move", "copy",
493 "symlink", and "hardlink".
495 Returns
496 -------
497 info : `StoredFileInfo`
498 Internal datastore record for this file. This will be inserted by
499 the caller; `_extractIngestInfo` is only responsible for
500 creating and populating the struct.
502 Raises
503 ------
504 FileNotFoundError
505 Raised if one of the given files does not exist.
506 FileExistsError
507 Raised if transfer is not `None` but the (internal) location the
508 file would be moved to is already occupied.
509 """
510 raise NotImplementedError("Must be implemented by subclasses.")
512 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
513 # Docstring inherited from Datastore._prepIngest.
514 filtered = []
515 for dataset in datasets:
516 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
517 if not acceptable:
518 continue
519 else:
520 dataset.refs = acceptable
521 if dataset.formatter is None:
522 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
523 else:
524 dataset.formatter = getClassOf(dataset.formatter)
525 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
526 filtered.append(dataset)
527 return _IngestPrepData(filtered)
529 @transactional
530 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
531 # Docstring inherited from Datastore._finishIngest.
532 refsAndInfos = []
533 for dataset in prepData.datasets:
534 # Do ingest as if the first dataset ref is associated with the file
535 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
536 transfer=transfer)
537 refsAndInfos.extend([(ref, info) for ref in dataset.refs])
538 self._register_datasets(refsAndInfos)
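    # A minimal usage sketch of the ingest flow (hypothetical path and ref;
    # the public entry point ``Datastore.ingest`` drives ``_prepIngest`` and
    # ``_finishIngest``):
    #
    #     >>> dataset = FileDataset(path="raw/raw_0001.fits", refs=[ref])
    #     >>> datastore.ingest(dataset, transfer="symlink")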
540 def getUri(self, ref, predict=False):
541 """URI to the Dataset.
543 Parameters
544 ----------
545 ref : `DatasetRef`
546 Reference to the required Dataset.
547 predict : `bool`
548 If `True`, allow URIs to be returned for datasets that have not
549 been written.
551 Returns
552 -------
553 uri : `str`
554 URI string pointing to the Dataset within the datastore. If the
555 Dataset does not exist in the datastore, and if ``predict`` is
556 `True`, the URI will be a prediction and will include a URI
557 fragment "#predicted".
558 If the datastore does not have entities that relate well
559 to the concept of a URI, the returned URI string will be
560 descriptive. The returned URI is not guaranteed to be obtainable.
562 Raises
563 ------
564 FileNotFoundError
565 A URI has been requested for a dataset that does not exist and
566 guessing is not allowed.
568 Notes
569 -----
570 When a predicted URI is requested an attempt will be made to form
571 a reasonable URI based on file templates and the expected formatter.
572 """
573 # if this has never been written then we have to guess
574 if not self.exists(ref):
575 if not predict:
576 raise FileNotFoundError("Dataset {} not in this datastore".format(ref))
578 template = self.templates.getTemplate(ref)
579 location = self.locationFactory.fromPath(template.format(ref))
580 storageClass = ref.datasetType.storageClass
581 formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
582 storageClass=storageClass))
583 # Try to use the extension attribute but ignore problems if the
584 # formatter does not define one.
585 try:
586 location = formatter.makeUpdatedLocation(location)
587 except Exception:
588 # Use the default extension
589 pass
591 # Add a URI fragment to indicate this is a guess
592 return location.uri + "#predicted"
594 # If this is a ref that we have written we can get the path.
595 # Get file metadata and internal metadata
596 storedFileInfo = self.getStoredItemInfo(ref)
598 # Use the path to determine the location
599 location = self.locationFactory.fromPath(storedFileInfo.path)
601 return location.uri
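    # For example (hypothetical ref and repository layout), a dataset that has
    # not been written yet can still yield a predicted URI:
    #
    #     >>> datastore.getUri(ref, predict=True)   # e.g. '.../raw_0001.fits#predicted'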
603 def validateConfiguration(self, entities, logFailures=False):
604 """Validate some of the configuration for this datastore.
606 Parameters
607 ----------
608 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
609 Entities to test against this configuration. Can be differing
610 types.
611 logFailures : `bool`, optional
612 If `True`, output a log message for every validation error
613 detected.
615 Raises
616 ------
617 DatastoreValidationError
618 Raised if there is a validation problem with a configuration.
619 All the problems are reported in a single exception.
621 Notes
622 -----
623 This method checks that all the supplied entities have valid file
624 templates and also have formatters defined.
625 """
627 templateFailed = None
628 try:
629 self.templates.validateTemplates(entities, logFailures=logFailures)
630 except FileTemplateValidationError as e:
631 templateFailed = str(e)
633 formatterFailed = []
634 for entity in entities:
635 try:
636 self.formatterFactory.getFormatterClass(entity)
637 except KeyError as e:
638 formatterFailed.append(str(e))
639 if logFailures: 639 ↛ 634 (line 639 didn't jump to line 634, because the condition on line 639 was never false)
640 log.fatal("Formatter failure: %s", e)
642 if templateFailed or formatterFailed:
643 messages = []
644 if templateFailed: 644 ↛ 645 (line 644 didn't jump to line 645, because the condition on line 644 was never true)
645 messages.append(templateFailed)
646 if formatterFailed: 646 ↛ 648 (line 646 didn't jump to line 648, because the condition on line 646 was never false)
647 messages.append(",".join(formatterFailed))
648 msg = ";\n".join(messages)
649 raise DatastoreValidationError(msg)
651 def getLookupKeys(self):
652 # Docstring is inherited from base class
653 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
654 self.constraints.getLookupKeys()
656 def validateKey(self, lookupKey, entity):
657 # Docstring is inherited from base class
658 # The key can be valid in either formatters or templates so we can
659 # only check the template if it exists
660 if lookupKey in self.templates:
661 try:
662 self.templates[lookupKey].validateTemplate(entity)
663 except FileTemplateValidationError as e:
664 raise DatastoreValidationError(e) from e
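# A minimal sketch of what a concrete subclass must provide (illustrative
# only; real implementations such as PosixDatastore also implement the
# remaining abstract ``Datastore`` methods like ``get``, ``put`` and
# ``exists``, and handle transfer modes and checksums):
#
#     class MyFileDatastore(FileLikeDatastore):
#         defaultConfigFile = "datastores/myFileDatastore.yaml"  # hypothetical
#
#         def _standardizeIngestPath(self, path, *, transfer=None):
#             if transfer not in (None, "move", "copy", "symlink", "hardlink"):
#                 raise NotImplementedError(f"Transfer mode {transfer} is not supported.")
#             return path
#
#         def _extractIngestInfo(self, path, ref, *, formatter, transfer=None):
#             # Relocate the file here if ``transfer`` requires it, then
#             # describe it for the datastore records table.
#             return StoredFileInfo(formatter=formatter, path=path,
#                                   storageClass=ref.datasetType.storageClass,
#                                   checksum=None, file_size=None)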