Coverage for python/lsst/daf/butler/core/repoTransfers.py : 27%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig", "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from collections import defaultdict
from dataclasses import dataclass
from typing import Callable, Iterable, List, Mapping, Optional, Tuple

import yaml

from .config import ConfigSubset
from .datasets import DatasetRef, DatasetType
from .datastore import Datastore
from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
from .registry import Registry
from .run import Run
from .utils import doImport
"""The section of butler configuration that associates repo import/export backends with file formats. """


@dataclass
class DatasetExport:
    """A struct that represents a dataset exported to a file.
    """

    ref: DatasetRef
    """Registry information about the dataset (`DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place), this
    is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports).  Otherwise this is relative to the directory passed to
    `Datastore.export`.
    """

    formatter: str
    """Fully-qualified name of the `Formatter` class that should be used to
    read this dataset (`str`).
    """
"""Public interface for exporting a subset of a data repository.
Instances of this class are obtained by calling `Butler.export` as the value returned by that context manager::
with butler.export(filename="export.yaml") as export: export.saveDataIds(...) export.saveDatasts(...)
Parameters ---------- registry : `Registry` Registry to export from. datastore : `Datastore` Datastore to export from. backend : `RepoExportBackend` Implementation class for a particular export file format. directory : `str`, optional Directory to pass to `Datastore.export`. transfer : `str`, optional Transfer mdoe to pass to `Datastore.export`. """

    def __init__(self, registry: Registry, datastore: Datastore, backend: "RepoExportBackend", *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None):
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported.  If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())
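
    # A usage sketch for `saveDataIds` (not part of the original API docs):
    # the ``butler`` object, the output filename, and the ``instrument`` /
    # ``detector`` values are hypothetical, and it assumes
    # `Registry.expandDataId` is used to obtain the fully-expanded data IDs
    # this method requires.
    #
    #     with butler.export(filename="export.yaml") as export:
    #         dataId = butler.registry.expandDataId(instrument="HSC", detector=50)
    #         export.saveDataIds([dataId])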

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[DatasetExport], DatasetExport]] = None):
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export.  Their `DatasetRef.id`
            attributes must not be `None`.  Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `DatasetExport` argument and
            returns a modified `DatasetExport`.  This is typically used to
            rewrite the path generated by the datastore.  If `None`, the
            `DatasetExport` returned by `Datastore.export` will be used
            directly (see the sketch after this method for an example).

        Notes
        -----
        At present, this only associates datasets with the collection that
        identifies their `Run`.  Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, Run], List[DatasetExport]] = defaultdict(list)
        for ref in refs:
            # The query interfaces that are often used to generate the refs
            # passed here often don't remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # TODO: we need to call getDataset here because most ways of
            # obtaining a DatasetRef (including queryDataset) don't populate
            # the run attribute.  We should address that upstream in the
            # future.
            ref = self._registry.getDataset(ref.id, dataId=ref.dataId, datasetType=ref.datasetType)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory,
                                             transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)
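
    # Example of a ``rewrite`` callable (a sketch; ``flatten`` is a
    # hypothetical helper, and it assumes `DatasetExport` instances are
    # mutable): strip directories from the datastore-generated paths so all
    # exported files sit alongside the export file.
    #
    #     def flatten(export: DatasetExport) -> DatasetExport:
    #         export.path = os.path.basename(export.path)
    #         return export
    #
    #     export.saveDatasets(refs, rewrite=flatten)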
"""Delegate to the backend to finish the export process.
For use by `Butler.export` only. """ self._backend.finish()
"""An abstract interface for data repository export implementations. """

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    def saveDatasets(self, datasetType: DatasetType, run: Run, *datasets: DatasetExport,
                     collections: Iterable[str] = ()):
        """Export one or more datasets, including their associated DatasetType
        and Run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `Run`
            Run associated with all datasets being exported with this call.
        datasets : `DatasetExport`, variadic
            Per-dataset information to be exported.
        collections : iterable of `str`
            Extra collections (in addition to `Run.collection`) the dataset
            should be associated with.
        """
        raise NotImplementedError()

    def finish(self):
        """Complete the export process.
        """
        raise NotImplementedError()
"""An abstract interface for data repository import implementations. """

    def load(self, registry: Registry, datastore: Datastore, *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        """Import all information associated with the backend into the given
        registry and datastore.

        Import backends are expected to be constructed with a description of
        the objects that need to be imported (from, e.g., a file written by
        the corresponding export backend).

        Parameters
        ----------
        registry : `Registry`
            Registry to import into.
        datastore : `Datastore`
            Datastore to import into.
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()
"""A repository export implementation that saves to a YAML file.
Parameters ---------- stream A writeable file-like object. """

    def __init__(self, stream):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": [d.toDict() for d in data],  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: Run, *datasets: DatasetExport,
                     collections: Iterable[str] = ()):
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "id": run.id,
            "start_time": run.startTime,
            "end_time": run.endTime,
            "host": run.host,
            "collection": run.collection,
            "pipeline": run.pipeline,
            "environment": run.environment,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run_id": run.id,
            "records": [
                {
                    "dataset_id": dataset.ref.id,
                    "data_id": dataset.ref.dataId.byName(),
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ],
        })

    def finish(self):
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
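
    # For illustration only: the YAML written by `finish` has roughly the
    # shape sketched below.  The element, dataset type, run, and path values
    # are hypothetical; the keys mirror the dictionaries appended by
    # `saveDimensionData` and `saveDatasets`.
    #
    #     description: Butler Data Repository Export
    #     version: 0
    #     data:
    #       - type: dimension
    #         element: detector
    #         records: [...]
    #       - type: dataset_type
    #         name: raw
    #         dimensions: [instrument, detector, exposure]
    #         storage_class: Exposure
    #       - type: run
    #         id: 1
    #         collection: raw/all
    #       - type: dataset
    #         dataset_type: raw
    #         run_id: 1
    #         records:
    #           - dataset_id: 42
    #             data_id: {instrument: HSC, detector: 50, exposure: 903334}
    #             path: raw/raw-HSC-50-903334.fits
    #             formatter: some.package.SomeFormatter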
"""A repository import implementation that reads from a YAML file.
Parameters ---------- stream A readable file-like object. """

    def __init__(self, stream):
        self.stream = stream

    def load(self, registry: Registry, datastore: Datastore, *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        # Docstring inherited from RepoImportBackend.load.
        wrapper = yaml.safe_load(self.stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        # Mapping from saved ID to inserted Run (which may have a different
        # ID).
        runs = {}
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                registry.insertDimensionData(data["element"], *data["records"])
            elif data["type"] == "run":
                run = Run(collection=data["collection"], environment=data["environment"],
                          pipeline=data["pipeline"], startTime=data["start_time"],
                          endTime=data["end_time"], host=data["host"])
                runs[data["id"]] = run
                registry.ensureRun(run)
            elif data["type"] == "dataset_type":
                registry.registerDatasetType(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"],
                                universe=registry.dimensions)
                )
            elif data["type"] == "dataset":
                datasetType = registry.getDatasetType(data["dataset_type"])
                run = runs[data["run_id"]]
                for dataset in data["records"]:
                    ref = registry.addDataset(datasetType, dataset["data_id"], run=run,
                                              recursive=True)
                    formatter = doImport(dataset["formatter"])
                    if directory is not None:
                        path = os.path.join(directory, dataset["path"])
                    else:
                        path = dataset["path"]
                    datastore.ingest(path, ref, transfer=transfer, formatter=formatter)
                    for collection in dataset.get("collections", []):
                        collections[collection].append(ref)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        for collection, refs in collections.items():
            registry.associate(collection, refs)
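
    # Usage sketch (hypothetical wiring, for illustration only; in practice
    # `Butler.export` drives the export backend through `RepoExport`, and a
    # corresponding Butler import entry point drives the import backend):
    #
    #     with open("export.yaml", "w") as stream:
    #         backend = YamlRepoExportBackend(stream)
    #         backend.saveDimensionData(element, *records)
    #         backend.saveDatasets(datasetType, run, *datasets)
    #         backend.finish()
    #
    #     with open("export.yaml", "r") as stream:
    #         YamlRepoImportBackend(stream).load(registry, datastore,
    #                                            directory="exports",
    #                                            transfer="symlink")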