Coverage for python/lsst/daf/butler/core/repoTransfers.py : 22%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterable, Optional, IO, List, Mapping, Tuple, Callable, Union, Type
from collections import defaultdict

import yaml

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import NamedValueSet, iterable

if TYPE_CHECKING:
    from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatters import Formatter


class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"
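

# The configuration section wrapped by RepoTransferFormatConfig maps a file
# format name to the export/import backend classes used for that format.  A
# plausible sketch of such a section is shown below; the exact key layout is
# an assumption for illustration, not a copy of repo_transfer_formats.yaml:
#
#     repo_transfer_formats:
#       yaml:
#         export: lsst.daf.butler.core.repoTransfers.YamlRepoExportBackend
#         import: lsst.daf.butler.core.repoTransfers.YamlRepoImportBackend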


@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports).  Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: Union[None, str, Type[Formatter]]
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: Union[None, str, Type[Formatter]] = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter
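

# Example (illustrative, not part of the module): constructing a FileDataset
# by hand.  ``ref`` stands for an existing DatasetRef, and both the path and
# the formatter name are hypothetical:
#
#     dataset = FileDataset(
#         "exports/raw_12345.fits",           # hypothetical file path
#         ref,                                # a single DatasetRef is wrapped into a one-element list
#         formatter="mypackage.MyFormatter",  # fully-qualified Formatter name (or a Formatter class)
#     )
#     assert dataset.refs == [ref]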


class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None):
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported.  If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None):
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export.  Their `DatasetRef.id`
            attributes must not be `None`.  Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`.  This is typically used to rewrite the
            path generated by the datastore.  If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name.  Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces often used to generate the refs passed
            # here don't always remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # TODO: we need to call getDataset here because most ways of
            # obtaining a DatasetRef (including queryDatasets) don't populate
            # the run attribute.  We should address that upstream in the
            # future.
            ref = self._registry.getDataset(ref.id, dataId=ref.dataId, datasetType=ref.datasetType)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)

    def _finish(self):
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()
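

# Example (a sketch, not part of the module): typical use of RepoExport via
# ``Butler.export``.  It assumes an existing ``butler``, that the refs come
# from ``Registry.queryDatasets``, and that the directory, dataset type name
# "raw", and collection "raw/hsc" are purely hypothetical:
#
#     def addPrefix(dataset: FileDataset) -> FileDataset:
#         # Adjust the path recorded for this dataset in the export file.
#         dataset.path = "external/" + dataset.path
#         return dataset
#
#     with butler.export(directory="exports", filename="export.yaml", transfer="copy") as export:
#         refs = butler.registry.queryDatasets("raw", collections=["raw/hsc"])
#         export.saveDatasets(refs, rewrite=addPrefix)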


class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()):
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported.  `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        collections : iterable of `str`
            Extra collections (in addition to ``run``) the datasets
            should be associated with.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self):
        """Complete the export process.
        """
        raise NotImplementedError()
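

# Example (illustrative only): a minimal concrete backend that just prints a
# summary of everything it is handed, useful for seeing the call pattern that
# `RepoExport` drives.  It is not part of this module; the YAML backend below
# is the real implementation.
#
#     class PrintingExportBackend(RepoExportBackend):
#
#         def saveDimensionData(self, element, *data):
#             print(f"dimension {element.name}: {len(data)} record(s)")
#
#         def saveDatasets(self, datasetType, run, *datasets, collections=()):
#             print(f"dataset type {datasetType.name}, run {run}: {len(datasets)} file(s)")
#
#         def finish(self):
#             print("export complete")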


class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
    """

    @abstractmethod
    def register(self):
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Datastore, *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        """Import information associated with the backend into the registry
        it was constructed with and the given datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into.
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()
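

# Example (a sketch, not part of the module): the calling sequence described
# above, using the YAML backend defined later in this file.  ``registry`` and
# ``datastore`` are assumed to exist already; the export directory is
# hypothetical, and wrapping `load` in ``registry.transaction()`` is an
# assumption about the calling code, not a requirement imposed here.
#
#     with open("export.yaml") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#     backend.register()     # runs and dataset types; cannot be transactional
#     with registry.transaction():
#         backend.load(datastore, directory="exports", transfer="symlink")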


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": [d.toDict() for d in data],  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset):
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self):
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
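

# For reference, `finish` writes a YAML document shaped like the following
# (the concrete element, dataset type, run, ID, and path values here are
# illustrative only):
#
#     description: Butler Data Repository Export
#     version: 0
#     data:
#       - type: dimension
#         element: detector
#         records: [...]
#       - type: dataset_type
#         name: raw
#         dimensions: [instrument, detector, exposure]
#         storage_class: Exposure
#       - type: run
#         name: raw/hsc
#       - type: dataset
#         dataset_type: raw
#         run: raw/hsc
#         records:
#           - dataset_id: [42]
#             data_id: [{instrument: HSC, detector: 0, exposure: 12345}]
#             path: exports/raw_12345.fits
#             formatter: mypackage.MyFormatter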


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into.  Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                element = self.registry.dimensions[data["element"]]
                self.dimensions[element].extend(element.RecordClass.fromDict(r) for r in data["records"])
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d["path"],
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d["formatter"])
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )

    def register(self):
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Datastore, *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        # Docstring inherited from RepoImportBackend.load.
        for element, records in self.dimensions.items():
            self.registry.insertDimensionData(element, *records)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
                recursive=True
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)