Coverage for python/lsst/daf/butler/core/repoTransfers.py : 20%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import (
    TYPE_CHECKING,
    Callable,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import NamedValueSet, iterable

if TYPE_CHECKING:
    from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatter import FormatterParameter


class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"


@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
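
    A minimal construction sketch; ``ref`` stands in for any existing
    `DatasetRef` and the path shown is purely illustrative::

        dataset = FileDataset("exposures/exposure_001.fits", ref)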
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset. (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports). Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: Optional[FormatterParameter]
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: Optional[FormatterParameter] = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter


class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids: Set[int] = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
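
        A minimal sketch, where ``dataIds`` stands for any iterable of
        fully-expanded data IDs previously obtained from the registry::

            with butler.export(filename="export.yaml") as export:
                export.saveDataIds(dataIds)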
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name. Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
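
        A sketch of a ``rewrite`` callable that prefixes every exported path;
        ``refs`` and the ``"subdir"`` prefix are purely illustrative::

            def prefix_path(dataset: FileDataset) -> FileDataset:
                dataset.path = os.path.join("subdir", dataset.path)
                return dataset

            with butler.export(filename="export.yaml") as export:
                export.saveDatasets(refs, rewrite=prefix_path)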
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces that are often used to generate the refs
            # passed here don't always remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()


class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()) -> None:
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported. `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        collections : iterable of `str`
            Extra collections (in addition to ``run``) the dataset
            should be associated with.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self) -> None:
        """Complete the export process.
        """
        raise NotImplementedError()


class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
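
    The expected calling sequence, sketched with placeholder names
    (``backend``, ``datastore``, and the keyword values shown are
    illustrative only)::

        backend.register()  # cannot run inside a transaction
        backend.load(datastore, directory="exports", transfer="copy")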
    """

    @abstractmethod
    def register(self) -> None:
        """Register all runs and dataset types associated with the backend with
        the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None) -> None:
        """Import information associated with the backend into the given
        registry and datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into. If `None`, datasets will only be
            inserted into the `Registry` (primarily intended for tests).
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
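
    A minimal usage sketch; the filename and the ``element``, ``records``,
    ``datasetType``, ``run``, and ``fileDatasets`` names are placeholders::

        with open("export.yaml", "w") as stream:
            backend = YamlRepoExportBackend(stream)
            backend.saveDimensionData(element, *records)
            backend.saveDatasets(datasetType, run, *fileDatasets)
            backend.finish()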
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        # Convert astropy time in TAI to datetime in UTC for YAML
        data_dicts = []
        for record in data:
            rec_dict = record.toDict()
            for key in rec_dict:
                if isinstance(rec_dict[key], astropy.time.Time):
                    rec_dict[key] = rec_dict[key].utc.to_datetime()
            data_dicts += [rec_dict]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
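
    A minimal usage sketch; ``registry``, ``datastore``, and the filename are
    placeholders, and the file is assumed to have been written by
    `YamlRepoExportBackend`::

        with open("export.yaml", "r") as stream:
            backend = YamlRepoImportBackend(stream, registry)
        backend.register()
        backend.load(datastore, transfer="copy")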
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                self.dimensions[element].extend(element.RecordClass.fromDict(r) for r in data["records"])
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, records in self.dimensions.items():
            self.registry.insertDimensionData(element, *records)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
                recursive=True
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)