Coverage for python/lsst/daf/butler/core/repoTransfers.py : 20%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import TYPE_CHECKING, Iterable, Optional, IO, List, Mapping, Tuple, Callable, Union
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import NamedValueSet, iterable

if TYPE_CHECKING:
    from .dimensions import DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatter import FormatterParameter


class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"
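
# Example (editorial sketch, not part of the original module): the
# ``repo_transfer_formats`` configuration section read by the class above is
# assumed to map a file format name to its export/import backend classes,
# roughly like:
#
#     repo_transfer_formats:
#       yaml:
#         export: lsst.daf.butler.core.repoTransfers.YamlRepoExportBackend
#         import: lsst.daf.butler.core.repoTransfers.YamlRepoImportBackend
#
# The exact key names are an assumption; repo_transfer_formats.yaml in the
# package defaults is the authoritative layout.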


@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports). Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: FormatterParameter
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: FormatterParameter = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter
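
# Example (editorial sketch, not part of the original module): constructing a
# `FileDataset` for a single ref. The dataset type, data ID, and formatter
# name used here are hypothetical; a single `DatasetRef` is normalized to a
# one-element list, so ``refs`` is always a `list` afterwards.
#
#     fds = FileDataset(
#         path="flat/flat_042.fits",
#         refs=DatasetRef(flatType, {"instrument": "HSC", "detector": 0}),
#         formatter="lsst.obs.base.formatters.FitsExposureFormatter",
#     )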


class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None):
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())
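
    # Example (editorial sketch, not part of the original module): exporting
    # only the records for selected dimension elements. The data IDs must
    # already be fully expanded; the element name used here is hypothetical.
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDataIds(
    #             dataIds,
    #             elements=[butler.registry.dimensions["exposure"]],
    #         )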

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None):
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name. Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces that are often used to generate the refs
            # passed here don't remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            datasets[ref.datasetType, ref.run].extend(exports)
            self._dataset_ids.add(ref.id)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)
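
    # Example (editorial sketch, not part of the original module): using the
    # ``rewrite`` hook to relocate exported paths under a new prefix; the
    # prefix and the `refs` value are hypothetical.
    #
    #     def _relocate(dataset: FileDataset) -> FileDataset:
    #         dataset.path = os.path.join("backup", dataset.path)
    #         return dataset
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDatasets(refs, rewrite=_relocate)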

    def _finish(self):
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()


class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset,
                     collections: Iterable[str] = ()):
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported. `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        collections : iterable of `str`
            Extra collections (in addition to ``run``) the dataset
            should be associated with.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self):
        """Complete the export process.
        """
        raise NotImplementedError()


class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
    """

    @abstractmethod
    def register(self):
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        """Import information associated with the backend into the given
        registry and datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into. If `None`, datasets will only be
            inserted into the `Registry` (primarily intended for tests).
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()
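
# Example (editorial sketch, not part of the original module): the intended
# calling pattern for a concrete import backend, with `register` outside and
# `load` inside a transaction. The `registry.transaction()` context manager
# and the paths shown are assumptions for illustration.
#
#     backend.register()
#     with registry.transaction():
#         backend.load(datastore, directory="exports", transfer="symlink")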


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord):
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        # Convert astropy time in TAI to datetime in UTC for YAML.
        data_dicts = []
        for record in data:
            rec_dict = record.toDict()
            for key in rec_dict:
                if isinstance(rec_dict[key], astropy.time.Time):
                    rec_dict[key] = rec_dict[key].utc.to_datetime()
            data_dicts.append(rec_dict)
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,  # TODO: encode regions
        })
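
    # Example (editorial sketch, not part of the original module): the
    # conversion above turns a TAI `astropy.time.Time` into a naive UTC
    # `datetime`, which YAML can serialize natively. With the current 37 s
    # TAI-UTC offset:
    #
    #     t = astropy.time.Time("2020-01-01T00:00:00", scale="tai")
    #     t.utc.to_datetime()  # datetime.datetime(2019, 12, 31, 23, 59, 23)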

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset):
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })
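
    # Example (editorial sketch, not part of the original module): for a
    # single exported dataset the entries appended above serialize roughly as
    # the following YAML (dataset type, run, and data ID values are
    # hypothetical):
    #
    #     - type: dataset_type
    #       name: flat
    #       dimensions: [instrument, detector, physical_filter]
    #       storage_class: ExposureF
    #     - type: run
    #       name: calib/flats
    #     - type: dataset
    #       dataset_type: flat
    #       run: calib/flats
    #       records:
    #       - dataset_id: [42]
    #         data_id: [{instrument: HSC, detector: 0, physical_filter: HSC-R}]
    #         path: flat/flat_042.fits
    #         formatter: lsst.obs.base.formatters.FitsExposureFormatter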

    def finish(self):
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
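
# Example (editorial sketch, not part of the original module): driving the
# export backend directly with an in-memory stream instead of going through
# `Butler.export`. Here `element`, `records`, `datasetType`, `run`, and
# `datasets` stand for hypothetical values of the documented types.
#
#     import io
#
#     buffer = io.StringIO()
#     backend = YamlRepoExportBackend(buffer)
#     backend.saveDimensionData(element, *records)
#     backend.saveDatasets(datasetType, run, *datasets)
#     backend.finish()
#     print(buffer.getvalue())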


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather
        # not run it at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy time (UTC).
                for record in data["records"]:
                    for key in record:
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                self.dimensions[element].extend(element.RecordClass.fromDict(r) for r in data["records"])
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )

    def register(self):
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None):
        # Docstring inherited from RepoImportBackend.load.
        for element, records in self.dimensions.items():
            self.registry.insertDimensionData(element, *records)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
                recursive=True
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)
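
# Example (editorial sketch, not part of the original module): reading an
# export file produced by `YamlRepoExportBackend` back into a repository.
# "export.yaml" and "exports" are hypothetical paths, and `butler` is an
# existing `Butler` with a registry and datastore.
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, butler.registry)
#     backend.register()
#     backend.load(butler.datastore, directory="exports", transfer="copy")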