# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["FileDataset", "RepoExport",
           "RepoExportBackend", "RepoImportBackend", "RepoTransferFormatConfig",
           "YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import (
    Any,
    Callable,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Type,
    TYPE_CHECKING,
    Union,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from .config import ConfigSubset
from .datasets import DatasetType, DatasetRef
from .utils import iterable
from .named import NamedValueSet

if TYPE_CHECKING:
    from .dimensions import DataCoordinate, DimensionElement, DimensionRecord, ExpandedDataCoordinate
    from ..registry import Registry
    from .datastore import Datastore
    from .formatter import FormatterParameter


class RepoTransferFormatConfig(ConfigSubset):
    """The section of butler configuration that associates repo import/export
    backends with file formats.
    """
    component = "repo_transfer_formats"
    defaultConfigFile = "repo_transfer_formats.yaml"


@dataclass
class FileDataset:
    """A struct that represents a dataset exported to a file.
    """
    __slots__ = ("refs", "path", "formatter")

    refs: List[DatasetRef]
    """Registry information about the dataset (`list` of `DatasetRef`).
    """

    path: str
    """Path to the dataset (`str`).

    If the dataset was exported with ``transfer=None`` (i.e. in-place),
    this is relative to the datastore root (only datastores that have a
    well-defined root in the local filesystem can be expected to support
    in-place exports). Otherwise this is relative to the directory passed
    to `Datastore.export`.
    """

    formatter: Optional[FormatterParameter]
    """A `Formatter` class or fully-qualified name.
    """

    def __init__(self, path: str, refs: Union[DatasetRef, List[DatasetRef]], *,
                 formatter: Optional[FormatterParameter] = None):
        self.path = path
        if isinstance(refs, DatasetRef):
            refs = [refs]
        self.refs = refs
        self.formatter = formatter
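

# Illustrative sketch (not part of the original module): constructing a
# `FileDataset` for a single, already-resolved `DatasetRef`.  The file path
# and the fully-qualified formatter name below are hypothetical placeholders,
# not values defined anywhere in this module.
def _exampleFileDataset(ref: DatasetRef) -> FileDataset:
    """Wrap one dataset reference and its exported file path (illustrative)."""
    return FileDataset(
        path="exports/calexp_v1.fits",  # hypothetical exported file path
        refs=ref,  # a single ref is normalized to a one-element list
        formatter="lsst.obs.base.formatters.fitsExposure.FitsExposureFormatter",  # hypothetical name
    )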


class RepoExport:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._dataset_ids: Set[int] = set()

    def saveDataIds(self, dataIds: Iterable[ExpandedDataCoordinate], *,
                    elements: Optional[Iterable[DimensionElement]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `ExpandedDataCoordinate`
            Fully-expanded data IDs to export.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.elements
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(elements)
        records: MutableMapping[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        for dataId in dataIds:
            for record in dataId.records.values():
                if record is not None and record.definition in elements:
                    records[record.definition].setdefault(record.dataId, record)
        for element in self._registry.dimensions.sorted(records.keys()):
            self._backend.saveDimensionData(element, *records[element].values())

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[DimensionElement]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `Run`, and dimension
        records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must be `ExpandedDataCoordinate`
            instances.
        elements : iterable of `DimensionElement`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with the collection that
        matches their run name. Other collections will be included in the
        export in the future (once `Registry` provides a way to look up that
        information).
        """
        dataIds = set()
        datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
        for ref in refs:
            # The query interfaces that are often used to generate the refs
            # passed here don't generally remove duplicates, so do that here
            # for convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(self._registry.expandDataId(ref.dataId))
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            self._dataset_ids.add(ref.getCheckedId())
            assert ref.run is not None
            datasets[ref.datasetType, ref.run].extend(exports)
        self.saveDataIds(dataIds, elements=elements)
        for (datasetType, run), records in datasets.items():
            self._backend.saveDatasets(datasetType, run, *records)

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        self._backend.finish()
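

# Illustrative sketch (not part of the original module): a ``rewrite``
# callable suitable for `RepoExport.saveDatasets`, relocating each exported
# file under a "raw" subdirectory.  The function name and the prefix are
# hypothetical; only the `FileDataset` attributes used come from this module.
# Usage would look like ``export.saveDatasets(refs, rewrite=_exampleRewriteToRaw)``.
def _exampleRewriteToRaw(dataset: FileDataset) -> FileDataset:
    """Prefix the exported path with ``raw/`` (illustrative only)."""
    dataset.path = os.path.join("raw", dataset.path)
    return dataset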


class RepoExportBackend(ABC):
    """An abstract interface for data repository export implementations.
    """

    @abstractmethod
    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        """Export one or more dimension element records.

        Parameters
        ----------
        element : `DimensionElement`
            The `DimensionElement` whose records are being exported.
        data : `DimensionRecord` (variadic)
            One or more records to export.
        """
        raise NotImplementedError()

    @abstractmethod
    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        """Export one or more datasets, including their associated DatasetType
        and run information (but not including associated dimension
        information).

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of all datasets being exported with this call.
        run : `str`
            Run associated with all datasets being exported with this call.
        datasets : `FileDataset`, variadic
            Per-dataset information to be exported. `FileDataset.formatter`
            attributes should be strings, not `Formatter` instances or classes.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self) -> None:
        """Complete the export process.
        """
        raise NotImplementedError()


class RepoImportBackend(ABC):
    """An abstract interface for data repository import implementations.

    Import backends are expected to be constructed with a description of
    the objects that need to be imported (from, e.g., a file written by the
    corresponding export backend), along with a `Registry`.
    """

    @abstractmethod
    def register(self) -> None:
        """Register all runs and dataset types associated with the backend
        with the `Registry` the backend was constructed with.

        These operations cannot be performed inside transactions, unlike those
        performed by `load`, and must in general be performed before `load`.
        """

    @abstractmethod
    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None) -> None:
        """Import information associated with the backend into the given
        registry and datastore.

        This must be run after `register`, and may be performed inside a
        transaction.

        Parameters
        ----------
        datastore : `Datastore`
            Datastore to import into. If `None`, datasets will only be
            inserted into the `Registry` (primarily intended for tests).
        directory : `str`, optional
            Directory all dataset paths are relative to.
        transfer : `str`, optional
            Transfer mode forwarded to `Datastore.ingest`.
        """
        raise NotImplementedError()
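

# Illustrative sketch (not part of the original module): the calling sequence
# expected by `RepoImportBackend`.  Wrapping `load` in a `Registry.transaction`
# context is an assumption about typical caller behaviour, not something this
# module itself requires; the function name is hypothetical.
def _exampleRunImport(backend: RepoImportBackend, registry: Registry,
                      datastore: Optional[Datastore] = None, *,
                      directory: Optional[str] = None,
                      transfer: Optional[str] = None) -> None:
    """Run an import backend in the required register-then-load order."""
    backend.register()  # registrations cannot be performed inside a transaction
    with registry.transaction():  # the load step may be rolled back as a unit
        backend.load(datastore, directory=directory, transfer=transfer)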


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict() for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,  # TODO: encode regions
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
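

# For reference, the YAML document written by YamlRepoExportBackend.finish
# therefore has the following overall shape.  This is a hand-written sketch;
# the element, dataset type, run, data ID, and path values are illustrative
# only and do not come from a real repository.
#
#     description: Butler Data Repository Export
#     version: 0
#     data:
#       - type: dimension
#         element: instrument
#         records: [...]
#       - type: dataset_type
#         name: raw
#         dimensions: [instrument, detector, exposure]
#         storage_class: Exposure
#       - type: run
#         name: some/run
#       - type: dataset
#         dataset_type: raw
#         run: some/run
#         records:
#           - dataset_id: [1]
#             data_id: [{instrument: TestCam, detector: 0, exposure: 42}]
#             path: raw/raw_TestCam_0_42.fits
#             formatter: null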


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run it at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy Time.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass.fromDict(r) for r in data["records"]
                )
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); the innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            self.registry.insertDimensionData(element, *dimensionRecords)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds: List[DataCoordinate] = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)


class _AstropyTimeToYAML:
    """Handle conversion of astropy Time to/from YAML representation.

    This class defines methods that convert astropy Time instances to or from
    YAML representation. On output it converts a Time to an ISO-format string
    in TAI scale with maximum precision, marked with a special YAML tag. On
    input it performs the inverse transformation. The methods need to be
    registered with YAML dumper and loader classes.

    Notes
    -----
    The Python ``yaml`` module defines a helper base class ``YAMLObject``
    that provides similar functionality, but its use is complicated by the
    need to convert ``Time`` instances to instances of a ``YAMLObject``
    subclass before saving them to YAML. This class avoids that intermediate
    step but requires a separate registration step.
    """

    yaml_tag = "!butler_time/tai/iso"  # YAML tag name for Time class

    @classmethod
    def to_yaml(cls, dumper: yaml.Dumper, data: astropy.time.Time) -> Any:
        """Convert an astropy Time object into YAML format.

        Parameters
        ----------
        dumper : `yaml.Dumper`
            YAML dumper instance.
        data : `astropy.time.Time`
            Data to be converted.
        """
        if data is not None:
            # We store time in ISO format, but we need full nanosecond
            # precision, so we have to construct an intermediate instance to
            # make sure its precision is set correctly.
            data = astropy.time.Time(data.tai, precision=9)
            data = data.to_value("iso")
        return dumper.represent_scalar(cls.yaml_tag, data)

    @classmethod
    def from_yaml(cls, loader: yaml.SafeLoader, node: yaml.ScalarNode) -> astropy.time.Time:
        """Convert a YAML node into an astropy Time.

        Parameters
        ----------
        loader : `yaml.SafeLoader`
            Instance of YAML loader class.
        node : `yaml.ScalarNode`
            YAML node.

        Returns
        -------
        time : `astropy.time.Time`
            Time instance, can be ``None``.
        """
        if node.value is not None:
            return astropy.time.Time(node.value, format="iso", scale="tai")


# Register Time -> YAML conversion method with Dumper class
yaml.Dumper.add_representer(astropy.time.Time, _AstropyTimeToYAML.to_yaml)

# Register YAML -> Time conversion method with Loader, for our use case we
# only need SafeLoader.
yaml.SafeLoader.add_constructor(_AstropyTimeToYAML.yaml_tag, _AstropyTimeToYAML.from_yaml)
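

# Illustrative sketch (not part of the original module): round-tripping an
# astropy Time value through the representer and constructor registered above.
# The function name and the timestamp are hypothetical.
def _exampleTimeRoundTrip() -> astropy.time.Time:
    """Dump a Time to YAML and read it back (illustrative only)."""
    original = astropy.time.Time("2020-01-01T00:00:00", format="isot", scale="tai")
    text = yaml.dump({"timestamp": original})  # yaml.dump uses yaml.Dumper by default
    loaded = yaml.safe_load(text)  # SafeLoader understands the custom tag
    return loaded["timestamp"]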