Coverage for python/lsst/daf/butler/transfers/_context.py: 15%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from typing import (
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)
from collections import defaultdict

from ..core import (
    DataCoordinate,
    DimensionElement,
    DimensionRecord,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
)
from ..registry import Registry
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        # Dimension records to export, keyed by element and then by data ID.
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        # IDs of datasets already queued for export, used to skip duplicates
        # in `saveDatasets`.
        self._dataset_ids = set()
        # FileDataset exports grouped by dataset type and RUN collection name.
        self._datasets: Dict[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)

    def saveDimensionData(self, element: Union[str, DimensionElement],
                          records: Iterable[Union[dict, DimensionRecord]]) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)
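
    # A minimal usage sketch (hedged: ``export`` is the context object from
    # `Butler.export`, and ``detector_records`` is a hypothetical iterable of
    # `DimensionRecord` or `dict` instances obtained elsewhere)::
    #
    #     export.saveDimensionData("detector", detector_records)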

    def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
                    elements: Optional[Iterable[Union[str, DimensionElement]]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.getStaticElements()
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(
                self._registry.dimensions[element] if not isinstance(element, DimensionElement) else element
                for element in elements
            )
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a no-op
            # if the data ID is already expanded, and DM-26692 will add (or at
            # least start to add / unblock) query functionality that should
            # let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in elements:
                    self._records[record.definition].setdefault(record.dataId, record)
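
    # A usage sketch of the expanded-data-ID pattern recommended above
    # (hedged: ``butler``, ``export``, and the query arguments are
    # illustrative assumptions)::
    #
    #     dataIds = butler.registry.queryDataIds(
    #         ["exposure", "detector"], instrument="HSC"
    #     ).expanded()
    #     export.saveDataIds(dataIds)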

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
149 """Export one or more datasets.
151 This automatically exports any `DatasetType`, `~CollectionType.RUN`
152 collections, and dimension records associated with the datasets.
154 Parameters
155 ----------
156 refs : iterable of `DatasetRef`
157 References to the datasets to export. Their `DatasetRef.id`
158 attributes must not be `None`. Duplicates are automatically
159 ignored. Nested data IDs must have `DataCoordinate.hasRecords`
160 return `True`.
161 elements : iterable of `DimensionElement` or `str`, optional
162 Dimension elements whose records should be exported; this is
163 forwarded to `saveDataIds` when exporting the data IDs of the
164 given datasets.
165 rewrite : callable, optional
166 A callable that takes a single `FileDataset` argument and returns
167 a modified `FileDataset`. This is typically used to rewrite the
168 path generated by the datastore. If `None`, the `FileDataset`
169 returned by `Datastore.export` will be used directly.
171 Notes
172 -----
173 At present, this only associates datasets with `~CollectionType.RUN`
174 collections. Other collections will be included in the export in the
175 future (once `Registry` provides a way to look up that information).
176 """
        dataIds = set()
        for ref in refs:
            # The query interfaces often used to generate the refs passed
            # here do not necessarily remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            self._dataset_ids.add(ref.getCheckedId())
            assert ref.run is not None
            self._datasets[ref.datasetType, ref.run].extend(exports)
        self.saveDataIds(dataIds, elements=elements)
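
    # A sketch of the ``rewrite`` hook described above (hedged: the path
    # manipulation is illustrative only and assumes nothing beyond
    # `FileDataset.path` being reassignable)::
    #
    #     import os.path
    #
    #     def flatten_path(dataset: FileDataset) -> FileDataset:
    #         dataset.path = os.path.basename(dataset.path)
    #         return dataset
    #
    #     export.saveDatasets(refs, rewrite=flatten_path)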

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make export deterministic (DM-26324), the next step is to
            # implement a way to sort DataCoordinates, then transform the
            # second argument to:
            #     *[r[dataId] for dataId in sorted(r.keys())]
            # where
            #     r = self._records[element]
            # (continued below).
            self._backend.saveDimensionData(element, *self._records[element].values())
        # Then we need to either make DatasetType sortable directly or sort
        # the iteration below by its name (as well as run).
        for datasetType, run in self._datasets.keys():
            # And after that, we need to sort the FileDataset objects in
            # the third argument below (maybe by filename?) and the lists of
            # DatasetRef within those (I'd use the aforementioned new
            # DataCoordinate sort method, because I'm not sure dataset_id
            # values are going to be reliably deterministic themselves).
            self._backend.saveDatasets(datasetType, run, *self._datasets[datasetType, run])
        self._backend.finish()
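
# A minimal end-to-end usage sketch (hedged: ``butler`` and the query
# arguments are illustrative assumptions; `Butler.export` is expected to
# invoke `_finish` itself when the context manager exits)::
#
#     with butler.export(filename="export.yaml", transfer="copy") as export:
#         refs = butler.registry.queryDatasets(..., collections=...)
#         export.saveDatasets(refs)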