Coverage for python/lsst/daf/butler/transfers/_context.py : 10%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from typing import (
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Set,
    Union,
)
from collections import defaultdict

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DimensionElement,
    DimensionRecord,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: Set[int] = set()
        self._datasets: Dict[DatasetType, Dict[str, List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))
        self._collections: Dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported.  They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not
        automatically export its child collections; these must be explicitly
        exported or already be present in the repository they are being
        imported into.
        """
        self._collections[name] = self._registry._collections.find(name)
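
    # Illustrative sketch (not part of the library): exporting a TAGGED
    # collection together with the datasets it tags.  The collection name
    # "best_seeing" and the dataset type name "calexp" are hypothetical; the
    # associations are only written out because the datasets themselves are
    # also exported via saveDatasets().
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveCollection("best_seeing")
    #         export.saveDatasets(
    #             butler.registry.queryDatasets("calexp", collections="best_seeing")
    #         )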

    def saveDimensionData(self, element: Union[str, DimensionElement],
                          records: Iterable[Union[dict, DimensionRecord]]) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)
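
    # Illustrative sketch (not part of the library): records may be passed as
    # plain dicts, which are converted via the element's RecordClass.  The
    # "instrument" element exists in the default dimension universe, but the
    # field names and values shown here are hypothetical.
    #
    #     export.saveDimensionData(
    #         "instrument",
    #         [{"name": "MyCam", "visit_max": 999999, "exposure_max": 999999,
    #           "detector_max": 99}],
    #     )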

    def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
                    elements: Optional[Iterable[Union[str, DimensionElement]]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export.  For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient
            if these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported.  If `None`,
            records for all dimensions will be exported.
        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.getStaticElements()
                                 if element.hasTable() and element.viewOf is None)
        else:
            # Standardize the caller-provided elements without clobbering the
            # iterable we are reading from.
            standardized = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized.add(element)
            elements = standardized
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID.  But it's a
            # no-op if the data ID is already expanded, and DM-26692 will add
            # (or at least start to add / unblock) query functionality that
            # should let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in elements:
                    self._records[record.definition].setdefault(record.dataId, record)
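
    # Illustrative sketch (not part of the library): exporting the dimension
    # records behind a query result.  Calling .expanded() up front avoids the
    # per-data-ID expandDataId() queries noted above; the "exposure" element
    # and the instrument/visit values are hypothetical.
    #
    #     dataIds = butler.registry.queryDataIds(
    #         ["exposure"], instrument="MyCam", visit=42
    #     ).expanded()
    #     export.saveDataIds(dataIds)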

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export.  Their `DatasetRef.id`
            attributes must not be `None`.  Duplicates are automatically
            ignored.  Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`.  This is typically used to rewrite the
            path generated by the datastore.  If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections.  Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
        """
        dataIds = set()
        for ref in refs:
            # The query interfaces often used to generate the refs passed
            # here don't necessarily remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            self._dataset_ids.add(ref.getCheckedId())
            assert ref.run is not None
            self._datasets[ref.datasetType][ref.run].extend(exports)
        self.saveDataIds(dataIds, elements=elements)
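
    # Illustrative sketch (not part of the library): using ``rewrite`` to
    # flatten exported file paths.  The helper below is hypothetical and
    # assumes FileDataset.path can simply be reassigned (FileDataset is a
    # mutable struct in this package).
    #
    #     import os.path
    #
    #     def flatten(dataset: FileDataset) -> FileDataset:
    #         dataset.path = os.path.basename(dataset.path)
    #         return dataset
    #
    #     export.saveDatasets(refs, rewrite=flatten)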

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make export deterministic (DM-26324), the next step is to
            # implement a way to sort DataCoordinates, then transform the
            # second argument to:
            #     *[r[dataId] for dataId in sorted(r.keys())]
            # where
            #     r = self._records[element]
            # (continued below).
            self._backend.saveDimensionData(element, *self._records[element].values())
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun.keys():
                self._collections[run] = self._registry._collections.find(run)
        for collectionName in self._computeSortedCollections():
            self._backend.saveCollection(self._collections[collectionName])
        # Continuing for DM-26324: we then need to either make DatasetType
        # sortable directly or sort the iteration below by its name (as well
        # as by run).
        for datasetType in self._datasets.keys():
            for run in self._datasets[datasetType].keys():
                # Again for DM-26324: after that, we need to sort the
                # FileDataset objects in the third argument below (maybe by
                # filename?) and the lists of DatasetRef within those (I'd
                # use the aforementioned new DataCoordinate sort method,
                # because I'm not sure dataset_id values are going to be
                # reliably deterministic themselves).
                records = self._datasets[datasetType][run]
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections.  These need
        # to be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID
        # sorting.
        for collection, associations in self._computeDatasetAssociations().items():
            self._backend.saveDatasetAssociations(collection, self._collections[collection].type,
                                                  associations)
        self._backend.finish()

    def _computeSortedCollections(self) -> List[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `List` [ `str` ]
            Ordered list of collection names.
        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: Dict[str, List[str]] = {}
        result: List[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = [child for child, _ in record.children]
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent for parent, children in chains.items()
                if not any(child in chains.keys() for child in children)
            }
            if not unblocked:
                raise RuntimeError("Apparent cycle in CHAINED collection "
                                   f"dependencies involving {set(chains.keys())}.")
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result
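
    # Illustrative sketch (not part of the library) of the ordering above: a
    # standalone Kahn-style pass over a toy chain mapping.  The collection
    # names are hypothetical; a parent chain is emitted only after every
    # child that is itself a chain, so children can be registered first.
    #
    #     chains = {"top": ["middle", "runs/a"], "middle": ["runs/b"]}
    #     ordered = []
    #     while chains:
    #         ready = sorted(p for p, cs in chains.items()
    #                        if not any(c in chains for c in cs))
    #         ordered.extend(ready)
    #         for name in ready:
    #             del chains[name]
    #     # ordered == ["middle", "top"]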

    def _computeDatasetAssociations(self) -> Dict[str, List[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections, and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists
            of structs representing an association between that collection
            and a dataset.
        """
        results = defaultdict(list)
        for datasetType in self._datasets.keys():
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting.  The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset).  But we don't have any better options right now; we
            # need a way to query for a _lot_ of explicitly given
            # dataset_ids, and the only way to make that scale up is to
            # either upload them to a temporary table or recognize when they
            # are already in one because the user passed us a QueryResult
            # object.  That's blocked by (at least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results
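
    # Illustrative sketch (not part of the library): the same registry query
    # used above, shown standalone.  The dataset type name "bias" and
    # collection name "calib" are hypothetical; the loop above additionally
    # keeps only associations whose datasets were already exported.
    #
    #     for assoc in registry.queryDatasetAssociations(
    #         "bias",
    #         collections=["calib"],
    #         collectionTypes={CollectionType.CALIBRATION},
    #         flattenChains=False,
    #     ):
    #         print(assoc.collection, assoc.ref, assoc.timespan)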