Coverage for python/lsst/daf/butler/transfers/_context.py : 10%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoExportContext"]

from typing import (
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Set,
    Union,
)
from collections import defaultdict

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DimensionElement,
    DimensionRecord,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: Set[DatasetId] = set()
        self._datasets: Dict[DatasetType, Dict[str, List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))
        self._collections: Dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not automatically
        export its child collections; these must be explicitly exported or
        already be present in the repository they are being imported into.
        """
        self._collections[name] = self._registry._get_collection_record(name)
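
    # Usage sketch (illustrative only): explicitly export a TAGGED collection
    # together with the datasets it tags; the associations are written when
    # the export context closes. The ``butler`` object and the collection and
    # dataset type names below are hypothetical.
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveCollection("my/tagged-selection")
    #         export.saveDatasets(
    #             butler.registry.queryDatasets("raw", collections="my/tagged-selection")
    #         )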

    def saveDimensionData(self, element: Union[str, DimensionElement],
                          records: Iterable[Union[dict, DimensionRecord]]) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)
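
    # Usage sketch (illustrative only): dimension records may be passed as
    # plain dicts, which are converted via ``element.RecordClass`` above. The
    # instrument name and field values shown are hypothetical and the exact
    # required fields depend on the dimension universe in use.
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDimensionData(
    #             "instrument",
    #             [{"name": "HypotheticalCam", "detector_max": 4,
    #               "class_name": "lsst.obs.hypothetical.HypotheticalCam"}],
    #         )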
141 def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
142 elements: Optional[Iterable[Union[str, DimensionElement]]] = None) -> None:
143 """Export the dimension records associated with one or more data IDs.
145 Parameters
146 ----------
147 dataIds : iterable of `DataCoordinate`.
148 Data IDs to export. For large numbers of data IDs obtained by
149 calls to `Registry.queryDataIds`, it will be much more efficient if
150 these are expanded to include records (i.e.
151 `DataCoordinate.hasRecords` returns `True`) prior to the call to
152 `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
153 elements : iterable of `DimensionElement` or `str`, optional
154 Dimension elements whose records should be exported. If `None`,
155 records for all dimensions will be exported.
156 """
157 if elements is None:
158 elements = frozenset(element for element in self._registry.dimensions.getStaticElements()
159 if element.hasTable() and element.viewOf is None)
160 else:
161 elements = set()
162 for element in elements:
163 if not isinstance(element, DimensionElement):
164 element = self._registry.dimensions[element]
165 if element.hasTable() and element.viewOf is None:
166 elements.add(element)
167 for dataId in dataIds:
168 # This is potentially quite slow, because it's approximately
169 # len(dataId.graph.elements) queries per data ID. But it's a no-op
170 # if the data ID is already expanded, and DM-26692 will add (or at
171 # least start to add / unblock) query functionality that should
172 # let us speed this up internally as well.
173 dataId = self._registry.expandDataId(dataId)
174 for record in dataId.records.values():
175 if record is not None and record.definition in elements:
176 self._records[record.definition].setdefault(record.dataId, record)
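
    # Usage sketch (illustrative only): exporting expanded data IDs avoids the
    # per-ID ``expandDataId`` round trips noted above. The instrument name and
    # the dimensions queried are hypothetical.
    #
    #     data_ids = butler.registry.queryDataIds(
    #         ["exposure", "detector"], instrument="HypotheticalCam"
    #     ).expanded()
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDataIds(data_ids)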

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
        """
        dataIds = set()
        for ref in sorted(refs):
            # The query interfaces often used to generate the refs passed here
            # don't remove duplicates, so do that here for convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            self._dataset_ids.add(ref.getCheckedId())
            assert ref.run is not None
            self._datasets[ref.datasetType][ref.run].extend(exports)
        self.saveDataIds(dataIds, elements=elements)
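
    # Usage sketch (illustrative only): export the results of a dataset query
    # and use ``rewrite`` to relocate the exported paths under a flat
    # "exported/" prefix. The dataset type name, collection name, and path
    # layout are hypothetical.
    #
    #     import os.path
    #
    #     def relocate(file_dataset):
    #         file_dataset.path = os.path.join("exported", os.path.basename(file_dataset.path))
    #         return file_dataset
    #
    #     with butler.export(directory="/tmp/export", transfer="copy") as export:
    #         export.saveDatasets(
    #             butler.registry.queryDatasets("calexp", collections="HSC/runs/example"),
    #             rewrite=relocate,
    #         )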

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun.keys():
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDataset objects for the same reason.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(collection, self._collections[collection].type,
                                                  sorted(datasetAssociations[collection]))
        self._backend.finish()

    def _computeSortedCollections(self) -> List[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `List` [ `str` ]
            Ordered list of collection names.
        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: Dict[str, List[str]] = {}
        result: List[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent for parent, children in chains.items()
                if not any(child in chains.keys() for child in children)
            }
            if not unblocked:
                raise RuntimeError("Apparent cycle in CHAINED collection "
                                   f"dependencies involving {set(chains.keys())}.")
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result
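
    # Worked example (illustrative only, hypothetical collection names): with
    # chains ``{"nightly": ["raw", "calib"], "all": ["nightly"]}`` plus plain
    # collections ``raw`` and ``calib``, the method above returns
    # ``["calib", "raw", "nightly", "all"]``: non-chained collections first
    # (sorted), then each chain only after everything it references.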

    def _computeDatasetAssociations(self) -> Dict[str, List[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists of
            structs representing an association between that collection and a
            dataset.
        """
        results: Dict[str, List[DatasetAssociation]] = defaultdict(list)
        for datasetType in self._datasets.keys():
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we need
            # a way to query for a _lot_ of explicitly given dataset_ids, and
            # the only way to make that scale up is to either upload them to a
            # temporary table or recognize when they are already in one because
            # the user passed us a QueryResult object. That's blocked by (at
            # least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results
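
# End-to-end sketch (illustrative only): an export file produced through this
# context is typically loaded into another repository with ``Butler.import_``.
# The repository paths, staging directory, and query are hypothetical.
#
#     from lsst.daf.butler import Butler
#
#     source = Butler("/path/to/source/repo")
#     with source.export(filename="export.yaml", directory="/path/to/staging",
#                        transfer="copy") as export:
#         export.saveDatasets(
#             source.registry.queryDatasets("raw", collections=...)
#         )
#
#     dest = Butler("/path/to/dest/repo", writeable=True)
#     dest.import_(filename="export.yaml", directory="/path/to/staging",
#                  transfer="copy")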