Coverage for python/lsst/daf/butler/transfers/_context.py: 9%
110 statements
coverage.py v6.5.0, created at 2022-12-08 14:18 -0800
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from typing import Callable, Dict, Iterable, List, Optional, Set, Union

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)
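
    A directory and transfer mode may also be passed to `Butler.export` so
    that the datastore transfers the exported dataset files as well; for
    example (the directory name and ``"copy"`` transfer mode here are
    illustrative)::

        with butler.export(directory="exports", filename="export.yaml", transfer="copy") as export:
            export.saveDatasets(...)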

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: Registry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: Optional[str] = None,
        transfer: Optional[str] = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: Set[DatasetId] = set()
        self._datasets: Dict[DatasetType, Dict[str, List[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: Dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not
        automatically export its child collections; these must be explicitly
        exported or already be present in the repository they are being
        imported into.
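
        For example (the collection name is illustrative), a tagged collection
        can be exported explicitly along with the datasets it tags, so that
        its associations are included in the export::

            with butler.export(filename="export.yaml") as export:
                export.saveCollection("my/tagged")
                export.saveDatasets(
                    butler.registry.queryDatasets(..., collections="my/tagged")
                )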
        """
        self._collections[name] = self._registry._get_collection_record(name)

    def saveDimensionData(
        self, element: Union[str, DimensionElement], records: Iterable[Union[dict, DimensionRecord]]
    ) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
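
        Notes
        -----
        For example (the element name and data ID constraint are
        illustrative), records obtained from a registry query can be passed
        in directly::

            records = butler.registry.queryDimensionRecords("detector", instrument="HSC")
            export.saveDimensionData("detector", records)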
        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
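
        Notes
        -----
        For example (the dimensions and data ID constraint are illustrative),
        data IDs from a registry query can be expanded and exported in bulk::

            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], instrument="HSC"
            ).expanded()
            export.saveDataIds(data_ids)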
        """
        if elements is None:
            elements = frozenset(
                element
                for element in self._registry.dimensions.getStaticElements()
                if element.hasTable() and element.viewOf is None
            )
        else:
            # Standardize and filter the given elements; don't rebind the
            # `elements` name until we're done iterating over the argument.
            standardized = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized.add(element)
            elements = standardized
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a no-op
            # if the data ID is already expanded, and DM-26692 will add (or at
            # least start to add / unblock) query functionality that should
            # let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in elements:
                    self._records[record.definition].setdefault(record.dataId, record)

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
        rewrite: Optional[Callable[[FileDataset], FileDataset]] = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
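
        As an example of the ``rewrite`` argument (the prefix is illustrative,
        and the exported paths are assumed to be plain strings), a callable
        can strip a common directory prefix from the paths produced by the
        datastore::

            import os.path

            def strip_prefix(dataset: FileDataset) -> FileDataset:
                dataset.path = os.path.relpath(dataset.path, start="/datasets/staging")
                return dataset

            export.saveDatasets(refs, rewrite=strip_prefix)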
        """
        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.getCheckedId()
            # The query interfaces that are typically used to generate the
            # refs passed here don't necessarily remove duplicates, so do that
            # here for convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`.
        # If a single exported FileDataset has multiple DatasetRefs, we save
        # it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun.keys():
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure a
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDataset objects themselves for the same reason.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> List[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `List` [ `str` ]
            Ordered list of collection names.
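
        Notes
        -----
        For example (collection names are illustrative), given the runs
        ``run1`` and ``run2`` plus the chains ``chainA -> [chainB, run1]`` and
        ``chainB -> [run2]``, the result is
        ``["run1", "run2", "chainB", "chainA"]``: non-chained collections
        first in lexicographic order, then chains ordered so that every chain
        appears after all of its children.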
        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: Dict[str, List[str]] = {}
        result: List[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent
                for parent, children in chains.items()
                if not any(child in chains.keys() for child in children)
            }
            if not unblocked:
                raise RuntimeError(
                    f"Apparent cycle in CHAINED collection dependencies involving {set(chains.keys())}."
                )
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result

    def _computeDatasetAssociations(self) -> Dict[str, List[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists of
            structs, each representing an association between that collection
            and a dataset.
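
            For example (the collection name and contents are illustrative)::

                {
                    "my/tagged": [
                        DatasetAssociation(ref=..., collection="my/tagged", ...),
                        ...,
                    ],
                }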
        """
        results = defaultdict(list)
        for datasetType in self._datasets.keys():
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we need
            # a way to query for a _lot_ of explicitly given dataset_ids, and
            # the only way to make that scale up is to either upload them to a
            # temporary table or recognize when they are already in one because
            # the user passed us a QueryResult object. That's blocked by (at
            # least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results