Coverage for python/lsst/daf/butler/transfers/_context.py: 10%
111 statements
coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from collections.abc import Callable, Iterable, Set
from typing import TYPE_CHECKING

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..registry import CollectionType, _ButlerRegistry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `_ButlerRegistry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `~lsst.resources.ResourcePathExpression`, optional
        Directory to pass to `Datastore.export`. Can be `None` to use
        the current working directory.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: _ButlerRegistry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: set[DatasetId] = set()
        self._datasets: dict[DatasetType, dict[str, list[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method in order to export a collection with no
        datasets. Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not automatically
        export its child collections; these must be explicitly exported or
        already be present in the repository they are being imported into.
        """
        self._collections[name] = self._registry._get_collection_record(name)
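
    # Usage sketch (illustrative only; the collection names are hypothetical).
    # Explicitly exporting a TAGGED collection records its associations with
    # datasets that are also exported, but the datasets themselves still have
    # to be exported separately:
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveCollection("my/tagged-selection")
    #         export.saveDatasets(
    #             butler.registry.queryDatasets(..., collections="my/tagged-selection")
    #         )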

    def saveDimensionData(
        self, element: str | DimensionElement, records: Iterable[dict | DimensionRecord]
    ) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `~collections.abc.Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)
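
    # Illustrative sketch only: records may be given either as DimensionRecord
    # instances or as plain dicts, which are coerced via element.RecordClass.
    # The field names assume the default dimension universe and the values are
    # hypothetical:
    #
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDimensionData(
    #             "detector",
    #             [{"instrument": "MyCam", "id": 1, "full_name": "S1", "purpose": "SCIENCE"}],
    #         )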

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient if
            these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
        """
        standardized_elements: Set[DimensionElement]
        if elements is None:
            standardized_elements = frozenset(
                element
                for element in self._registry.dimensions.getStaticElements()
                if element.hasTable() and element.viewOf is None
            )
        else:
            standardized_elements = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized_elements.add(element)
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a no-op
            # if the data ID is already expanded, and DM-26692 will add (or at
            # least start to add / unblock) query functionality that should
            # let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in standardized_elements:
                    self._records[record.definition].setdefault(record.dataId, record)
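
    # Illustrative sketch only (dimension and instrument names are
    # hypothetical). Expanding the query results up front avoids one
    # Registry.expandDataId call per data ID inside this method:
    #
    #     data_ids = butler.registry.queryDataIds(
    #         ["exposure", "detector"], instrument="MyCam"
    #     ).expanded()
    #     with butler.export(filename="export.yaml") as export:
    #         export.saveDataIds(data_ids, elements=["exposure", "detector"])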

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
        rewrite: Callable[[FileDataset], FileDataset] | None = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
        """
        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.id
            # The query interfaces that are often used to generate the refs
            # passed here don't always remove duplicates, so do that here for
            # convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`.
        # If a single exported FileDataset has multiple DatasetRefs, we save
        # it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())
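
    # Illustrative sketch only: the dataset type name, collection name, and
    # rewrite behaviour are hypothetical. A ``rewrite`` callable can adjust
    # the paths recorded in the export file, e.g. to flatten them to bare
    # file names (assumes ``import os``):
    #
    #     def _flatten(file_dataset: FileDataset) -> FileDataset:
    #         file_dataset.path = os.path.basename(str(file_dataset.path))
    #         return file_dataset
    #
    #     with butler.export(filename="export.yaml", transfer="copy") as export:
    #         refs = butler.registry.queryDatasets("calexp", collections="my/run")
    #         export.saveDatasets(refs, rewrite=_flatten)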

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make export deterministic, sort the DataCoordinate instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun:
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDatasets within each run as well.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need to
        # be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> list[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `list` [ `str` ]
            Ordered list of collection names.
        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: dict[str, list[str]] = {}
        result: list[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent
                for parent, children in chains.items()
                if not any(child in chains for child in children)
            }
            if not unblocked:
                raise RuntimeError(
                    f"Apparent cycle in CHAINED collection dependencies involving {set(chains.keys())}."
                )
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result
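
    # Behaviour sketch (hypothetical collection names): given chained
    # collections {"chain_a": ["runs/1", "chain_b"], "chain_b": ["runs/2"]}
    # plus plain collections "runs/1" and "runs/2", the method returns
    # ["runs/1", "runs/2", "chain_b", "chain_a"]: non-chained collections
    # first (sorted), then each chain only after all of its child chains.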

    def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values lists of structs
            representing an association between that collection and a dataset.
        """
        results = defaultdict(list)
        for datasetType in self._datasets:
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we need
            # a way to query for a _lot_ of explicitly given dataset_ids, and
            # the only way to make that scale up is to either upload them to a
            # temporary table or recognize when they are already in one because
            # the user passed us a QueryResult object. That's blocked by (at
            # least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results
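

# End-to-end usage sketch (all names hypothetical). RepoExportContext is not
# normally constructed directly; the methods above are driven through the
# ``Butler.export`` context manager:
#
#     with butler.export(filename="export.yaml", transfer="copy") as export:
#         export.saveCollection("refcats")
#         export.saveDataIds(
#             butler.registry.queryDataIds(["exposure"], instrument="MyCam").expanded()
#         )
#         export.saveDatasets(
#             butler.registry.queryDatasets("raw", collections="MyCam/raw/all")
#         )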