Coverage for python/lsst/daf/butler/transfers/_context.py: 12%
115 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from collections.abc import Callable, Iterable, Set
from typing import TYPE_CHECKING

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetRef
from .._dataset_type import DatasetType
from .._file_dataset import FileDataset
from ..datastore import Datastore
from ..dimensions import DataCoordinate, DimensionElement, DimensionRecord
from ..registry import CollectionType
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

    from ..registry.sql_registry import SqlRegistry
    from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `SqlRegistry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `~lsst.resources.ResourcePathExpression`, optional
        Directory to pass to `Datastore.export`. Can be `None` to use
        the current working directory.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(
        self,
        registry: SqlRegistry,
        datastore: Datastore,
        backend: RepoExportBackend,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
    ):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids: set[DatasetId] = set()
        self._datasets: dict[DatasetType, dict[str, list[FileDataset]]] = defaultdict(
            lambda: defaultdict(list)
        )
        self._collections: dict[str, CollectionRecord] = {}

    def saveCollection(self, name: str) -> None:
        """Export the given collection.

        Parameters
        ----------
        name : `str`
            Name of the collection.

        Notes
        -----
        `~CollectionType.RUN` collections are also exported automatically when
        any dataset referencing them is exported. They may also be explicitly
        exported via this method to export the collection with no datasets.
        Duplicate exports of collections are ignored.

        Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
        collection will cause its associations with exported datasets to also
        be exported, but it does not export those datasets automatically.

        Exporting a `~CollectionType.CHAINED` collection does not
        automatically export its child collections; these must be explicitly
        exported or already be present in the repository they are being
        imported into.
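
        Examples
        --------
        A minimal sketch, assuming a repository containing a
        `~CollectionType.CHAINED` collection named ``refcats`` with a single
        child ``refcats/gaia``; both names are illustrative only::

            with butler.export(filename="export.yaml") as export:
                # Children of a CHAINED collection are not exported
                # automatically, so save them explicitly first.
                export.saveCollection("refcats/gaia")
                export.saveCollection("refcats")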
        """
        self._collections[name] = self._registry._get_collection_record(name)

    def saveDimensionData(
        self, element: str | DimensionElement, records: Iterable[dict | DimensionRecord]
    ) -> None:
        """Export the given dimension records associated with one or more
        data IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `~collections.abc.Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
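
        Examples
        --------
        A minimal sketch, assuming the records are obtained from a registry
        query; the element name and instrument are illustrative only::

            records = butler.registry.queryDimensionRecords(
                "detector", instrument="HSC"
            )
            with butler.export(filename="export.yaml") as export:
                export.saveDimensionData("detector", records)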
        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)

    def saveDataIds(
        self,
        dataIds: Iterable[DataCoordinate],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
    ) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export. For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient
            if these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported. If `None`,
            records for all dimensions will be exported.
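
        Examples
        --------
        A minimal sketch, assuming the data IDs come from a registry query;
        the dimension names and instrument are illustrative only. Expanding
        the query results first avoids per-data-ID lookups here::

            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], instrument="HSC"
            ).expanded()
            with butler.export(filename="export.yaml") as export:
                export.saveDataIds(data_ids, elements=["exposure", "detector"])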
        """
        standardized_elements: Set[DimensionElement]
        if elements is None:
            standardized_elements = frozenset(
                element
                for element in self._registry.dimensions.getStaticElements()
                if element.hasTable() and element.viewOf is None
            )
        else:
            standardized_elements = set()
            for element in elements:
                if not isinstance(element, DimensionElement):
                    element = self._registry.dimensions[element]
                if element.hasTable() and element.viewOf is None:
                    standardized_elements.add(element)
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID. But it's a
            # no-op if the data ID is already expanded, and DM-26692 will add
            # (or at least start to add / unblock) query functionality that
            # should let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in standardized_elements:
                    self._records[record.definition].setdefault(record.dataId, record)

    def saveDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        elements: Iterable[str | DimensionElement] | None = None,
        rewrite: Callable[[FileDataset], FileDataset] | None = None,
    ) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export. Their `DatasetRef.id`
            attributes must not be `None`. Duplicates are automatically
            ignored. Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`. If any reference is to a component dataset, the
            parent will be exported instead.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`. This is typically used to rewrite the
            path generated by the datastore. If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly.

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections. Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
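
        Examples
        --------
        A minimal sketch, assuming the refs come from a registry query; the
        dataset type name and collection are illustrative only::

            refs = butler.registry.queryDatasets(
                "calexp", collections=["HSC/runs/RC2"]
            )
            with butler.export(filename="export.yaml", transfer="copy") as export:
                export.saveDatasets(refs)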
        """
        data_ids = set()
        refs_to_export = {}
        for ref in sorted(refs):
            dataset_id = ref.id
            # The query interfaces often used to generate the refs passed
            # here don't remove duplicates, so do that here for convenience.
            if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
                continue
            # Also convert components to composites.
            if ref.isComponent():
                ref = ref.makeCompositeRef()
            data_ids.add(ref.dataId)
            refs_to_export[dataset_id] = ref
        # Do a vectorized datastore export, which might be a lot faster than
        # one-by-one.
        exports = self._datastore.export(
            refs_to_export.values(),
            directory=self._directory,
            transfer=self._transfer,
        )
        # Export associated data IDs.
        self.saveDataIds(data_ids, elements=elements)
        # Rewrite export filenames if desired, and then save them to the
        # data structure we'll write in `_finish`. If a single exported
        # FileDataset has multiple DatasetRefs, we save it with each of them.
        for file_dataset in exports:
            if rewrite is not None:
                file_dataset = rewrite(file_dataset)
            for ref in file_dataset.refs:
                assert ref.run is not None
                self._datasets[ref.datasetType][ref.run].append(file_dataset)
        self._dataset_ids.update(refs_to_export.keys())

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make the export deterministic, sort the DataCoordinate
            # instances.
            r = self._records[element]
            self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
        for datasetsByRun in self._datasets.values():
            for run in datasetsByRun:
                self._collections[run] = self._registry._get_collection_record(run)
        for collectionName in self._computeSortedCollections():
            doc = self._registry.getCollectionDocumentation(collectionName)
            self._backend.saveCollection(self._collections[collectionName], doc)
        # Sort the dataset types and runs before exporting to ensure
        # reproducible order in the export file.
        for datasetType in sorted(self._datasets.keys()):
            for run in sorted(self._datasets[datasetType].keys()):
                # Sort the FileDatasets for deterministic output.
                records = sorted(self._datasets[datasetType][run])
                self._backend.saveDatasets(datasetType, run, *records)
        # Export associations between datasets and collections. These need
        # to be sorted (at two levels; they're dicts) or created more
        # deterministically, too, which probably involves more data ID
        # sorting.
        datasetAssociations = self._computeDatasetAssociations()
        for collection in sorted(datasetAssociations):
            self._backend.saveDatasetAssociations(
                collection, self._collections[collection].type, sorted(datasetAssociations[collection])
            )
        self._backend.finish()

    def _computeSortedCollections(self) -> list[str]:
        """Sort collections in a way that is both deterministic and safe
        for registering them in a new repo in the presence of nested chains.

        This method is intended for internal use by `RepoExportContext` only.

        Returns
        -------
        names : `list` [ `str` ]
            Ordered list of collection names.
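
        Examples
        --------
        A sketch of the intended ordering, using made-up collection names:
        if the exported collections are a chain ``coadds`` with exported
        children ``runs/a`` and ``runs/b``, plus an unrelated
        ``tagged/best``, the result is::

            ["runs/a", "runs/b", "tagged/best", "coadds"]

        Non-chained collections come first in lexicographic order, and each
        chain appears only after all of its children.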
        """
        # Split collections into CHAINED and everything else, and just
        # sort "everything else" lexicographically since there are no
        # dependencies.
        chains: dict[str, list[str]] = {}
        result: list[str] = []
        for record in self._collections.values():
            if record.type is CollectionType.CHAINED:
                assert isinstance(record, ChainedCollectionRecord)
                chains[record.name] = list(record.children)
            else:
                result.append(record.name)
        result.sort()
        # Sort all chains topologically, breaking ties lexicographically.
        # Append these to 'result' and remove them from 'chains' as we go.
        while chains:
            unblocked = {
                parent
                for parent, children in chains.items()
                if not any(child in chains for child in children)
            }
            if not unblocked:
                raise RuntimeError(
                    f"Apparent cycle in CHAINED collection dependencies involving {set(chains)}."
                )
            result.extend(sorted(unblocked))
            for name in unblocked:
                del chains[name]
        return result

    def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]:
        """Return dataset-collection associations, grouped by collection.

        This queries for all associations between exported datasets and
        exported TAGGED or CALIBRATION collections and is intended to be run
        only by `_finish`, as this ensures all collections and all datasets
        have already been exported and hence the order in which they are
        exported does not matter.

        Returns
        -------
        associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
            Dictionary keyed by collection name, with values that are lists
            of structs representing an association between that collection
            and a dataset.
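
        Examples
        --------
        A sketch of the returned structure, with made-up collection names
        (purely illustrative)::

            {
                "tagged/best": [<DatasetAssociation>, ...],
                "calib/defaults": [<DatasetAssociation>, ...],
            }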
        """
        results = defaultdict(list)
        for datasetType in self._datasets:
            # We query for _all_ datasets of each dataset type we export, in
            # the specific collections we are exporting. The worst-case
            # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
            # subset). But we don't have any better options right now; we
            # need a way to query for a _lot_ of explicitly given
            # dataset_ids, and the only way to make that scale up is to
            # either upload them to a temporary table or recognize when they
            # are already in one because the user passed us a QueryResult
            # object. That's blocked by (at least) DM-26692.
            collectionTypes = {CollectionType.TAGGED}
            if datasetType.isCalibration():
                collectionTypes.add(CollectionType.CALIBRATION)
            associationIter = self._registry.queryDatasetAssociations(
                datasetType,
                collections=self._collections.keys(),
                collectionTypes=collectionTypes,
                flattenChains=False,
            )
            for association in associationIter:
                if association.ref.id in self._dataset_ids:
                    results[association.collection].append(association)
        return results