Coverage for python/lsst/daf/butler/transfers/_context.py: 10%
111 statements
coverage.py v7.2.7, created at 2023-07-12 10:56 -0700
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from collections import defaultdict
from collections.abc import Callable, Iterable, Set
from typing import TYPE_CHECKING

from ..core import (
    DataCoordinate,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord
from ._interfaces import RepoExportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression


class RepoExportContext:
50 """Public interface for exporting a subset of a data repository.
52 Instances of this class are obtained by calling `Butler.export` as the
53 value returned by that context manager::
55 with butler.export(filename="export.yaml") as export:
56 export.saveDataIds(...)
57 export.saveDatasets(...)
59 Parameters
60 ----------
61 registry : `Registry`
62 Registry to export from.
63 datastore : `Datastore`
64 Datastore to export from.
65 backend : `RepoExportBackend`
66 Implementation class for a particular export file format.
67 directory : `~lsst.resources.ResourcePathExpression`, optional
68 Directory to pass to `Datastore.export`. Can be `None` to use
69 the current working directory.
70 transfer : `str`, optional
71 Transfer mode to pass to `Datastore.export`.
72 """
74 def __init__(
75 self,
76 registry: Registry,
77 datastore: Datastore,
78 backend: RepoExportBackend,
79 *,
80 directory: ResourcePathExpression | None = None,
81 transfer: str | None = None,
82 ):
83 self._registry = registry
84 self._datastore = datastore
85 self._backend = backend
86 self._directory = directory
87 self._transfer = transfer
88 self._records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
89 self._dataset_ids: set[DatasetId] = set()
90 self._datasets: dict[DatasetType, dict[str, list[FileDataset]]] = defaultdict(
91 lambda: defaultdict(list)
92 )
93 self._collections: dict[str, CollectionRecord] = {}
95 def saveCollection(self, name: str) -> None:
96 """Export the given collection.
98 Parameters
99 ----------
100 name: `str`
101 Name of the collection.
103 Notes
104 -----
105 `~CollectionType.RUN` collections are also exported automatically when
106 any dataset referencing them is exported. They may also be explicitly
107 exported this method to export the collection with no datasets.
108 Duplicate exports of collections are ignored.
110 Exporting a `~CollectionType.TAGGED` or `~CollectionType.CALIBRATION`
111 collection will cause its associations with exported datasets to also
112 be exported, but it does not export those datasets automatically.
114 Exporting a `~CollectionType.CHAINED` collection does not automatically
115 export its child collections; these must be explicitly exported or
116 already be present in the repository they are being imported into.
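
        For example, a `~CollectionType.CHAINED` collection and its children
        could be exported explicitly, one at a time (the collection names
        here are hypothetical)::

            export.saveCollection("HSC/defaults")
            export.saveCollection("HSC/calib")
            export.saveCollection("HSC/raw/all")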
117 """
118 self._collections[name] = self._registry._get_collection_record(name)
120 def saveDimensionData(
121 self, element: str | DimensionElement, records: Iterable[dict | DimensionRecord]
122 ) -> None:
123 """Export the given dimension records associated with one or more data
124 IDs.
126 Parameters
127 ----------
128 element : `str` or `DimensionElement`
129 `DimensionElement` or `str` indicating the logical table these
130 records are from.
131 records : `~collections.abc.Iterable` [ `DimensionRecord` or `dict` ]
132 Records to export, as an iterable containing `DimensionRecord` or
133 `dict` instances.
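
        For example, records returned by `Registry.queryDimensionRecords` can
        be exported directly (the dimension and data ID values here are
        hypothetical)::

            records = butler.registry.queryDimensionRecords(
                "detector", instrument="MyCam"
            )
            export.saveDimensionData("detector", records)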
134 """
135 if not isinstance(element, DimensionElement):
136 element = self._registry.dimensions[element]
137 for record in records:
138 if not isinstance(record, DimensionRecord):
139 record = element.RecordClass(**record)
140 elif record.definition != element:
141 raise ValueError(
142 f"Mismatch between element={element.name} and "
143 f"dimension record with definition={record.definition.name}."
144 )
145 self._records[element].setdefault(record.dataId, record)
147 def saveDataIds(
148 self,
149 dataIds: Iterable[DataCoordinate],
150 *,
151 elements: Iterable[str | DimensionElement] | None = None,
152 ) -> None:
153 """Export the dimension records associated with one or more data IDs.
155 Parameters
156 ----------
157 dataIds : iterable of `DataCoordinate`.
158 Data IDs to export. For large numbers of data IDs obtained by
159 calls to `Registry.queryDataIds`, it will be much more efficient if
160 these are expanded to include records (i.e.
161 `DataCoordinate.hasRecords` returns `True`) prior to the call to
162 `saveDataIds` via e.g. ``Registry.queryDataIds(...).expanded()``.
163 elements : iterable of `DimensionElement` or `str`, optional
164 Dimension elements whose records should be exported. If `None`,
165 records for all dimensions will be exported.
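
        For example, data IDs from a registry query can be expanded before
        being exported (the dimensions and data ID values here are
        hypothetical)::

            data_ids = butler.registry.queryDataIds(
                ["exposure", "detector"], instrument="MyCam"
            ).expanded()
            export.saveDataIds(data_ids)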
166 """
167 standardized_elements: Set[DimensionElement]
168 if elements is None:
169 standardized_elements = frozenset(
170 element
171 for element in self._registry.dimensions.getStaticElements()
172 if element.hasTable() and element.viewOf is None
173 )
174 else:
175 standardized_elements = set()
176 for element in elements:
177 if not isinstance(element, DimensionElement):
178 element = self._registry.dimensions[element]
179 if element.hasTable() and element.viewOf is None:
180 standardized_elements.add(element)
181 for dataId in dataIds:
182 # This is potentially quite slow, because it's approximately
183 # len(dataId.graph.elements) queries per data ID. But it's a no-op
184 # if the data ID is already expanded, and DM-26692 will add (or at
185 # least start to add / unblock) query functionality that should
186 # let us speed this up internally as well.
187 dataId = self._registry.expandDataId(dataId)
188 for record in dataId.records.values():
189 if record is not None and record.definition in standardized_elements:
190 self._records[record.definition].setdefault(record.dataId, record)
192 def saveDatasets(
193 self,
194 refs: Iterable[DatasetRef],
195 *,
196 elements: Iterable[str | DimensionElement] | None = None,
197 rewrite: Callable[[FileDataset], FileDataset] | None = None,
198 ) -> None:
199 """Export one or more datasets.
201 This automatically exports any `DatasetType`, `~CollectionType.RUN`
202 collections, and dimension records associated with the datasets.
204 Parameters
205 ----------
206 refs : iterable of `DatasetRef`
207 References to the datasets to export. Their `DatasetRef.id`
208 attributes must not be `None`. Duplicates are automatically
209 ignored. Nested data IDs must have `DataCoordinate.hasRecords`
210 return `True`. If any reference is to a component dataset, the
211 parent will be exported instead.
212 elements : iterable of `DimensionElement` or `str`, optional
213 Dimension elements whose records should be exported; this is
214 forwarded to `saveDataIds` when exporting the data IDs of the
215 given datasets.
216 rewrite : callable, optional
217 A callable that takes a single `FileDataset` argument and returns
218 a modified `FileDataset`. This is typically used to rewrite the
219 path generated by the datastore. If `None`, the `FileDataset`
220 returned by `Datastore.export` will be used directly.
222 Notes
223 -----
224 At present, this only associates datasets with `~CollectionType.RUN`
225 collections. Other collections will be included in the export in the
226 future (once `Registry` provides a way to look up that information).
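
        For example, the ``rewrite`` callable can be used to strip a
        site-specific prefix from the exported paths (the dataset type name,
        collection name, and prefix here are hypothetical)::

            def strip_prefix(file_dataset: FileDataset) -> FileDataset:
                # FileDataset.path may be a `str` or a ResourcePath-like
                # object, so coerce it to `str` before rewriting.
                file_dataset.path = str(file_dataset.path).removeprefix("/old/root/")
                return file_dataset

            refs = butler.registry.queryDatasets("raw", collections="MyCam/raw/all")
            export.saveDatasets(refs, rewrite=strip_prefix)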
227 """
228 data_ids = set()
229 refs_to_export = {}
230 for ref in sorted(refs):
231 dataset_id = ref.id
232 # The query interfaces that are often used to generate the refs
233 # passed here often don't remove duplicates, so do that here for
234 # convenience.
235 if dataset_id in self._dataset_ids or dataset_id in refs_to_export:
236 continue
237 # Also convert components to composites.
238 if ref.isComponent():
239 ref = ref.makeCompositeRef()
240 data_ids.add(ref.dataId)
241 refs_to_export[dataset_id] = ref
242 # Do a vectorized datastore export, which might be a lot faster than
243 # one-by-one.
244 exports = self._datastore.export(
245 refs_to_export.values(),
246 directory=self._directory,
247 transfer=self._transfer,
248 )
249 # Export associated data IDs.
250 self.saveDataIds(data_ids, elements=elements)
251 # Rewrite export filenames if desired, and then save them to the
252 # data structure we'll write in `_finish`.
253 # If a single exported FileDataset has multiple DatasetRefs, we save
254 # it with each of them.
255 for file_dataset in exports:
256 if rewrite is not None:
257 file_dataset = rewrite(file_dataset)
258 for ref in file_dataset.refs:
259 assert ref.run is not None
260 self._datasets[ref.datasetType][ref.run].append(file_dataset)
261 self._dataset_ids.update(refs_to_export.keys())
263 def _finish(self) -> None:
264 """Delegate to the backend to finish the export process.
266 For use by `Butler.export` only.
267 """
268 for element in self._registry.dimensions.sorted(self._records.keys()):
269 # To make export deterministic sort the DataCoordinate instances.
270 r = self._records[element]
271 self._backend.saveDimensionData(element, *[r[dataId] for dataId in sorted(r.keys())])
272 for datasetsByRun in self._datasets.values():
273 for run in datasetsByRun.keys():
274 self._collections[run] = self._registry._get_collection_record(run)
275 for collectionName in self._computeSortedCollections():
276 doc = self._registry.getCollectionDocumentation(collectionName)
277 self._backend.saveCollection(self._collections[collectionName], doc)
278 # Sort the dataset types and runs before exporting to ensure
279 # reproducible order in export file.
280 for datasetType in sorted(self._datasets.keys()):
281 for run in sorted(self._datasets[datasetType].keys()):
282 # Sort the FileDataset
283 records = sorted(self._datasets[datasetType][run])
284 self._backend.saveDatasets(datasetType, run, *records)
285 # Export associations between datasets and collections. These need to
286 # be sorted (at two levels; they're dicts) or created more
287 # deterministically, too, which probably involves more data ID sorting.
288 datasetAssociations = self._computeDatasetAssociations()
289 for collection in sorted(datasetAssociations):
290 self._backend.saveDatasetAssociations(
291 collection, self._collections[collection].type, sorted(datasetAssociations[collection])
292 )
293 self._backend.finish()
295 def _computeSortedCollections(self) -> list[str]:
296 """Sort collections in a way that is both deterministic and safe
297 for registering them in a new repo in the presence of nested chains.
299 This method is intended for internal use by `RepoExportContext` only.
301 Returns
302 -------
303 names: `List` [ `str` ]
304 Ordered list of collection names.
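
        For example, with run collections ``run1`` and ``run2`` and a
        `~CollectionType.CHAINED` collection ``chain`` containing both (all
        names hypothetical), the result would be::

            ["run1", "run2", "chain"]

        Children always sort before the chains that contain them.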
305 """
306 # Split collections into CHAINED and everything else, and just
307 # sort "everything else" lexicographically since there are no
308 # dependencies.
309 chains: dict[str, list[str]] = {}
310 result: list[str] = []
311 for record in self._collections.values():
312 if record.type is CollectionType.CHAINED:
313 assert isinstance(record, ChainedCollectionRecord)
314 chains[record.name] = list(record.children)
315 else:
316 result.append(record.name)
317 result.sort()
318 # Sort all chains topologically, breaking ties lexicographically.
319 # Append these to 'result' and remove them from 'chains' as we go.
320 while chains:
321 unblocked = {
322 parent
323 for parent, children in chains.items()
324 if not any(child in chains.keys() for child in children)
325 }
326 if not unblocked:
327 raise RuntimeError(
328 f"Apparent cycle in CHAINED collection dependencies involving {unblocked}."
329 )
330 result.extend(sorted(unblocked))
331 for name in unblocked:
332 del chains[name]
333 return result
335 def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]:
336 """Return datasets-collection associations, grouped by association.
338 This queries for all associations between exported datasets and
339 exported TAGGED or CALIBRATION collections and is intended to be run
340 only by `_finish`, as this ensures all collections and all datasets
341 have already been exported and hence the order in which they are
342 exported does not matter.
344 Returns
345 -------
346 associations : `dict` [ `str`, `list` [ `DatasetAssociation` ] ]
347 Dictionary keyed by collection name, with values lists of structs
348 representing an association between that collection and a dataset.
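
        For example (collection names hypothetical), the result has the
        shape::

            {
                "tagged/best-seeing": [DatasetAssociation(...), ...],
                "calib/curated": [DatasetAssociation(...), ...],
            }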
349 """
350 results = defaultdict(list)
351 for datasetType in self._datasets.keys():
352 # We query for _all_ datasets of each dataset type we export, in
353 # the specific collections we are exporting. The worst-case
354 # efficiency of this is _awful_ (i.e. big repo, exporting a tiny
355 # subset). But we don't have any better options right now; we need
356 # a way to query for a _lot_ of explicitly given dataset_ids, and
357 # the only way to make that scale up is to either upload them to a
358 # temporary table or recognize when they are already in one because
359 # the user passed us a QueryResult object. That's blocked by (at
360 # least) DM-26692.
361 collectionTypes = {CollectionType.TAGGED}
362 if datasetType.isCalibration():
363 collectionTypes.add(CollectionType.CALIBRATION)
364 associationIter = self._registry.queryDatasetAssociations(
365 datasetType,
366 collections=self._collections.keys(),
367 collectionTypes=collectionTypes,
368 flattenChains=False,
369 )
370 for association in associationIter:
371 if association.ref.id in self._dataset_ids:
372 results[association.collection].append(association)
373 return results