
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RepoExportContext"]

from typing import (
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)
from collections import defaultdict

from ..core import (
    DataCoordinate,
    DimensionElement,
    DimensionRecord,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
)
from ..registry import Registry
from ._interfaces import RepoExportBackend


class RepoExportContext:
    """Public interface for exporting a subset of a data repository.

    Instances of this class are obtained by calling `Butler.export` as the
    value returned by that context manager::

        with butler.export(filename="export.yaml") as export:
            export.saveDataIds(...)
            export.saveDatasets(...)

    Parameters
    ----------
    registry : `Registry`
        Registry to export from.
    datastore : `Datastore`
        Datastore to export from.
    backend : `RepoExportBackend`
        Implementation class for a particular export file format.
    directory : `str`, optional
        Directory to pass to `Datastore.export`.
    transfer : `str`, optional
        Transfer mode to pass to `Datastore.export`.
    """

    def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExportBackend, *,
                 directory: Optional[str] = None, transfer: Optional[str] = None):
        self._registry = registry
        self._datastore = datastore
        self._backend = backend
        self._directory = directory
        self._transfer = transfer
        self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
        self._dataset_ids = set()
        self._datasets: Dict[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)

    def saveDimensionData(self, element: Union[str, DimensionElement],
                          records: Iterable[Union[dict, DimensionRecord]]) -> None:
        """Export the given dimension records associated with one or more data
        IDs.

        Parameters
        ----------
        element : `str` or `DimensionElement`
            `DimensionElement` or `str` indicating the logical table these
            records are from.
        records : `Iterable` [ `DimensionRecord` or `dict` ]
            Records to export, as an iterable containing `DimensionRecord` or
            `dict` instances.
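
        Examples
        --------
        A minimal sketch of passing records as plain dictionaries.  The
        ``instrument`` element and the record fields shown are illustrative
        assumptions; the required fields are defined by the repository's
        dimension configuration::

            export.saveDimensionData(
                "instrument",
                [{"name": "MyCam", "class_name": "my_obs.MyCam"}],
            )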

        """
        if not isinstance(element, DimensionElement):
            element = self._registry.dimensions[element]
        for record in records:
            if not isinstance(record, DimensionRecord):
                record = element.RecordClass(**record)
            elif record.definition != element:
                raise ValueError(
                    f"Mismatch between element={element.name} and "
                    f"dimension record with definition={record.definition.name}."
                )
            self._records[element].setdefault(record.dataId, record)

    def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
                    elements: Optional[Iterable[Union[str, DimensionElement]]] = None) -> None:
        """Export the dimension records associated with one or more data IDs.

        Parameters
        ----------
        dataIds : iterable of `DataCoordinate`
            Data IDs to export.  For large numbers of data IDs obtained by
            calls to `Registry.queryDataIds`, it will be much more efficient
            if these are expanded to include records (i.e.
            `DataCoordinate.hasRecords` returns `True`) prior to the call to
            `saveDataIds`, e.g. via ``Registry.queryDataIds(...).expanded()``.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported.  If `None`,
            records for all dimensions will be exported.
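
        Examples
        --------
        A sketch of the expanded-query pattern described above.  The
        dimension names, the ``"raw"`` dataset type, and the use of ``...``
        for all collections are illustrative assumptions about the
        repository::

            dataIds = registry.queryDataIds(
                ["exposure", "detector"], datasets="raw", collections=...,
            ).expanded()
            export.saveDataIds(dataIds)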

        """
        if elements is None:
            elements = frozenset(element for element in self._registry.dimensions.getStaticElements()
                                 if element.hasTable() and element.viewOf is None)
        else:
            elements = frozenset(
                self._registry.dimensions[element] if not isinstance(element, DimensionElement) else element
                for element in elements
            )
        for dataId in dataIds:
            # This is potentially quite slow, because it's approximately
            # len(dataId.graph.elements) queries per data ID.  But it's a
            # no-op if the data ID is already expanded, and DM-26692 will add
            # (or at least start to add / unblock) query functionality that
            # should let us speed this up internally as well.
            dataId = self._registry.expandDataId(dataId)
            for record in dataId.records.values():
                if record is not None and record.definition in elements:
                    self._records[record.definition].setdefault(record.dataId, record)

    def saveDatasets(self, refs: Iterable[DatasetRef], *,
                     elements: Optional[Iterable[Union[str, DimensionElement]]] = None,
                     rewrite: Optional[Callable[[FileDataset], FileDataset]] = None) -> None:
        """Export one or more datasets.

        This automatically exports any `DatasetType`, `~CollectionType.RUN`
        collections, and dimension records associated with the datasets.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            References to the datasets to export.  Their `DatasetRef.id`
            attributes must not be `None`.  Duplicates are automatically
            ignored.  Nested data IDs must have `DataCoordinate.hasRecords`
            return `True`.
        elements : iterable of `DimensionElement` or `str`, optional
            Dimension elements whose records should be exported; this is
            forwarded to `saveDataIds` when exporting the data IDs of the
            given datasets.
        rewrite : callable, optional
            A callable that takes a single `FileDataset` argument and returns
            a modified `FileDataset`.  This is typically used to rewrite the
            path generated by the datastore.  If `None`, the `FileDataset`
            returned by `Datastore.export` will be used directly (see the
            example below).

        Notes
        -----
        At present, this only associates datasets with `~CollectionType.RUN`
        collections.  Other collections will be included in the export in the
        future (once `Registry` provides a way to look up that information).
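
        Examples
        --------
        A minimal sketch of a ``rewrite`` callable that relocates exported
        files under a hypothetical ``exports/`` prefix, assuming only that
        `FileDataset.path` can be reassigned::

            import os

            def relocate(dataset: FileDataset) -> FileDataset:
                dataset.path = os.path.join("exports", dataset.path)
                return dataset

            export.saveDatasets(refs, rewrite=relocate)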

        """
        dataIds = set()
        for ref in refs:
            # The query interfaces often used to generate the refs passed
            # here don't necessarily remove duplicates, so do that here for
            # convenience.
            if ref.id in self._dataset_ids:
                continue
            dataIds.add(ref.dataId)
            # `exports` is a single-element list here, because we anticipate
            # a future where more than just Datastore.export has a vectorized
            # API and we can pull this out of the loop.
            exports = self._datastore.export([ref], directory=self._directory, transfer=self._transfer)
            if rewrite is not None:
                exports = [rewrite(export) for export in exports]
            self._dataset_ids.add(ref.getCheckedId())
            assert ref.run is not None
            self._datasets[ref.datasetType, ref.run].extend(exports)
        self.saveDataIds(dataIds, elements=elements)

    def _finish(self) -> None:
        """Delegate to the backend to finish the export process.

        For use by `Butler.export` only.
        """
        for element in self._registry.dimensions.sorted(self._records.keys()):
            # To make export deterministic (DM-26324), the next step is to
            # implement a way to sort DataCoordinates, then transform the
            # second argument to:
            #     *[r[dataId] for dataId in sorted(r.keys())]
            # where
            #     r = self._records[element]
            # (continued below).
            self._backend.saveDimensionData(element, *self._records[element].values())
        # Then we need to either make DatasetType sortable directly or sort
        # the iteration below by its name (as well as run).
        for datasetType, run in self._datasets.keys():
            # And after that, we need to sort the FileDataset objects in the
            # third argument below (maybe by filename?) and the lists of
            # DatasetRef within those (I'd use the aforementioned new
            # DataCoordinate sort method, because I'm not sure dataset_id
            # values are going to be reliably deterministic themselves).
            self._backend.saveDatasets(datasetType, run, *self._datasets[datasetType, run])
        self._backend.finish()