
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from ..core import (
    DatasetRef,
    DatasetType,
    DataCoordinate,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..core.utils import iterable
from ..core.named import NamedValueSet
from ..registry import Registry
from ._interfaces import RepoExportBackend, RepoImportBackend


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )

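
# For reference, the document written by `YamlRepoExportBackend.finish` (and
# read back by `YamlRepoImportBackend` below) has roughly this shape. This is
# an illustrative outline only: the element, dataset type, storage class, and
# run names are made up, and "[...]" stands for the per-record dictionaries.
#
#     description: Butler Data Repository Export
#     version: 0
#     data:
#       - type: dimension
#         element: detector
#         records: [...]
#       - type: dataset_type
#         name: calexp
#         dimensions: [instrument, visit, detector]
#         storage_class: ExposureF
#       - type: run
#         name: some_run
#       - type: dataset
#         dataset_type: calexp
#         run: some_run
#         records: [...]
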

class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy Time.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); innermost list is collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )


    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds: List[DataCoordinate] = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)
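
# A minimal usage sketch, assuming `registry`, `datastore`, `element`,
# `records`, `datasetType`, and `fileDatasets` are placeholders for objects
# obtained elsewhere; the file path and transfer mode are illustrative:
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream)
#         backend.saveDimensionData(element, *records)
#         backend.saveDatasets(datasetType, "some_run", *fileDatasets)
#         backend.finish()
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#         backend.register()
#         backend.load(datastore, directory="/path/to/export", transfer="copy")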