Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 14%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

138 statements  

1# This file is part of dax_apdb. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ["ApdbCassandraSchema"] 

25 

26import enum 

27import logging 

28from typing import List, Mapping, Optional, TYPE_CHECKING, Tuple, Union 

29 

30from .apdbSchema import ApdbSchema, ApdbTables, ColumnDef, IndexDef, IndexType, TableDef 

31 

if TYPE_CHECKING:
    # Imported for type annotations only; avoids a hard runtime dependency
    # on the cassandra driver when this module is merely imported.
    import cassandra.cluster


# Module-level logger following the standard per-module logging convention.
_LOG = logging.getLogger(__name__)

37 

38 

@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by Cassandra implementation."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return full table name, with an optional prefix prepended."""
        return f"{prefix}{self.value}"

49 

50 

class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object
    keyspace : `str`
        Name of the Cassandra keyspace that contains all APDB tables.
    schema_file : `str`
        Name of the YAML schema file.
    extra_schema_file : `str`, optional
        Name of the YAML schema file with extra column definitions.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If True then schema will have a separate table for each time partition.
    """

    _type_map = dict(DOUBLE="DOUBLE",
                     FLOAT="FLOAT",
                     DATETIME="TIMESTAMP",
                     BIGINT="BIGINT",
                     INTEGER="INT",
                     INT="INT",
                     TINYINT="TINYINT",
                     BLOB="BLOB",
                     CHAR="TEXT",
                     BOOL="BOOLEAN")
    """Map YAML column types to Cassandra"""

    # Tables partitioned on time (and space) by this implementation.
    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    # Tables partitioned on spatial pixel only.
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(self, session: cassandra.cluster.Session, keyspace: str, schema_file: str,
                 extra_schema_file: Optional[str] = None, prefix: str = "",
                 time_partition_tables: bool = False):

        super().__init__(schema_file, extra_schema_file)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables

        # Add columns and index for partitioning.
        self._ignore_tables: List[ApdbTables] = []
        for table, tableDef in self.tableSchemas.items():
            columns: List[str] = []
            add_columns = True
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning
                columns = ["apdb_part"]
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    # Time partition is encoded in the table name, so only
                    # the spatial partition column is needed here.
                    columns = ["apdb_part"]
                else:
                    columns = ["apdb_part", "apdb_time_part"]
            elif table is ApdbTables.SSObject:
                # For SSObject there is no natural partition key but we have
                # to partition it because there are too many of them. I'm
                # going to partition on its primary key (and drop separate
                # primary key index).
                columns = ["ssObjectId"]
                tableDef.indices = [
                    index for index in tableDef.indices if index.type is not IndexType.PRIMARY
                ]
                add_columns = False
            else:
                # TODO: Do not know yet how other tables can be partitioned
                self._ignore_tables.append(table)
                add_columns = False

            if add_columns:
                # Prepend the partitioning columns to the column list.
                columnDefs = [
                    ColumnDef(name=name, type="BIGINT", nullable=False) for name in columns
                ]
                tableDef.columns = columnDefs + tableDef.columns

            # Make a partitioning index over the chosen columns.
            if columns:
                index = IndexDef(name=f"Part_{tableDef.name}", type=IndexType.PARTITION, columns=columns)
                tableDef.indices.append(index)

        self._extra_tables = self._extraTableSchema()

    def _extraTableSchema(self) -> Mapping[ExtraTables, TableDef]:
        """Generate schema for extra tables."""
        return {
            ExtraTables.DiaSourceToPartition: TableDef(
                name=ExtraTables.DiaSourceToPartition.value,
                columns=[
                    ColumnDef(name="diaSourceId", type="BIGINT", nullable=False),
                    ColumnDef(name="apdb_part", type="BIGINT", nullable=False),
                    ColumnDef(name="apdb_time_part", type="INT", nullable=False),
                ],
                indices=[
                    IndexDef(
                        name=f"Part_{ExtraTables.DiaSourceToPartition.value}",
                        type=IndexType.PARTITION,
                        columns=["diaSourceId"],
                    ),
                ],
            ),
        }

    def _table_schema(self, table_name: Union[ApdbTables, ExtraTables]) -> TableDef:
        """Return table definition for either a regular or an extra table."""
        if isinstance(table_name, ApdbTables):
            return self.tableSchemas[table_name]
        return self._extra_tables[table_name]

    def tableName(self, table_name: Union[ApdbTables, ExtraTables]) -> str:
        """Return Cassandra table name for APDB table.
        """
        return table_name.table_name(self._prefix)

    def getColumnMap(self, table_name: Union[ApdbTables, ExtraTables]) -> Mapping[str, ColumnDef]:
        """Returns mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `ColumnDef` instances.
        """
        table_schema = self._table_schema(table_name)
        return {column.name: column for column in table_schema.columns}

    def partitionColumns(self, table_name: Union[ApdbTables, ExtraTables]) -> List[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning; empty list if the table
            has no partitioning index.
        """
        table_schema = self._table_schema(table_name)
        for index in table_schema.indices:
            if index.type is IndexType.PARTITION:
                # there could be just one partitioning index (possibly with
                # multiple columns)
                return index.columns
        return []

    def clusteringColumns(self, table_name: Union[ApdbTables, ExtraTables]) -> List[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering; empty list if the table
            has no primary-key index.
        """
        table_schema = self._table_schema(table_name)
        for index in table_schema.indices:
            if index.type is IndexType.PRIMARY:
                return index.columns
        return []

    def makeSchema(self, drop: bool = False, part_range: Optional[Tuple[int, int]] = None) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop tables before creating new ones.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions, end is not
            inclusive. Used to create per-partition DiaObject, DiaSource, and
            DiaForcedSource tables. If `None` then per-partition tables are
            not created.
        """
        for table in self.tableSchemas:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)

    def _makeTableSchema(
        self,
        table: Union[ApdbTables, ExtraTables],
        drop: bool = False,
        part_range: Optional[Tuple[int, int]] = None
    ) -> None:
        """Create Cassandra table(s) for a single APDB table.

        Parameters
        ----------
        table : `ApdbTables` or `ExtraTables`
            Table to create.
        drop : `bool`
            If True then drop existing table(s) before creating new ones.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions, end is not
            inclusive. If not `None` and ``table`` is time-partitioned then
            one Cassandra table is created per partition.
        """
        if table in self._ignore_tables:
            _LOG.debug("Skipping schema for table %s", table)
            return
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]

        if drop:
            queries = [
                f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list
            ]
            # Issue all DROPs concurrently, then wait for each to finish.
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        # When dropping we know tables do not exist, so skip IF NOT EXISTS;
        # the clause is loop-invariant so compute it once.
        if_not_exists = "" if drop else "IF NOT EXISTS"
        queries = []
        for table_name in table_list:
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: Union[ApdbTables, ExtraTables]) -> List[str]:
        """Return set of columns in a table

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".

        Raises
        ------
        ValueError
            Raised if the table configuration has no partitioning index.
        """
        table_schema = self._table_schema(table_name)

        # must have partition columns and clustering columns
        part_columns: List[str] = []
        clust_columns: List[str] = []
        for index in table_schema.indices:
            if index.type is IndexType.PARTITION:
                part_columns = index.columns
            elif index.type is IndexType.PRIMARY:
                clust_columns = index.columns
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # all columns
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.type]
            column_defs.append(f'"{column.name}" {ctype}')

        # primary key definition; a multi-column partition key must be
        # wrapped in parentheses to form a composite partition key.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs