Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 22%

170 statements  

« prev     ^ index     » next       coverage.py v7.3.3, created at 2023-12-20 17:15 +0000

1# This file is part of dax_apdb. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ["ApdbCassandraSchema"] 

25 

26import enum 

27import logging 

28from collections.abc import Mapping 

29from typing import TYPE_CHECKING 

30 

31import felis.types 

32from felis import simple 

33 

34from .apdbSchema import ApdbSchema, ApdbTables 

35 

36if TYPE_CHECKING: 36 ↛ 37line 36 didn't jump to line 37, because the condition on line 36 was never true

37 import cassandra.cluster 

38 

39 

40_LOG = logging.getLogger(__name__) 

41 

42 

43class _FelisUUID(felis.types.FelisType, felis_name="uuid", votable_name="uuid"): 

44 """Special internal type for UUID columns. Felis does not support UUID, 

45 but we need it here, to simplify logic it's easier to add a special class. 

46 """ 

47 

48 

49@enum.unique 

50class ExtraTables(enum.Enum): 

51 """Names of the extra tables used by Cassandra implementation.""" 

52 

53 DiaInsertId = "DiaInsertId" 

54 """Name of the table for insert ID records.""" 

55 

56 DiaObjectInsertId = "DiaObjectInsertId" 

57 """Name of the table for DIAObject insert ID records.""" 

58 

59 DiaSourceInsertId = "DiaSourceInsertId" 

60 """Name of the table for DIASource insert ID records.""" 

61 

62 DiaForcedSourceInsertId = "DiaFSourceInsertId" 

63 """Name of the table for DIAForcedSource insert ID records.""" 

64 

65 DiaSourceToPartition = "DiaSourceToPartition" 

66 "Maps diaSourceId to its partition values (pixel and time)." 

67 

68 def table_name(self, prefix: str = "") -> str: 

69 """Return full table name.""" 

70 return prefix + self.value 

71 

72 @classmethod 

73 def insert_id_tables(cls) -> Mapping[ExtraTables, ApdbTables]: 

74 """Return mapping of tables used for insert ID tracking to their 

75 corresponding regular tables. 

76 """ 

77 return { 

78 cls.DiaObjectInsertId: ApdbTables.DiaObject, 

79 cls.DiaSourceInsertId: ApdbTables.DiaSource, 

80 cls.DiaForcedSourceInsertId: ApdbTables.DiaForcedSource, 

81 } 

82 

83 

84class ApdbCassandraSchema(ApdbSchema): 

85 """Class for management of APDB schema. 

86 

87 Parameters 

88 ---------- 

89 session : `cassandra.cluster.Session` 

90 Cassandra session object 

91 schema_file : `str` 

92 Name of the YAML schema file. 

93 schema_name : `str`, optional 

94 Name of the schema in YAML files. 

95 prefix : `str`, optional 

96 Prefix to add to all schema elements. 

97 time_partition_tables : `bool` 

98 If True then schema will have a separate table for each time partition. 

99 """ 

100 

101 _type_map = { 

102 felis.types.Double: "DOUBLE", 

103 felis.types.Float: "FLOAT", 

104 felis.types.Timestamp: "TIMESTAMP", 

105 felis.types.Long: "BIGINT", 

106 felis.types.Int: "INT", 

107 felis.types.Short: "INT", 

108 felis.types.Byte: "TINYINT", 

109 felis.types.Binary: "BLOB", 

110 felis.types.Char: "TEXT", 

111 felis.types.String: "TEXT", 

112 felis.types.Unicode: "TEXT", 

113 felis.types.Text: "TEXT", 

114 felis.types.Boolean: "BOOLEAN", 

115 _FelisUUID: "UUID", 

116 } 

117 """Map YAML column types to Cassandra""" 

118 

119 _time_partitioned_tables = [ 

120 ApdbTables.DiaObject, 

121 ApdbTables.DiaSource, 

122 ApdbTables.DiaForcedSource, 

123 ] 

124 _spatially_partitioned_tables = [ApdbTables.DiaObjectLast] 

125 

126 def __init__( 

127 self, 

128 session: cassandra.cluster.Session, 

129 keyspace: str, 

130 schema_file: str, 

131 schema_name: str = "ApdbSchema", 

132 prefix: str = "", 

133 time_partition_tables: bool = False, 

134 use_insert_id: bool = False, 

135 ): 

136 super().__init__(schema_file, schema_name) 

137 

138 self._session = session 

139 self._keyspace = keyspace 

140 self._prefix = prefix 

141 self._time_partition_tables = time_partition_tables 

142 self._use_insert_id = use_insert_id 

143 self._has_insert_id: bool | None = None 

144 

145 self._apdb_tables = self._apdb_tables_schema(time_partition_tables) 

146 self._extra_tables = self._extra_tables_schema() 

147 

148 def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, simple.Table]: 

149 """Generate schema for regular APDB tables.""" 

150 apdb_tables: dict[ApdbTables, simple.Table] = {} 

151 

152 # add columns and index for partitioning. 

153 for table, apdb_table_def in self.tableSchemas.items(): 

154 part_columns = [] 

155 add_columns = [] 

156 primary_key = apdb_table_def.primary_key[:] 

157 if table in self._spatially_partitioned_tables: 

158 # DiaObjectLast does not need temporal partitioning 

159 part_columns = ["apdb_part"] 

160 add_columns = part_columns 

161 elif table in self._time_partitioned_tables: 

162 if time_partition_tables: 

163 part_columns = ["apdb_part"] 

164 else: 

165 part_columns = ["apdb_part", "apdb_time_part"] 

166 add_columns = part_columns 

167 elif table is ApdbTables.SSObject: 

168 # For SSObject there is no natural partition key but we have 

169 # to partition it because there are too many of them. I'm 

170 # going to partition on its primary key (and drop separate 

171 # primary key index). 

172 part_columns = ["ssObjectId"] 

173 primary_key = [] 

174 else: 

175 # TODO: Do not know what to do with the other tables 

176 continue 

177 

178 column_defs = [] 

179 if add_columns: 

180 column_defs = [ 

181 simple.Column(id=f"#{name}", name=name, datatype=felis.types.Long, nullable=False) 

182 for name in add_columns 

183 ] 

184 

185 annotations = dict(apdb_table_def.annotations) 

186 annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns] 

187 if part_columns: 

188 annotations["cassandra:partitioning_columns"] = part_columns 

189 

190 apdb_tables[table] = simple.Table( 

191 id=apdb_table_def.id, 

192 name=apdb_table_def.name, 

193 columns=column_defs + apdb_table_def.columns, 

194 primary_key=primary_key, 

195 indexes=[], 

196 constraints=[], 

197 annotations=annotations, 

198 ) 

199 

200 return apdb_tables 

201 

202 def _extra_tables_schema(self) -> Mapping[ExtraTables, simple.Table]: 

203 """Generate schema for extra tables.""" 

204 extra_tables: dict[ExtraTables, simple.Table] = {} 

205 

206 # This table maps DiaSource ID to its partitions in DiaSource table and 

207 # DiaSourceInsertId tables. 

208 extra_tables[ExtraTables.DiaSourceToPartition] = simple.Table( 

209 id="#" + ExtraTables.DiaSourceToPartition.value, 

210 name=ExtraTables.DiaSourceToPartition.table_name(self._prefix), 

211 columns=[ 

212 simple.Column( 

213 id="#diaSourceId", name="diaSourceId", datatype=felis.types.Long, nullable=False 

214 ), 

215 simple.Column(id="#apdb_part", name="apdb_part", datatype=felis.types.Long, nullable=False), 

216 simple.Column( 

217 id="#apdb_time_part", name="apdb_time_part", datatype=felis.types.Int, nullable=False 

218 ), 

219 simple.Column(id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=True), 

220 ], 

221 primary_key=[], 

222 indexes=[], 

223 constraints=[], 

224 annotations={"cassandra:partitioning_columns": ["diaSourceId"]}, 

225 ) 

226 

227 insert_id_column = simple.Column( 

228 id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=False 

229 ) 

230 

231 if not self._use_insert_id: 

232 return extra_tables 

233 

234 # Table containing insert IDs, this one is not partitioned, but 

235 # partition key must be defined. 

236 extra_tables[ExtraTables.DiaInsertId] = simple.Table( 

237 id="#" + ExtraTables.DiaInsertId.value, 

238 name=ExtraTables.DiaInsertId.table_name(self._prefix), 

239 columns=[ 

240 simple.Column(id="#partition", name="partition", datatype=felis.types.Int, nullable=False), 

241 insert_id_column, 

242 simple.Column( 

243 id="#insert_time", name="insert_time", datatype=felis.types.Timestamp, nullable=False 

244 ), 

245 ], 

246 primary_key=[insert_id_column], 

247 indexes=[], 

248 constraints=[], 

249 annotations={"cassandra:partitioning_columns": ["partition"]}, 

250 ) 

251 

252 for insert_id_table_enum, apdb_table_enum in ExtraTables.insert_id_tables().items(): 

253 apdb_table_def = self.tableSchemas[apdb_table_enum] 

254 

255 extra_tables[insert_id_table_enum] = simple.Table( 

256 id="#" + insert_id_table_enum.value, 

257 name=insert_id_table_enum.table_name(self._prefix), 

258 columns=[insert_id_column] + apdb_table_def.columns, 

259 primary_key=apdb_table_def.primary_key[:], 

260 indexes=[], 

261 constraints=[], 

262 annotations={ 

263 "cassandra:partitioning_columns": ["insert_id"], 

264 "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns], 

265 }, 

266 ) 

267 

268 return extra_tables 

269 

270 @property 

271 def has_insert_id(self) -> bool: 

272 """Whether insert ID tables are to be used (`bool`).""" 

273 if self._has_insert_id is None: 

274 self._has_insert_id = self._use_insert_id and self._check_insert_id() 

275 return self._has_insert_id 

276 

277 def _check_insert_id(self) -> bool: 

278 """Check whether database has tables for tracking insert IDs.""" 

279 table_name = ExtraTables.DiaInsertId.table_name(self._prefix) 

280 query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s" 

281 result = self._session.execute(query, (self._keyspace, table_name)) 

282 row = result.one() 

283 return bool(row[0]) 

284 

285 def tableName(self, table_name: ApdbTables | ExtraTables) -> str: 

286 """Return Cassandra table name for APDB table.""" 

287 return table_name.table_name(self._prefix) 

288 

289 def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, simple.Column]: 

290 """Return mapping of column names to Column definitions. 

291 

292 Parameters 

293 ---------- 

294 table_name : `ApdbTables` 

295 One of known APDB table names. 

296 

297 Returns 

298 ------- 

299 column_map : `dict` 

300 Mapping of column names to `ColumnDef` instances. 

301 """ 

302 table_schema = self._table_schema(table_name) 

303 cmap = {column.name: column for column in table_schema.columns} 

304 return cmap 

305 

306 def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]: 

307 """Return a list of columns names for a table as defined in APDB 

308 schema. 

309 

310 Parameters 

311 ---------- 

312 table_name : `ApdbTables` or `ExtraTables` 

313 Enum for a table in APDB schema. 

314 

315 Returns 

316 ------- 

317 columns : `list` of `str` 

318 Names of regular columns in the table. 

319 """ 

320 table_schema = self._table_schema(table_name) 

321 return table_schema.annotations["cassandra:apdb_column_names"] 

322 

323 def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]: 

324 """Return a list of columns used for table partitioning. 

325 

326 Parameters 

327 ---------- 

328 table_name : `ApdbTables` 

329 Table name in APDB schema 

330 

331 Returns 

332 ------- 

333 columns : `list` of `str` 

334 Names of columns used for partitioning. 

335 """ 

336 table_schema = self._table_schema(table_name) 

337 return table_schema.annotations.get("cassandra:partitioning_columns", []) 

338 

339 def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]: 

340 """Return a list of columns used for clustering. 

341 

342 Parameters 

343 ---------- 

344 table_name : `ApdbTables` 

345 Table name in APDB schema 

346 

347 Returns 

348 ------- 

349 columns : `list` of `str` 

350 Names of columns for used for clustering. 

351 """ 

352 table_schema = self._table_schema(table_name) 

353 return [column.name for column in table_schema.primary_key] 

354 

355 def makeSchema(self, drop: bool = False, part_range: tuple[int, int] | None = None) -> None: 

356 """Create or re-create all tables. 

357 

358 Parameters 

359 ---------- 

360 drop : `bool` 

361 If True then drop tables before creating new ones. 

362 part_range : `tuple` [ `int` ] or `None` 

363 Start and end partition number for time partitions, end is not 

364 inclusive. Used to create per-partition DiaObject, DiaSource, and 

365 DiaForcedSource tables. If `None` then per-partition tables are 

366 not created. 

367 """ 

368 # Try to create keyspace if it does not exist 

369 query = ( 

370 f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"' 

371 " WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}" 

372 ) 

373 self._session.execute(query) 

374 

375 for table in self._apdb_tables: 

376 self._makeTableSchema(table, drop, part_range) 

377 for extra_table in self._extra_tables: 

378 self._makeTableSchema(extra_table, drop, part_range) 

379 # Reset cached information. 

380 self._has_insert_id = None 

381 

382 def _makeTableSchema( 

383 self, 

384 table: ApdbTables | ExtraTables, 

385 drop: bool = False, 

386 part_range: tuple[int, int] | None = None, 

387 ) -> None: 

388 _LOG.debug("Making table %s", table) 

389 

390 fullTable = table.table_name(self._prefix) 

391 

392 table_list = [fullTable] 

393 if part_range is not None: 

394 if table in self._time_partitioned_tables: 

395 partitions = range(*part_range) 

396 table_list = [f"{fullTable}_{part}" for part in partitions] 

397 

398 if drop: 

399 queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list] 

400 futures = [self._session.execute_async(query, timeout=None) for query in queries] 

401 for future in futures: 

402 _LOG.debug("wait for query: %s", future.query) 

403 future.result() 

404 _LOG.debug("query finished: %s", future.query) 

405 

406 queries = [] 

407 for table_name in table_list: 

408 if_not_exists = "" if drop else "IF NOT EXISTS" 

409 columns = ", ".join(self._tableColumns(table)) 

410 query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})' 

411 _LOG.debug("query: %s", query) 

412 queries.append(query) 

413 futures = [self._session.execute_async(query, timeout=None) for query in queries] 

414 for future in futures: 

415 _LOG.debug("wait for query: %s", future.query) 

416 future.result() 

417 _LOG.debug("query finished: %s", future.query) 

418 

419 def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]: 

420 """Return set of columns in a table 

421 

422 Parameters 

423 ---------- 

424 table_name : `ApdbTables` 

425 Name of the table. 

426 

427 Returns 

428 ------- 

429 column_defs : `list` 

430 List of strings in the format "column_name type". 

431 """ 

432 table_schema = self._table_schema(table_name) 

433 

434 # must have partition columns and clustering columns 

435 part_columns = table_schema.annotations.get("cassandra:partitioning_columns", []) 

436 clust_columns = [column.name for column in table_schema.primary_key] 

437 _LOG.debug("part_columns: %s", part_columns) 

438 _LOG.debug("clust_columns: %s", clust_columns) 

439 if not part_columns: 

440 raise ValueError(f"Table {table_name} configuration is missing partition index") 

441 

442 # all columns 

443 column_defs = [] 

444 for column in table_schema.columns: 

445 ctype = self._type_map[column.datatype] 

446 column_defs.append(f'"{column.name}" {ctype}') 

447 

448 # primary key definition 

449 part_columns = [f'"{col}"' for col in part_columns] 

450 clust_columns = [f'"{col}"' for col in clust_columns] 

451 if len(part_columns) > 1: 

452 columns = ", ".join(part_columns) 

453 part_columns = [f"({columns})"] 

454 pkey = ", ".join(part_columns + clust_columns) 

455 _LOG.debug("pkey: %s", pkey) 

456 column_defs.append(f"PRIMARY KEY ({pkey})") 

457 

458 return column_defs 

459 

460 def _table_schema(self, table: ApdbTables | ExtraTables) -> simple.Table: 

461 """Return schema definition for a table.""" 

462 if isinstance(table, ApdbTables): 

463 table_schema = self._apdb_tables[table] 

464 else: 

465 table_schema = self._extra_tables[table] 

466 return table_schema