Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 20%

192 statements  


# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from collections.abc import Mapping
from typing import TYPE_CHECKING

import felis.types
from felis import simple

from .apdbSchema import ApdbSchema, ApdbTables

if TYPE_CHECKING:
    import cassandra.cluster


_LOG = logging.getLogger(__name__)


class _FelisUUID(felis.types.FelisType, felis_name="uuid", votable_name="uuid"):
    """Special internal type for UUID columns. Felis does not support UUID,
    but we need it here; adding a special class keeps the logic simple.
    """


class InconsistentSchemaError(RuntimeError):
    """Exception raised when schema state is inconsistent."""


@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by the Cassandra implementation."""

    DiaInsertId = "DiaInsertId"
    """Name of the table for insert ID records."""

    DiaObjectInsertId = "DiaObjectInsertId"
    """Name of the table for DIAObject insert ID records."""

    DiaSourceInsertId = "DiaSourceInsertId"
    """Name of the table for DIASource insert ID records."""

    DiaForcedSourceInsertId = "DiaFSourceInsertId"
    """Name of the table for DIAForcedSource insert ID records."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return full table name."""
        return prefix + self.value

    @classmethod
    def insert_id_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping of tables used for insert ID tracking to their
        corresponding regular tables.
        """
        return {
            cls.DiaObjectInsertId: ApdbTables.DiaObject,
            cls.DiaSourceInsertId: ApdbTables.DiaSource,
            cls.DiaForcedSourceInsertId: ApdbTables.DiaForcedSource,
        }


class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Keyspace name for all tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If `True` then schema will have a separate table for each time
        partition.
    use_insert_id : `bool`
        If `True` then schema includes extra tables used for insert ID
        tracking.
    """
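    # A minimal usage sketch (hypothetical contact point, keyspace, and
    # schema-file names; assumes a reachable Cassandra cluster):
    #
    #     from cassandra.cluster import Cluster
    #     session = Cluster(["127.0.0.1"]).connect()
    #     schema = ApdbCassandraSchema(session, keyspace="apdb",
    #                                  schema_file="apdb-schema.yaml")
    #     if schema.empty():
    #         schema.makeSchema()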

    _type_map = {
        felis.types.Double: "DOUBLE",
        felis.types.Float: "FLOAT",
        felis.types.Timestamp: "TIMESTAMP",
        felis.types.Long: "BIGINT",
        felis.types.Int: "INT",
        felis.types.Short: "INT",
        felis.types.Byte: "TINYINT",
        felis.types.Binary: "BLOB",
        felis.types.Char: "TEXT",
        felis.types.String: "TEXT",
        felis.types.Unicode: "TEXT",
        felis.types.Text: "TEXT",
        felis.types.Boolean: "BOOLEAN",
        _FelisUUID: "UUID",
    }
    """Map YAML column types to Cassandra."""

    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        use_insert_id: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._use_insert_id = use_insert_id
        self._has_insert_id: bool | None = None

        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, simple.Table]:
        """Generate schema for regular APDB tables."""
        apdb_tables: dict[ApdbTables, simple.Table] = {}

        # Add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns = []
            add_columns = []
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning.
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # For SSObject there is no natural partition key but we have
                # to partition it because there are too many of them. I'm
                # going to partition on its primary key (and drop the separate
                # primary key index).
                part_columns = ["ssObjectId"]
                primary_key = []
            elif table is ApdbTables.metadata:
                # Metadata is in one partition because we want to read all of
                # it in one query; add an extra column for partitioning.
                part_columns = ["meta_part"]
                add_columns = part_columns
            else:
                # TODO: Do not know what to do with the other tables.
                continue

            column_defs = []
            if add_columns:
                column_defs = [
                    simple.Column(id=f"#{name}", name=name, datatype=felis.types.Long, nullable=False)
                    for name in add_columns
                ]

            annotations = dict(apdb_table_def.annotations)
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns
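            # A sketch of the resulting annotations for, e.g., DiaSource with
            # time_partition_tables=False (column list elided):
            #     {"cassandra:apdb_column_names": ["diaSourceId", ...],
            #      "cassandra:partitioning_columns": ["apdb_part", "apdb_time_part"]}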

            apdb_tables[table] = simple.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables

    def _extra_tables_schema(self) -> Mapping[ExtraTables, simple.Table]:
        """Generate schema for extra tables."""
        extra_tables: dict[ExtraTables, simple.Table] = {}

        # This table maps DiaSource ID to its partitions in the DiaSource
        # and DiaSourceInsertId tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = simple.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                simple.Column(
                    id="#diaSourceId", name="diaSourceId", datatype=felis.types.Long, nullable=False
                ),
                simple.Column(id="#apdb_part", name="apdb_part", datatype=felis.types.Long, nullable=False),
                simple.Column(
                    id="#apdb_time_part", name="apdb_time_part", datatype=felis.types.Int, nullable=False
                ),
                simple.Column(id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=True),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        insert_id_column = simple.Column(
            id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=False
        )

        if not self._use_insert_id:
            return extra_tables

        # Table containing insert IDs; this one is not partitioned, but a
        # partition key must still be defined.
        extra_tables[ExtraTables.DiaInsertId] = simple.Table(
            id="#" + ExtraTables.DiaInsertId.value,
            name=ExtraTables.DiaInsertId.table_name(self._prefix),
            columns=[
                simple.Column(id="#partition", name="partition", datatype=felis.types.Int, nullable=False),
                insert_id_column,
                simple.Column(
                    id="#insert_time", name="insert_time", datatype=felis.types.Timestamp, nullable=False
                ),
            ],
            primary_key=[insert_id_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        for insert_id_table_enum, apdb_table_enum in ExtraTables.insert_id_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[insert_id_table_enum] = simple.Table(
                id="#" + insert_id_table_enum.value,
                name=insert_id_table_enum.table_name(self._prefix),
                columns=[insert_id_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["insert_id"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables

    @property
    def has_insert_id(self) -> bool:
        """Whether insert ID tables are to be used (`bool`)."""
        if self._has_insert_id is None:
            self._has_insert_id = self._use_insert_id and self._check_insert_id()
        return self._has_insert_id

    def _check_insert_id(self) -> bool:
        """Check whether the database has tables for tracking insert IDs."""
        table_name = ExtraTables.DiaInsertId.table_name(self._prefix)
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])

    def empty(self) -> bool:
        """Return `True` if the database schema is empty.

        Returns
        -------
        empty : `bool`
            `True` if none of the required APDB tables exist in the database,
            `False` if all required tables exist.

        Raises
        ------
        InconsistentSchemaError
            Raised when some of the required tables exist but not all.
        """
        query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
        result = self._session.execute(query, (self._keyspace,))
        table_names = set(row[0] for row in result.all())

        existing_tables = []
        missing_tables = []
        for table_enum in self._apdb_tables:
            table_name = table_enum.table_name(self._prefix)
            if table_name in table_names:
                existing_tables.append(table_name)
            else:
                missing_tables.append(table_name)

        if not missing_tables:
            return False
        elif not existing_tables:
            return True
        else:
            raise InconsistentSchemaError(
                f"Only some required APDB tables exist: {existing_tables}, missing tables: {missing_tables}"
            )

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for an APDB table."""
        return table_name.table_name(self._prefix)

    def keyspace(self) -> str:
        """Return Cassandra keyspace for APDB tables."""
        return self._keyspace

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, simple.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of the known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `Column` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of column names for a table as defined in the APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in the APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in the APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in the APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

    def makeSchema(self, drop: bool = False, part_range: tuple[int, int] | None = None) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If `True` then drop tables before creating new ones.
        part_range : `tuple` [ `int`, `int` ] or `None`
            Start and end partition number for time partitions; the end is
            not inclusive. Used to create per-partition DiaObject, DiaSource,
            and DiaForcedSource tables. If `None` then per-partition tables
            are not created.
        """
        # Try to create the keyspace if it does not exist.
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}"
        )
        self._session.execute(query)

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_insert_id = None

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]
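                # For example, part_range=(10, 12) expands an unprefixed
                # DiaObject table into "DiaObject_10" and "DiaObject_11".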

        if drop:
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of column definitions for a table.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        table_schema = self._table_schema(table_name)

        # Must have partition columns and clustering columns.
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # All columns.
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # Primary key definition.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")
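        # As an example, a table partitioned on (apdb_part, apdb_time_part)
        # with a single clustering column diaObjectId yields the clause:
        #     PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaObjectId")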

        return column_defs

    def _table_schema(self, table: ApdbTables | ExtraTables) -> simple.Table:
        """Return schema definition for a table."""
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema