Coverage for python/lsst/dax/apdb/cassandra/apdbCassandraSchema.py: 19%

194 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-04-24 09:59 +0000

1# This file is part of dax_apdb. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ["ApdbCassandraSchema"] 

25 

26import enum 

27import logging 

28from collections.abc import Mapping 

29from typing import TYPE_CHECKING 

30 

31import felis.datamodel 

32 

33from .. import schema_model 

34from ..apdbSchema import ApdbSchema, ApdbTables 

35 

36if TYPE_CHECKING: 

37 import cassandra.cluster 

38 

39 

40_LOG = logging.getLogger(__name__) 

41 

42 

class InconsistentSchemaError(RuntimeError):
    """Error raised when the database schema is found in an inconsistent
    (partially-created) state.
    """

45 

46 

@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by Cassandra implementation."""

    ApdbReplicaChunks = "ApdbReplicaChunks"
    """Name of the table for replica chunk records."""

    DiaObjectChunks = "DiaObjectChunks"
    """Name of the table for DIAObject chunk data."""

    DiaSourceChunks = "DiaSourceChunks"
    """Name of the table for DIASource chunk data."""

    DiaForcedSourceChunks = "DiaForcedSourceChunks"
    """Name of the table for DIAForcedSource chunk data."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return the table name with an optional prefix prepended."""
        return f"{prefix}{self.value}"

    @classmethod
    def replica_chunk_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping from each replica-chunk table to the regular APDB
        table whose rows it stores.
        """
        chunk_to_regular = {
            cls.DiaObjectChunks: ApdbTables.DiaObject,
            cls.DiaSourceChunks: ApdbTables.DiaSource,
            cls.DiaForcedSourceChunks: ApdbTables.DiaForcedSource,
        }
        return chunk_to_regular

80 

81 

class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object
    keyspace : `str`
        Keyspace name for all tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If `True` then schema will have a separate table for each time
        partition.
    enable_replica : `bool`, optional
        If `True` then use additional tables for replica chunks.
    """

    # Translation from felis/YAML column types to Cassandra CQL types.
    _type_map = {
        felis.datamodel.DataType.double: "DOUBLE",
        felis.datamodel.DataType.float: "FLOAT",
        felis.datamodel.DataType.timestamp: "TIMESTAMP",
        felis.datamodel.DataType.long: "BIGINT",
        felis.datamodel.DataType.int: "INT",
        felis.datamodel.DataType.short: "INT",
        felis.datamodel.DataType.byte: "TINYINT",
        felis.datamodel.DataType.binary: "BLOB",
        felis.datamodel.DataType.char: "TEXT",
        felis.datamodel.DataType.string: "TEXT",
        felis.datamodel.DataType.unicode: "TEXT",
        felis.datamodel.DataType.text: "TEXT",
        felis.datamodel.DataType.boolean: "BOOLEAN",
        schema_model.ExtraDataTypes.UUID: "UUID",
    }
    """Map YAML column types to Cassandra"""

    # Tables whose data is additionally partitioned by time.
    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    # Tables partitioned on the spatial pixel only.
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        enable_replica: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._enable_replica = enable_replica
        # Cached result for the has_replica_chunks property; None means
        # "not checked yet" (makeSchema resets it after DDL changes).
        self._has_replica_chunks: bool | None = None

        # Build in-memory table definitions once; all lookup methods below
        # read from these two mappings.
        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, schema_model.Table]:
        """Generate schema for regular APDB tables.

        Parameters
        ----------
        time_partition_tables : `bool`
            If `True` then time-partitioned tables are stored as one
            Cassandra table per time partition, so they only need the
            spatial partitioning column.

        Returns
        -------
        tables : `~collections.abc.Mapping` [`ApdbTables`, `schema_model.Table`]
            Table definitions with partitioning columns prepended and
            ``cassandra:*`` annotations filled in.
        """
        apdb_tables: dict[ApdbTables, schema_model.Table] = {}

        # add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns = []
            add_columns = []
            # Copy so that SSObject below can clear it without mutating the
            # shared definition.
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # For SSObject there is no natural partition key but we have
                # to partition it because there are too many of them. I'm
                # going to partition on its primary key (and drop separate
                # primary key index).
                part_columns = ["ssObjectId"]
                primary_key = []
            elif table is ApdbTables.metadata:
                # Metadata is in one partition because we want to read all of
                # it in one query, add an extra column for partition.
                part_columns = ["meta_part"]
                add_columns = part_columns
            else:
                # TODO: Do not know what to do with the other tables
                continue

            column_defs = []
            if add_columns:
                # Synthesized partitioning columns; ids carry a "#" prefix to
                # distinguish them from columns defined in the YAML schema.
                column_defs = [
                    schema_model.Column(
                        id=f"#{name}", name=name, datatype=felis.datamodel.DataType.long, nullable=False
                    )
                    for name in add_columns
                ]

            annotations = dict(apdb_table_def.annotations)
            # Remember the original (YAML-defined) column names so that
            # apdbColumnNames can exclude the synthesized columns.
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns

            apdb_tables[table] = schema_model.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables

    def _extra_tables_schema(self) -> Mapping[ExtraTables, schema_model.Table]:
        """Generate schema for extra tables.

        Returns
        -------
        tables : `~collections.abc.Mapping` [`ExtraTables`, `schema_model.Table`]
            Definitions for the Cassandra-only tables; replica chunk tables
            are included only when replication is enabled.
        """
        extra_tables: dict[ExtraTables, schema_model.Table] = {}

        # This table maps DiaSource ID to its partitions in DiaSource table and
        # DiaSourceChunks tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = schema_model.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                schema_model.Column(
                    id="#diaSourceId",
                    name="diaSourceId",
                    datatype=felis.datamodel.DataType.long,
                    nullable=False,
                ),
                schema_model.Column(
                    id="#apdb_part", name="apdb_part", datatype=felis.datamodel.DataType.long, nullable=False
                ),
                schema_model.Column(
                    id="#apdb_time_part",
                    name="apdb_time_part",
                    datatype=felis.datamodel.DataType.int,
                    nullable=False,
                ),
                # NOTE(review): nullable here unlike the replica tables —
                # presumably a source may not belong to any replica chunk;
                # confirm with the writer code.
                schema_model.Column(
                    id="#apdb_replica_chunk",
                    name="apdb_replica_chunk",
                    datatype=felis.datamodel.DataType.long,
                    nullable=True,
                ),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        # Column shared by ApdbReplicaChunks and all per-table chunk tables.
        replica_chunk_column = schema_model.Column(
            id="#apdb_replica_chunk",
            name="apdb_replica_chunk",
            datatype=felis.datamodel.DataType.long,
            nullable=False,
        )

        # Everything below only exists when replication is enabled.
        if not self._enable_replica:
            return extra_tables

        # Table containing insert IDs, this one is not partitioned, but
        # partition key must be defined.
        extra_tables[ExtraTables.ApdbReplicaChunks] = schema_model.Table(
            id="#" + ExtraTables.ApdbReplicaChunks.value,
            name=ExtraTables.ApdbReplicaChunks.table_name(self._prefix),
            columns=[
                schema_model.Column(
                    id="#partition", name="partition", datatype=felis.datamodel.DataType.int, nullable=False
                ),
                replica_chunk_column,
                schema_model.Column(
                    id="#last_update_time",
                    name="last_update_time",
                    datatype=felis.datamodel.DataType.timestamp,
                    nullable=False,
                ),
                schema_model.Column(
                    id="#unique_id",
                    name="unique_id",
                    datatype=schema_model.ExtraDataTypes.UUID,
                    nullable=False,
                ),
            ],
            primary_key=[replica_chunk_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        # One chunk table per replicated APDB table; each mirrors the regular
        # table's columns with apdb_replica_chunk prepended and is partitioned
        # on the chunk id.
        for chunk_table_enum, apdb_table_enum in ExtraTables.replica_chunk_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[chunk_table_enum] = schema_model.Table(
                id="#" + chunk_table_enum.value,
                name=chunk_table_enum.table_name(self._prefix),
                columns=[replica_chunk_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["apdb_replica_chunk"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables

    @property
    def has_replica_chunks(self) -> bool:
        """Whether insert ID tables are to be used (`bool`)."""
        if self._has_replica_chunks is None:
            # Query the database at most once; short-circuit avoids the
            # query entirely when replication is disabled.
            self._has_replica_chunks = self._enable_replica and self._check_replica_chunks()
        return self._has_replica_chunks

    def _check_replica_chunks(self) -> bool:
        """Check whether database has tables for tracking insert IDs.

        Returns
        -------
        exists : `bool`
            `True` if the ApdbReplicaChunks table exists in this keyspace.
        """
        table_name = ExtraTables.ApdbReplicaChunks.table_name(self._prefix)
        # Look up the table in Cassandra's own schema metadata.
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])

    def empty(self) -> bool:
        """Return True if database schema is empty.

        Returns
        -------
        empty : `bool`
            `True` if none of the required APDB tables exist in the database,
            `False` if all required tables exist.

        Raises
        ------
        InconsistentSchemaError
            Raised when some of the required tables exist but not all.
        """
        query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
        result = self._session.execute(query, (self._keyspace,))
        table_names = set(row[0] for row in result.all())

        existing_tables = []
        missing_tables = []
        for table_enum in self._apdb_tables:
            table_name = table_enum.table_name(self._prefix)
            if self._time_partition_tables and table_enum in self._time_partitioned_tables:
                # Check prefix for time-partitioned tables; they are stored
                # as "<name>_<partition>" rather than a single table.
                exists = any(table.startswith(f"{table_name}_") for table in table_names)
            else:
                exists = table_name in table_names
            if exists:
                existing_tables.append(table_name)
            else:
                missing_tables.append(table_name)

        if not missing_tables:
            return False
        elif not existing_tables:
            return True
        else:
            # Partially-created schema is an error, not "empty".
            raise InconsistentSchemaError(
                f"Only some required APDB tables exist: {existing_tables}, missing tables: {missing_tables}"
            )

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def keyspace(self) -> str:
        """Return Cassandra keyspace for APDB tables."""
        return self._keyspace

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, schema_model.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables`
            One of known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `ColumnDef` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns names for a table as defined in APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table, excluding the
            partitioning columns added by this class.
        """
        table_schema = self._table_schema(table_name)
        # Annotation is filled in by the *_schema methods in the constructor.
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables`
            Table name in APDB schema

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables`
            Table name in APDB schema

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

    def makeSchema(
        self,
        *,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
        replication_factor: int | None = None,
    ) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop tables before creating new ones. Note that
            only tables are dropped and not the whole keyspace.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions, end is not
            inclusive. Used to create per-partition DiaObject, DiaSource, and
            DiaForcedSource tables. If `None` then per-partition tables are
            not created.
        replication_factor : `int`, optional
            Replication factor used when creating new keyspace, if keyspace
            already exists its replication factor is not changed.
        """
        # Try to create keyspace if it does not exist
        if replication_factor is None:
            replication_factor = 1
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': "
            f"{replication_factor}"
            "}"
        )
        self._session.execute(query)

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_replica_chunks = None

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        """Create Cassandra table(s) for a single logical APDB table.

        Parameters
        ----------
        table : `ApdbTables` or `ExtraTables`
            Table to create.
        drop : `bool`
            If `True` then drop existing table(s) before creating.
        part_range : `tuple` [ `int` ] or `None`
            Start/end time partition numbers (end non-inclusive); when given,
            time-partitioned tables get one Cassandra table per partition.
        """
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                # One physical table per time partition: "<name>_<part>".
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]

        if drop:
            # Issue all DROPs concurrently and wait for every one to finish
            # before creating new tables.
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            # After an explicit drop, create unconditionally so a failed drop
            # surfaces as an error; otherwise tolerate pre-existing tables.
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        # CREATEs are also executed concurrently.
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return set of columns in a table.

        Parameters
        ----------
        table_name : `ApdbTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type", ending with the
            PRIMARY KEY clause.

        Raises
        ------
        ValueError
            Raised if the table definition has no partitioning columns.
        """
        table_schema = self._table_schema(table_name)

        # must have partition columns and clustering columns
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # all columns
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # primary key definition
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            # Composite partition key needs its own parentheses in CQL.
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs

    def _table_schema(self, table: ApdbTables | ExtraTables) -> schema_model.Table:
        """Return schema definition for a table.

        Parameters
        ----------
        table : `ApdbTables` or `ExtraTables`
            Table to look up in the definitions built by the constructor.

        Returns
        -------
        table_schema : `schema_model.Table`
            Table definition.
        """
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema