Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 19%
195 statements · coverage.py v7.4.4, created at 2024-04-13 09:59 +0000

# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from collections.abc import Mapping
from typing import TYPE_CHECKING

import felis.types
from felis import simple

from .apdbSchema import ApdbSchema, ApdbTables

if TYPE_CHECKING:
    import cassandra.cluster


_LOG = logging.getLogger(__name__)


class _FelisUUID(felis.types.FelisType, felis_name="uuid", votable_name="uuid"):
    """Special internal type for UUID columns. Felis does not support UUID,
    but we need it here; adding a special class keeps the logic simple.
    """


class InconsistentSchemaError(RuntimeError):
    """Exception raised when schema state is inconsistent."""


@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by the Cassandra implementation."""

    DiaInsertId = "DiaInsertId"
    """Name of the table for insert ID records."""

    DiaObjectInsertId = "DiaObjectInsertId"
    """Name of the table for DIAObject insert ID records."""

    DiaSourceInsertId = "DiaSourceInsertId"
    """Name of the table for DIASource insert ID records."""

    DiaForcedSourceInsertId = "DiaFSourceInsertId"
    """Name of the table for DIAForcedSource insert ID records."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return full table name."""
        return prefix + self.value

    @classmethod
    def insert_id_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping of tables used for insert ID tracking to their
        corresponding regular tables.
        """
        return {
            cls.DiaObjectInsertId: ApdbTables.DiaObject,
            cls.DiaSourceInsertId: ApdbTables.DiaSource,
            cls.DiaForcedSourceInsertId: ApdbTables.DiaForcedSource,
        }
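
# Illustrative example (not part of the original module): table names compose
# as prefix + enum value, so with a hypothetical prefix "test_":
#
#     ExtraTables.DiaSourceToPartition.table_name("test_")
#     # -> "test_DiaSourceToPartition"
#     ExtraTables.DiaForcedSourceInsertId.table_name()
#     # -> "DiaFSourceInsertId" (note the abbreviated stored value)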


class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Keyspace name for all APDB tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If `True` then schema will have a separate table for each time
        partition.
    use_insert_id : `bool`
        If `True` then schema will include extra tables for tracking
        insert IDs.
    """

    _type_map = {
        felis.types.Double: "DOUBLE",
        felis.types.Float: "FLOAT",
        felis.types.Timestamp: "TIMESTAMP",
        felis.types.Long: "BIGINT",
        felis.types.Int: "INT",
        felis.types.Short: "INT",
        felis.types.Byte: "TINYINT",
        felis.types.Binary: "BLOB",
        felis.types.Char: "TEXT",
        felis.types.String: "TEXT",
        felis.types.Unicode: "TEXT",
        felis.types.Text: "TEXT",
        felis.types.Boolean: "BOOLEAN",
        _FelisUUID: "UUID",
    }
    """Map YAML column types to Cassandra."""

    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        use_insert_id: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._use_insert_id = use_insert_id
        self._has_insert_id: bool | None = None

        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, simple.Table]:
        """Generate schema for regular APDB tables."""
        apdb_tables: dict[ApdbTables, simple.Table] = {}

        # Add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns = []
            add_columns = []
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning.
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # For SSObject there is no natural partition key but we have
                # to partition it because there are too many of them. I'm
                # going to partition on its primary key (and drop the separate
                # primary key index).
                part_columns = ["ssObjectId"]
                primary_key = []
            elif table is ApdbTables.metadata:
                # Metadata is in one partition because we want to read all of
                # it in one query; add an extra column for the partition.
                part_columns = ["meta_part"]
                add_columns = part_columns
            else:
                # TODO: Do not know what to do with the other tables.
                continue

            column_defs = []
            if add_columns:
                column_defs = [
                    simple.Column(id=f"#{name}", name=name, datatype=felis.types.Long, nullable=False)
                    for name in add_columns
                ]

            annotations = dict(apdb_table_def.annotations)
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns

            apdb_tables[table] = simple.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables
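
    # Illustrative note (not part of the original module): for the spatially
    # partitioned DiaObjectLast table this adds an extra "apdb_part" column
    # (felis Long, mapped to Cassandra BIGINT) and sets the annotation
    #     {"cassandra:partitioning_columns": ["apdb_part"]}
    # alongside "cassandra:apdb_column_names".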

    def _extra_tables_schema(self) -> Mapping[ExtraTables, simple.Table]:
        """Generate schema for extra tables."""
        extra_tables: dict[ExtraTables, simple.Table] = {}

        # This table maps a DiaSource ID to its partitions in the DiaSource
        # and DiaSourceInsertId tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = simple.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                simple.Column(
                    id="#diaSourceId", name="diaSourceId", datatype=felis.types.Long, nullable=False
                ),
                simple.Column(id="#apdb_part", name="apdb_part", datatype=felis.types.Long, nullable=False),
                simple.Column(
                    id="#apdb_time_part", name="apdb_time_part", datatype=felis.types.Int, nullable=False
                ),
                simple.Column(id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=True),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        insert_id_column = simple.Column(
            id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=False
        )

        if not self._use_insert_id:
            return extra_tables

        # Table containing insert IDs. This one is not partitioned, but a
        # partition key must still be defined.
        extra_tables[ExtraTables.DiaInsertId] = simple.Table(
            id="#" + ExtraTables.DiaInsertId.value,
            name=ExtraTables.DiaInsertId.table_name(self._prefix),
            columns=[
                simple.Column(id="#partition", name="partition", datatype=felis.types.Int, nullable=False),
                insert_id_column,
                simple.Column(
                    id="#insert_time", name="insert_time", datatype=felis.types.Timestamp, nullable=False
                ),
            ],
            primary_key=[insert_id_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        for insert_id_table_enum, apdb_table_enum in ExtraTables.insert_id_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[insert_id_table_enum] = simple.Table(
                id="#" + insert_id_table_enum.value,
                name=insert_id_table_enum.table_name(self._prefix),
                columns=[insert_id_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["insert_id"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables

    @property
    def has_insert_id(self) -> bool:
        """Whether insert ID tables are to be used (`bool`)."""
        if self._has_insert_id is None:
            self._has_insert_id = self._use_insert_id and self._check_insert_id()
        return self._has_insert_id

    def _check_insert_id(self) -> bool:
        """Check whether database has tables for tracking insert IDs."""
        table_name = ExtraTables.DiaInsertId.table_name(self._prefix)
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])
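
    # For reference (illustrative values, not part of the original module):
    # with keyspace "apdb" and an empty prefix the query above is executed as
    #     SELECT count(*) FROM system_schema.tables
    #     WHERE keyspace_name = 'apdb' and table_name = 'DiaInsertId'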

    def empty(self) -> bool:
        """Return True if database schema is empty.

        Returns
        -------
        empty : `bool`
            `True` if none of the required APDB tables exist in the database,
            `False` if all required tables exist.

        Raises
        ------
        InconsistentSchemaError
            Raised when some of the required tables exist but not all.
        """
        query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
        result = self._session.execute(query, (self._keyspace,))
        table_names = set(row[0] for row in result.all())

        existing_tables = []
        missing_tables = []
        for table_enum in self._apdb_tables:
            table_name = table_enum.table_name(self._prefix)
            if self._time_partition_tables and table_enum in self._time_partitioned_tables:
                # Time-partitioned tables are suffixed with the partition
                # number; check for any table with a matching prefix.
                exists = any(table.startswith(f"{table_name}_") for table in table_names)
            else:
                exists = table_name in table_names
            if exists:
                existing_tables.append(table_name)
            else:
                missing_tables.append(table_name)

        if not missing_tables:
            return False
        elif not existing_tables:
            return True
        else:
            raise InconsistentSchemaError(
                f"Only some required APDB tables exist: {existing_tables}, missing tables: {missing_tables}"
            )
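
    # Illustrative note (not part of the original module): with
    # time_partition_tables enabled, per-partition tables get names such as
    # "DiaObject_620" (see _makeTableSchema below), which is why empty()
    # matches on the "DiaObject_" prefix instead of an exact name.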

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def keyspace(self) -> str:
        """Return Cassandra keyspace for APDB tables."""
        return self._keyspace

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, simple.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of the known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `simple.Column` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of column names for a table as defined in the APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

    def makeSchema(
        self,
        *,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
        replication_factor: int | None = None,
    ) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If `True` then drop the tables before creating new ones. Note
            that only the tables are dropped, not the whole keyspace.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions; the end is
            not inclusive. Used to create per-partition DiaObject, DiaSource,
            and DiaForcedSource tables. If `None` then per-partition tables
            are not created.
        replication_factor : `int`, optional
            Replication factor used when creating a new keyspace; if the
            keyspace already exists its replication factor is not changed.
        """
        # Try to create the keyspace if it does not exist.
        if replication_factor is None:
            replication_factor = 1
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': "
            f"{replication_factor}"
            "}"
        )
        self._session.execute(query)
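
        # Illustrative (not part of the original module): with keyspace "apdb"
        # and the default replication_factor=1 the statement above reads
        #     CREATE KEYSPACE IF NOT EXISTS "apdb" WITH replication =
        #     {'class': 'SimpleStrategy', 'replication_factor': 1}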

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_insert_id = None

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]
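
        # Illustrative (not part of the original module): for the DiaObject
        # table with an empty prefix and part_range=(10, 12) this yields
        #     table_list == ["DiaObject_10", "DiaObject_11"]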

        if drop:
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return the list of columns in a table.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        table_schema = self._table_schema(table_name)

        # Must have partition columns and clustering columns.
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # All columns.
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # Primary key definition.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")
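
        # Illustrative (not part of the original module): for a table
        # partitioned on ("apdb_part", "apdb_time_part") and clustered on
        # "diaObjectId" the appended definition reads
        #     PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaObjectId")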

        return column_defs

    def _table_schema(self, table: ApdbTables | ExtraTables) -> simple.Table:
        """Return schema definition for a table."""
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema
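
# Minimal usage sketch (illustrative, not part of this module; the argument
# values are hypothetical):
#
#     from cassandra.cluster import Cluster
#
#     session = Cluster(["127.0.0.1"]).connect()
#     schema = ApdbCassandraSchema(session, keyspace="apdb", schema_file="apdb-schema.yaml")
#     if schema.empty():
#         schema.makeSchema(replication_factor=1)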