Coverage for python/lsst/dax/apdb/cassandra/apdbCassandraSchema.py: 18%
207 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-16 03:20 -0700
1# This file is part of dax_apdb.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ["ApdbCassandraSchema"]
26import enum
27import logging
28from collections.abc import Mapping
29from typing import TYPE_CHECKING
31import felis.datamodel
33from .. import schema_model
34from ..apdbSchema import ApdbSchema, ApdbTables
36if TYPE_CHECKING:
37 import cassandra.cluster
40_LOG = logging.getLogger(__name__)
class InconsistentSchemaError(RuntimeError):
    """Exception raised when schema state is inconsistent.

    Raised by `ApdbCassandraSchema.empty` when only a subset of the required
    APDB tables exists in the database.
    """
@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by Cassandra implementation."""

    ApdbReplicaChunks = "ApdbReplicaChunks"
    """Name of the table for replica chunk records."""

    DiaObjectChunks = "DiaObjectChunks"
    """Name of the table for DIAObject chunk data."""

    DiaSourceChunks = "DiaSourceChunks"
    """Name of the table for DIASource chunk data."""

    DiaForcedSourceChunks = "DiaForcedSourceChunks"
    """Name of the table for DIAForcedSource chunk data."""

    DiaSourceToPartition = "DiaSourceToPartition"
    "Maps diaSourceId to its partition values (pixel and time)."

    def table_name(self, prefix: str = "") -> str:
        """Return the full table name, with an optional prefix prepended."""
        return f"{prefix}{self.value}"

    @classmethod
    def replica_chunk_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping of tables used for replica chunks storage to their
        corresponding regular tables.
        """
        chunk_to_regular: dict[ExtraTables, ApdbTables] = {
            cls.DiaObjectChunks: ApdbTables.DiaObject,
            cls.DiaSourceChunks: ApdbTables.DiaSource,
            cls.DiaForcedSourceChunks: ApdbTables.DiaForcedSource,
        }
        return chunk_to_regular
class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object
    keyspace : `str`
        Keyspace name for all tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If `True` then schema will have a separate table for each time
        partition.
    enable_replica : `bool`, optional
        If `True` then use additional tables for replica chunks.
    """

    _type_map = {
        felis.datamodel.DataType.double: "DOUBLE",
        felis.datamodel.DataType.float: "FLOAT",
        felis.datamodel.DataType.timestamp: "TIMESTAMP",
        felis.datamodel.DataType.long: "BIGINT",
        felis.datamodel.DataType.int: "INT",
        felis.datamodel.DataType.short: "SMALLINT",
        felis.datamodel.DataType.byte: "TINYINT",
        felis.datamodel.DataType.binary: "BLOB",
        felis.datamodel.DataType.char: "TEXT",
        felis.datamodel.DataType.string: "TEXT",
        felis.datamodel.DataType.unicode: "TEXT",
        felis.datamodel.DataType.text: "TEXT",
        felis.datamodel.DataType.boolean: "BOOLEAN",
        schema_model.ExtraDataTypes.UUID: "UUID",
    }
    """Map YAML column types to Cassandra"""

    # APDB tables that get temporal partitioning (in addition to spatial).
    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    # APDB tables that are partitioned spatially only.
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        enable_replica: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._enable_replica = enable_replica
        # Cached result of _check_replica_chunks(); None means "not checked
        # yet" (reset by makeSchema()).
        self._has_replica_chunks: bool | None = None

        # Build in-memory table definitions once; used by all lookup methods.
        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, schema_model.Table]:
        """Generate schema for regular APDB tables.

        Parameters
        ----------
        time_partition_tables : `bool`
            If `True` then time-partitioned tables use a separate table per
            time partition, so only ``apdb_part`` is a partitioning column;
            otherwise ``apdb_time_part`` is added as a partitioning column.

        Returns
        -------
        tables : `~collections.abc.Mapping` [`ApdbTables`, `schema_model.Table`]
            Table definitions extended with partitioning columns and
            Cassandra-specific annotations.
        """
        apdb_tables: dict[ApdbTables, schema_model.Table] = {}

        # add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns: list[str] = []
            add_columns: list[str] = []
            # Copy so that clearing it (SSObject case) does not mutate the
            # original table definition.
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # For SSObject there is no natural partition key but we have
                # to partition it because there are too many of them. I'm
                # going to partition on its primary key (and drop separate
                # primary key index).
                part_columns = ["ssObjectId"]
                primary_key = []
            elif table is ApdbTables.metadata:
                # Metadata is in one partition because we want to read all of
                # it in one query, add an extra column for partition.
                part_columns = ["meta_part"]
                add_columns = part_columns
            else:
                # TODO: Do not know what to do with the other tables
                continue

            # Synthesized partitioning columns; all are BIGINT-typed.
            column_defs: list[schema_model.Column] = []
            if add_columns:
                column_defs = [
                    schema_model.Column(
                        id=f"#{name}", name=name, datatype=felis.datamodel.DataType.long, nullable=False
                    )
                    for name in add_columns
                ]

            annotations = dict(apdb_table_def.annotations)
            # Record the original (non-synthesized) column names so that
            # apdbColumnNames() can return them later.
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns

            apdb_tables[table] = schema_model.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables

    def _extra_tables_schema(self) -> Mapping[ExtraTables, schema_model.Table]:
        """Generate schema for extra tables.

        Returns
        -------
        tables : `~collections.abc.Mapping` [`ExtraTables`, `schema_model.Table`]
            Definitions of the extra tables. Replica chunk tables are only
            included when replication is enabled.
        """
        extra_tables: dict[ExtraTables, schema_model.Table] = {}

        # This table maps DiaSource ID to its partitions in DiaSource table and
        # DiaSourceChunks tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = schema_model.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                schema_model.Column(
                    id="#diaSourceId",
                    name="diaSourceId",
                    datatype=felis.datamodel.DataType.long,
                    nullable=False,
                ),
                schema_model.Column(
                    id="#apdb_part", name="apdb_part", datatype=felis.datamodel.DataType.long, nullable=False
                ),
                schema_model.Column(
                    id="#apdb_time_part",
                    name="apdb_time_part",
                    datatype=felis.datamodel.DataType.int,
                    nullable=False,
                ),
                # Nullable: a source may have no associated replica chunk.
                schema_model.Column(
                    id="#apdb_replica_chunk",
                    name="apdb_replica_chunk",
                    datatype=felis.datamodel.DataType.long,
                    nullable=True,
                ),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        # Shared column definition reused by all replica chunk tables below.
        replica_chunk_column = schema_model.Column(
            id="#apdb_replica_chunk",
            name="apdb_replica_chunk",
            datatype=felis.datamodel.DataType.long,
            nullable=False,
        )

        if not self._enable_replica:
            return extra_tables

        # Table containing insert IDs, this one is not partitioned, but
        # partition key must be defined.
        extra_tables[ExtraTables.ApdbReplicaChunks] = schema_model.Table(
            id="#" + ExtraTables.ApdbReplicaChunks.value,
            name=ExtraTables.ApdbReplicaChunks.table_name(self._prefix),
            columns=[
                schema_model.Column(
                    id="#partition", name="partition", datatype=felis.datamodel.DataType.int, nullable=False
                ),
                replica_chunk_column,
                schema_model.Column(
                    id="#last_update_time",
                    name="last_update_time",
                    datatype=felis.datamodel.DataType.timestamp,
                    nullable=False,
                ),
                schema_model.Column(
                    id="#unique_id",
                    name="unique_id",
                    datatype=schema_model.ExtraDataTypes.UUID,
                    nullable=False,
                ),
            ],
            primary_key=[replica_chunk_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        # One chunk table per replicated APDB table; same columns as the
        # regular table plus the replica chunk column, partitioned on chunk.
        for chunk_table_enum, apdb_table_enum in ExtraTables.replica_chunk_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[chunk_table_enum] = schema_model.Table(
                id="#" + chunk_table_enum.value,
                name=chunk_table_enum.table_name(self._prefix),
                columns=[replica_chunk_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["apdb_replica_chunk"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables

    @property
    def has_replica_chunks(self) -> bool:
        """Whether insert ID tables are to be used (`bool`).

        The database check is performed at most once and cached; the cache is
        reset by `makeSchema`.
        """
        if self._has_replica_chunks is None:
            # Short-circuits: the database is only queried when replication
            # is enabled in configuration.
            self._has_replica_chunks = self._enable_replica and self._check_replica_chunks()
        return self._has_replica_chunks

    def _check_replica_chunks(self) -> bool:
        """Check whether database has tables for tracking insert IDs."""
        table_name = ExtraTables.ApdbReplicaChunks.table_name(self._prefix)
        # Look up the table in Cassandra's system schema.
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])

    def empty(self) -> bool:
        """Return True if database schema is empty.

        Returns
        -------
        empty : `bool`
            `True` if none of the required APDB tables exist in the database,
            `False` if all required tables exist.

        Raises
        ------
        InconsistentSchemaError
            Raised when some of the required tables exist but not all.
        """
        query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
        result = self._session.execute(query, (self._keyspace,))
        table_names = set(row[0] for row in result.all())

        existing_tables = []
        missing_tables = []
        for table_enum in self._apdb_tables:
            table_name = table_enum.table_name(self._prefix)
            if self._time_partition_tables and table_enum in self._time_partitioned_tables:
                # Check prefix for time-partitioned tables.
                # Per-partition tables are named "<table_name>_<partition>".
                exists = any(table.startswith(f"{table_name}_") for table in table_names)
            else:
                exists = table_name in table_names
            if exists:
                existing_tables.append(table_name)
            else:
                missing_tables.append(table_name)

        if not missing_tables:
            return False
        elif not existing_tables:
            return True
        else:
            raise InconsistentSchemaError(
                f"Only some required APDB tables exist: {existing_tables}, missing tables: {missing_tables}"
            )

    def existing_tables(self, *args: ApdbTables) -> dict[ApdbTables, list[str]]:
        """Return the list of existing table names for given table.

        Parameters
        ----------
        *args : `ApdbTables`
            Tables for which to return their existing table names.

        Returns
        -------
        tables : `dict` [`ApdbTables`, `list`[`str`]]
            Mapping of the APDB table to the list of the existing table names.
            More than one name can be present in the list if configuration
            specifies per-partition tables.
        """
        if self._time_partition_tables and not set(args).isdisjoint(self._time_partitioned_tables):
            # Some of the tables should have per-partition tables.
            query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
            result = self._session.execute(query, (self._keyspace,))
            table_names = set(row[0] for row in result.all())

            tables: dict[ApdbTables, list[str]] = {}
            for table_enum in args:
                base_name = table_enum.table_name(self._prefix)
                if table_enum in self._time_partitioned_tables:
                    # Collect every per-partition table "<base_name>_<part>".
                    tables[table_enum] = [table for table in table_names if table.startswith(f"{base_name}_")]
                else:
                    tables[table_enum] = [base_name]
            return tables
        else:
            # Do not check that they exist, we know that they should.
            return {table_enum: [table_enum.table_name(self._prefix)] for table_enum in args}

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def keyspace(self) -> str:
        """Return Cassandra keyspace for APDB tables."""
        return self._keyspace

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, schema_model.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables`
            One of known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `ColumnDef` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns names for a table as defined in APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table.
        """
        table_schema = self._table_schema(table_name)
        # This annotation is filled in by the *_tables_schema methods; it
        # excludes the synthesized partitioning columns.
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables`
            Table name in APDB schema

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables`
            Table name in APDB schema

        Returns
        -------
        columns : `list` of `str`
            Names of columns for used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

    def makeSchema(
        self,
        *,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
        replication_factor: int | None = None,
    ) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop tables before creating new ones. Note that
            only tables are dropped and not the whole keyspace.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions, end is not
            inclusive. Used to create per-partition DiaObject, DiaSource, and
            DiaForcedSource tables. If `None` then per-partition tables are
            not created.
        replication_factor : `int`, optional
            Replication factor used when creating new keyspace, if keyspace
            already exists its replication factor is not changed.
        """
        # Try to create keyspace if it does not exist
        if replication_factor is None:
            replication_factor = 1
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': "
            f"{replication_factor}"
            "}"
        )
        self._session.execute(query)

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_replica_chunks = None

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        """Create (and optionally drop first) the Cassandra table(s) for a
        single APDB/extra table definition.

        Parameters
        ----------
        table : `ApdbTables` or `ExtraTables`
            Table to create.
        drop : `bool`, optional
            If `True` then drop the table(s) before creating.
        part_range : `tuple` [`int`, `int`] or `None`, optional
            Start/end (exclusive) time partition numbers; when given and
            ``table`` is time-partitioned, one table per partition is created.
        """
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                # One table per time partition, named "<fullTable>_<part>".
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]

        if drop:
            # Drops are issued asynchronously and then awaited.
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            # After an explicit drop the table must not exist, so let CREATE
            # fail loudly in that case; otherwise tolerate existing tables.
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return set of columns in a table

        Parameters
        ----------
        table_name : `ApdbTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".

        Raises
        ------
        ValueError
            Raised if the table definition has no partitioning columns.
        """
        table_schema = self._table_schema(table_name)

        # must have partition columns and clustering columns
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # all columns
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # primary key definition
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            # Composite partition key needs its own parentheses.
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs

    def _table_schema(self, table: ApdbTables | ExtraTables) -> schema_model.Table:
        """Return schema definition for a table."""
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema