Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 19%
195 statements
# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from collections.abc import Mapping
from typing import TYPE_CHECKING

import felis.types
from felis import simple

from .apdbSchema import ApdbSchema, ApdbTables

if TYPE_CHECKING:
    import cassandra.cluster

_LOG = logging.getLogger(__name__)


class _FelisUUID(felis.types.FelisType, felis_name="uuid", votable_name="uuid"):
    """Special internal type for UUID columns. Felis does not support UUID,
    but we need it here; adding a special class keeps the logic simple.
    """


class InconsistentSchemaError(RuntimeError):
    """Exception raised when schema state is inconsistent."""


@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by Cassandra implementation."""

    DiaInsertId = "DiaInsertId"
    """Name of the table for insert ID records."""

    DiaObjectInsertId = "DiaObjectInsertId"
    """Name of the table for DIAObject insert ID records."""

    DiaSourceInsertId = "DiaSourceInsertId"
    """Name of the table for DIASource insert ID records."""

    DiaForcedSourceInsertId = "DiaFSourceInsertId"
    """Name of the table for DIAForcedSource insert ID records."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return full table name."""
        return prefix + self.value

    @classmethod
    def insert_id_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping of tables used for insert ID tracking to their
        corresponding regular tables.
        """
        return {
            cls.DiaObjectInsertId: ApdbTables.DiaObject,
            cls.DiaSourceInsertId: ApdbTables.DiaSource,
            cls.DiaForcedSourceInsertId: ApdbTables.DiaForcedSource,
        }
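

# Illustrative sketch, not part of the original module: how the insert-ID
# mapping and prefixed table names combine. The "apdb_" prefix below is an
# assumed example value.
#
#     for extra, apdb in ExtraTables.insert_id_tables().items():
#         print(extra.table_name("apdb_"), "->", apdb.name)
#     # apdb_DiaObjectInsertId -> DiaObject
#     # apdb_DiaSourceInsertId -> DiaSource
#     # apdb_DiaFSourceInsertId -> DiaForcedSource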


class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Name of the Cassandra keyspace for APDB tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If True then schema will have a separate table for each time
        partition.
    use_insert_id : `bool`
        If True then schema includes extra tables for tracking insert IDs.
    """

    _type_map = {
        felis.types.Double: "DOUBLE",
        felis.types.Float: "FLOAT",
        felis.types.Timestamp: "TIMESTAMP",
        felis.types.Long: "BIGINT",
        felis.types.Int: "INT",
        felis.types.Short: "INT",
        felis.types.Byte: "TINYINT",
        felis.types.Binary: "BLOB",
        felis.types.Char: "TEXT",
        felis.types.String: "TEXT",
        felis.types.Unicode: "TEXT",
        felis.types.Text: "TEXT",
        felis.types.Boolean: "BOOLEAN",
        _FelisUUID: "UUID",
    }
    """Map YAML column types to Cassandra."""

    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]

    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]
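
    # Illustrative note (assumption): _type_map drives the CQL type emitted
    # by _tableColumns() below; e.g. a felis.types.Double column named "ra"
    # (a hypothetical column name) is rendered as '"ra" DOUBLE'.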

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        use_insert_id: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._use_insert_id = use_insert_id
        self._has_insert_id: bool | None = None

        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, simple.Table]:
        """Generate schema for regular APDB tables."""
        apdb_tables: dict[ApdbTables, simple.Table] = {}

        # Add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns = []
            add_columns = []
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning.
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # SSObject has no natural partition key, but it has to be
                # partitioned because there are too many records for a single
                # partition. Partition on its primary key (and drop the
                # separate primary key index).
                part_columns = ["ssObjectId"]
                primary_key = []
            elif table is ApdbTables.metadata:
                # Metadata lives in a single partition because we want to
                # read all of it in one query; add an extra column for the
                # partition.
                part_columns = ["meta_part"]
                add_columns = part_columns
            else:
                # TODO: Do not know what to do with the other tables.
                continue

            column_defs = []
            if add_columns:
                column_defs = [
                    simple.Column(id=f"#{name}", name=name, datatype=felis.types.Long, nullable=False)
                    for name in add_columns
                ]

            annotations = dict(apdb_table_def.annotations)
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns

            apdb_tables[table] = simple.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables
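
    # Illustrative sketch (assumption, values derived from the logic above):
    # with time_partition_tables=False, a time-partitioned table such as
    # DiaObject ends up with
    #
    #     annotations["cassandra:partitioning_columns"] == ["apdb_part", "apdb_time_part"]
    #
    # and its column list gains the two added BIGINT partitioning columns in
    # front of the regular APDB columns.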

    def _extra_tables_schema(self) -> Mapping[ExtraTables, simple.Table]:
        """Generate schema for extra tables."""
        extra_tables: dict[ExtraTables, simple.Table] = {}

        # This table maps a DiaSource ID to its partitions in the DiaSource
        # and DiaSourceInsertId tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = simple.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                simple.Column(
                    id="#diaSourceId", name="diaSourceId", datatype=felis.types.Long, nullable=False
                ),
                simple.Column(id="#apdb_part", name="apdb_part", datatype=felis.types.Long, nullable=False),
                simple.Column(
                    id="#apdb_time_part", name="apdb_time_part", datatype=felis.types.Int, nullable=False
                ),
                simple.Column(id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=True),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        insert_id_column = simple.Column(
            id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=False
        )

        if not self._use_insert_id:
            return extra_tables

        # Table containing insert IDs; this one is not partitioned, but a
        # partition key must still be defined.
        extra_tables[ExtraTables.DiaInsertId] = simple.Table(
            id="#" + ExtraTables.DiaInsertId.value,
            name=ExtraTables.DiaInsertId.table_name(self._prefix),
            columns=[
                simple.Column(id="#partition", name="partition", datatype=felis.types.Int, nullable=False),
                insert_id_column,
                simple.Column(
                    id="#insert_time", name="insert_time", datatype=felis.types.Timestamp, nullable=False
                ),
            ],
            primary_key=[insert_id_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        for insert_id_table_enum, apdb_table_enum in ExtraTables.insert_id_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[insert_id_table_enum] = simple.Table(
                id="#" + insert_id_table_enum.value,
                name=insert_id_table_enum.table_name(self._prefix),
                columns=[insert_id_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["insert_id"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables
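
    # Illustrative note (assumption, follows from the code above): with
    # use_insert_id=False only DiaSourceToPartition is created; with
    # use_insert_id=True the schema also gets DiaInsertId plus one
    # "*InsertId" companion table per replicated APDB table.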

    @property
    def has_insert_id(self) -> bool:
        """Whether insert ID tables are to be used (`bool`)."""
        if self._has_insert_id is None:
            self._has_insert_id = self._use_insert_id and self._check_insert_id()
        return self._has_insert_id

    def _check_insert_id(self) -> bool:
        """Check whether database has tables for tracking insert IDs."""
        table_name = ExtraTables.DiaInsertId.table_name(self._prefix)
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])
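
    # Illustrative sketch (assumption): _check_insert_id() relies on
    # Cassandra's system_schema keyspace; an equivalent manual check from
    # cqlsh, with an assumed keyspace "apdb" and an empty prefix, would be:
    #
    #     SELECT count(*) FROM system_schema.tables
    #         WHERE keyspace_name = 'apdb' AND table_name = 'DiaInsertId';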

    def empty(self) -> bool:
        """Return True if database schema is empty.

        Returns
        -------
        empty : `bool`
            `True` if none of the required APDB tables exist in the database,
            `False` if all required tables exist.

        Raises
        ------
        InconsistentSchemaError
            Raised when some of the required tables exist but not all.
        """
        query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
        result = self._session.execute(query, (self._keyspace,))
        table_names = set(row[0] for row in result.all())

        existing_tables = []
        missing_tables = []
        for table_enum in self._apdb_tables:
            table_name = table_enum.table_name(self._prefix)
            if self._time_partition_tables and table_enum in self._time_partitioned_tables:
                # Time-partitioned tables use per-partition names; check for
                # any table whose name starts with the base name.
                exists = any(table.startswith(f"{table_name}_") for table in table_names)
            else:
                exists = table_name in table_names
            if exists:
                existing_tables.append(table_name)
            else:
                missing_tables.append(table_name)

        if not missing_tables:
            return False
        elif not existing_tables:
            return True
        else:
            raise InconsistentSchemaError(
                f"Only some required APDB tables exist: {existing_tables}, missing tables: {missing_tables}"
            )
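
    # Illustrative usage sketch (assumption): callers typically gate schema
    # creation on this check, e.g.
    #
    #     if schema.empty():
    #         schema.makeSchema()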

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def keyspace(self) -> str:
        """Return Cassandra keyspace for APDB tables."""
        return self._keyspace

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, simple.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of the known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `simple.Column` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of column names for a table as defined in the APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

    def makeSchema(
        self,
        *,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
        replication_factor: int | None = None,
    ) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop tables before creating new ones. Note that
            only tables are dropped, not the whole keyspace.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition numbers for time partitions; the end is
            not inclusive. Used to create per-partition DiaObject, DiaSource,
            and DiaForcedSource tables. If `None` then per-partition tables
            are not created.
        replication_factor : `int`, optional
            Replication factor used when creating a new keyspace; if the
            keyspace already exists its replication factor is not changed.
        """
        # Try to create keyspace if it does not exist.
        if replication_factor is None:
            replication_factor = 1
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': "
            f"{replication_factor}"
            "}"
        )
        self._session.execute(query)

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_insert_id = None
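
    # Illustrative sketch (assumption): with the defaults above and an
    # assumed keyspace "apdb", the generated keyspace statement is
    #
    #     CREATE KEYSPACE IF NOT EXISTS "apdb" WITH replication =
    #         {'class': 'SimpleStrategy', 'replication_factor': 1}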

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]

        if drop:
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return the list of column definitions for a table.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        table_schema = self._table_schema(table_name)

        # Must have partition columns and clustering columns.
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # All columns.
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # Primary key definition.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs
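
    # Illustrative sketch (assumption, column names hypothetical): for a
    # table partitioned on ("apdb_part", "apdb_time_part") and clustered on
    # "diaObjectId", the final entry appended to column_defs is
    #
    #     PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaObjectId")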

    def _table_schema(self, table: ApdbTables | ExtraTables) -> simple.Table:
        """Return schema definition for a table."""
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema