Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 20%
192 statements
coverage.py v7.4.1, created at 2024-02-03 10:51 +0000

# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from collections.abc import Mapping
from typing import TYPE_CHECKING

import felis.types
from felis import simple

from .apdbSchema import ApdbSchema, ApdbTables

if TYPE_CHECKING:
    import cassandra.cluster


_LOG = logging.getLogger(__name__)


class _FelisUUID(felis.types.FelisType, felis_name="uuid", votable_name="uuid"):
    """Special internal type for UUID columns. Felis does not support UUID,
    but we need it here; to keep the logic simple it is easier to add a
    special class for it.
    """


class InconsistentSchemaError(RuntimeError):
    """Exception raised when schema state is inconsistent."""


@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by the Cassandra implementation."""

    DiaInsertId = "DiaInsertId"
    """Name of the table for insert ID records."""

    DiaObjectInsertId = "DiaObjectInsertId"
    """Name of the table for DIAObject insert ID records."""

    DiaSourceInsertId = "DiaSourceInsertId"
    """Name of the table for DIASource insert ID records."""

    DiaForcedSourceInsertId = "DiaFSourceInsertId"
    """Name of the table for DIAForcedSource insert ID records."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return full table name."""
        return prefix + self.value

    @classmethod
    def insert_id_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping of tables used for insert ID tracking to their
        corresponding regular tables.
        """
        return {
            cls.DiaObjectInsertId: ApdbTables.DiaObject,
            cls.DiaSourceInsertId: ApdbTables.DiaSource,
            cls.DiaForcedSourceInsertId: ApdbTables.DiaForcedSource,
        }


class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Name of the Cassandra keyspace for APDB tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If `True` then the schema has a separate table for each time
        partition.
    use_insert_id : `bool`
        If `True` then additional tables for tracking insert IDs are included
        in the schema.
    """

    _type_map = {
        felis.types.Double: "DOUBLE",
        felis.types.Float: "FLOAT",
        felis.types.Timestamp: "TIMESTAMP",
        felis.types.Long: "BIGINT",
        felis.types.Int: "INT",
        felis.types.Short: "INT",
        felis.types.Byte: "TINYINT",
        felis.types.Binary: "BLOB",
        felis.types.Char: "TEXT",
        felis.types.String: "TEXT",
        felis.types.Unicode: "TEXT",
        felis.types.Text: "TEXT",
        felis.types.Boolean: "BOOLEAN",
        _FelisUUID: "UUID",
    }
    """Map YAML column types to Cassandra types."""

    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        use_insert_id: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._use_insert_id = use_insert_id
        self._has_insert_id: bool | None = None

        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()
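
    # A minimal construction sketch (hypothetical host, keyspace, and file
    # names; not part of this module):
    #
    #     from cassandra.cluster import Cluster
    #
    #     session = Cluster(["127.0.0.1"]).connect()
    #     schema = ApdbCassandraSchema(
    #         session,
    #         keyspace="apdb",
    #         schema_file="apdb-schema.yaml",
    #         use_insert_id=True,
    #     )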

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, simple.Table]:
        """Generate schema for regular APDB tables."""
        apdb_tables: dict[ApdbTables, simple.Table] = {}

        # Add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns = []
            add_columns = []
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning.
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # For SSObject there is no natural partition key, but it has
                # to be partitioned because there are too many of them, so
                # partition it on its primary key (and drop the separate
                # primary key index).
                part_columns = ["ssObjectId"]
                primary_key = []
            elif table is ApdbTables.metadata:
                # Metadata is kept in a single partition because we want to
                # read all of it in one query; add an extra column for the
                # partition.
                part_columns = ["meta_part"]
                add_columns = part_columns
            else:
                # TODO: It is not clear what to do with the other tables.
                continue

            column_defs = []
            if add_columns:
                column_defs = [
                    simple.Column(id=f"#{name}", name=name, datatype=felis.types.Long, nullable=False)
                    for name in add_columns
                ]

            annotations = dict(apdb_table_def.annotations)
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns
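
            # For example, DiaObjectLast ends up partitioned on "apdb_part"
            # alone; the time-partitioned tables use just "apdb_part" when
            # each time partition gets its own table, and ("apdb_part",
            # "apdb_time_part") otherwise.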
            apdb_tables[table] = simple.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables

    def _extra_tables_schema(self) -> Mapping[ExtraTables, simple.Table]:
        """Generate schema for extra tables."""
        extra_tables: dict[ExtraTables, simple.Table] = {}

        # This table maps DiaSource ID to its partitions in the DiaSource and
        # DiaSourceInsertId tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = simple.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                simple.Column(
                    id="#diaSourceId", name="diaSourceId", datatype=felis.types.Long, nullable=False
                ),
                simple.Column(id="#apdb_part", name="apdb_part", datatype=felis.types.Long, nullable=False),
                simple.Column(
                    id="#apdb_time_part", name="apdb_time_part", datatype=felis.types.Int, nullable=False
                ),
                simple.Column(id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=True),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        insert_id_column = simple.Column(
            id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=False
        )

        if not self._use_insert_id:
            return extra_tables

        # Table containing insert IDs; this table is not really partitioned,
        # but a partition key must still be defined.
        extra_tables[ExtraTables.DiaInsertId] = simple.Table(
            id="#" + ExtraTables.DiaInsertId.value,
            name=ExtraTables.DiaInsertId.table_name(self._prefix),
            columns=[
                simple.Column(id="#partition", name="partition", datatype=felis.types.Int, nullable=False),
                insert_id_column,
                simple.Column(
                    id="#insert_time", name="insert_time", datatype=felis.types.Timestamp, nullable=False
                ),
            ],
            primary_key=[insert_id_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        for insert_id_table_enum, apdb_table_enum in ExtraTables.insert_id_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[insert_id_table_enum] = simple.Table(
                id="#" + insert_id_table_enum.value,
                name=insert_id_table_enum.table_name(self._prefix),
                columns=[insert_id_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["insert_id"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables

    @property
    def has_insert_id(self) -> bool:
        """Whether insert ID tables are to be used (`bool`)."""
        if self._has_insert_id is None:
            self._has_insert_id = self._use_insert_id and self._check_insert_id()
        return self._has_insert_id

    def _check_insert_id(self) -> bool:
        """Check whether database has tables for tracking insert IDs."""
        table_name = ExtraTables.DiaInsertId.table_name(self._prefix)
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])

    def empty(self) -> bool:
        """Return True if database schema is empty.

        Returns
        -------
        empty : `bool`
            `True` if none of the required APDB tables exist in the database,
            `False` if all required tables exist.

        Raises
        ------
        InconsistentSchemaError
            Raised when some of the required tables exist but not all.
        """
        query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
        result = self._session.execute(query, (self._keyspace,))
        table_names = set(row[0] for row in result.all())

        existing_tables = []
        missing_tables = []
        for table_enum in self._apdb_tables:
            table_name = table_enum.table_name(self._prefix)
            if table_name in table_names:
                existing_tables.append(table_name)
            else:
                missing_tables.append(table_name)

        if not missing_tables:
            return False
        elif not existing_tables:
            return True
        else:
            raise InconsistentSchemaError(
                f"Only some required APDB tables exist: {existing_tables}, missing tables: {missing_tables}"
            )

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def keyspace(self) -> str:
        """Return Cassandra keyspace for APDB tables."""
        return self._keyspace

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, simple.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `simple.Column` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of column names for a table as defined in APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

    def makeSchema(self, drop: bool = False, part_range: tuple[int, int] | None = None) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If `True` then drop tables before creating new ones.
        part_range : `tuple` [`int`, `int`] or `None`
            Start and end partition number for time partitions, end is not
            inclusive. Used to create per-partition DiaObject, DiaSource, and
            DiaForcedSource tables. If `None` then per-partition tables are
            not created.
        """
        # Try to create the keyspace if it does not exist.
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}"
        )
        self._session.execute(query)

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_insert_id = None

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]

        if drop:
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return the list of columns in a table.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        table_schema = self._table_schema(table_name)

        # Must have partition columns and clustering columns.
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # All columns.
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # Primary key definition.
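        # For a composite partition key the partition columns are wrapped in
        # inner parentheses, so the generated clause looks like, e.g.,
        #     PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaSourceId")
        # with clustering columns following the partition key.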
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs

    def _table_schema(self, table: ApdbTables | ExtraTables) -> simple.Table:
        """Return schema definition for a table."""
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema