Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 22% (170 statements)

# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from collections.abc import Mapping
from typing import TYPE_CHECKING

import felis.types
from felis import simple

from .apdbSchema import ApdbSchema, ApdbTables

if TYPE_CHECKING:
    import cassandra.cluster

_LOG = logging.getLogger(__name__)


class _FelisUUID(felis.types.FelisType, felis_name="uuid", votable_name="uuid"):
    """Special internal type for UUID columns. Felis does not support UUID,
    but we need it here; defining a private type keeps the logic simple.
    """


@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by the Cassandra implementation."""

    DiaInsertId = "DiaInsertId"
    """Name of the table for insert ID records."""

    DiaObjectInsertId = "DiaObjectInsertId"
    """Name of the table for DIAObject insert ID records."""

    DiaSourceInsertId = "DiaSourceInsertId"
    """Name of the table for DIASource insert ID records."""

    DiaForcedSourceInsertId = "DiaFSourceInsertId"
    """Name of the table for DIAForcedSource insert ID records."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return full table name."""
        return prefix + self.value

    @classmethod
    def insert_id_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping of tables used for insert ID tracking to their
        corresponding regular tables.
        """
        return {
            cls.DiaObjectInsertId: ApdbTables.DiaObject,
            cls.DiaSourceInsertId: ApdbTables.DiaSource,
            cls.DiaForcedSourceInsertId: ApdbTables.DiaForcedSource,
        }


class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Name of the keyspace for all tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If True then schema will have a separate table for each time
        partition.
    use_insert_id : `bool`
        If True then the schema includes tables for insert ID tracking.
    """

    _type_map = {
        felis.types.Double: "DOUBLE",
        felis.types.Float: "FLOAT",
        felis.types.Timestamp: "TIMESTAMP",
        felis.types.Long: "BIGINT",
        felis.types.Int: "INT",
        felis.types.Short: "INT",
        felis.types.Byte: "TINYINT",
        felis.types.Binary: "BLOB",
        felis.types.Char: "TEXT",
        felis.types.String: "TEXT",
        felis.types.Unicode: "TEXT",
        felis.types.Text: "TEXT",
        felis.types.Boolean: "BOOLEAN",
        _FelisUUID: "UUID",
    }
    """Map YAML column types to Cassandra types."""

    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        use_insert_id: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._use_insert_id = use_insert_id
        self._has_insert_id: bool | None = None

        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, simple.Table]:
        """Generate schema for regular APDB tables."""
        apdb_tables: dict[ApdbTables, simple.Table] = {}

        # Add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns = []
            add_columns = []
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning.
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # There is no natural partition key for SSObject, but the
                # table is too large to go unpartitioned, so we partition on
                # its primary key (and drop the separate primary key index).
                part_columns = ["ssObjectId"]
                primary_key = []
            else:
                # TODO: Not clear yet what to do with the other tables.
                continue

            column_defs = []
            if add_columns:
                column_defs = [
                    simple.Column(id=f"#{name}", name=name, datatype=felis.types.Long, nullable=False)
                    for name in add_columns
                ]
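
            # The "cassandra:" annotation keys are private to this module;
            # they carry partitioning metadata that the felis schema itself
            # does not represent.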
            annotations = dict(apdb_table_def.annotations)
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns

            apdb_tables[table] = simple.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables

    def _extra_tables_schema(self) -> Mapping[ExtraTables, simple.Table]:
        """Generate schema for extra tables."""
        extra_tables: dict[ExtraTables, simple.Table] = {}

        # This table maps DiaSource ID to its partitions in the DiaSource
        # and DiaSourceInsertId tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = simple.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                simple.Column(
                    id="#diaSourceId", name="diaSourceId", datatype=felis.types.Long, nullable=False
                ),
                simple.Column(id="#apdb_part", name="apdb_part", datatype=felis.types.Long, nullable=False),
                simple.Column(
                    id="#apdb_time_part", name="apdb_time_part", datatype=felis.types.Int, nullable=False
                ),
                simple.Column(id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=True),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        insert_id_column = simple.Column(
            id="#insert_id", name="insert_id", datatype=_FelisUUID, nullable=False
        )

        if not self._use_insert_id:
            return extra_tables

        # Table containing insert IDs; this one is not really partitioned,
        # but a partition key must be defined anyway.
        extra_tables[ExtraTables.DiaInsertId] = simple.Table(
            id="#" + ExtraTables.DiaInsertId.value,
            name=ExtraTables.DiaInsertId.table_name(self._prefix),
            columns=[
                simple.Column(id="#partition", name="partition", datatype=felis.types.Int, nullable=False),
                insert_id_column,
                simple.Column(
                    id="#insert_time", name="insert_time", datatype=felis.types.Timestamp, nullable=False
                ),
            ],
            primary_key=[insert_id_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        for insert_id_table_enum, apdb_table_enum in ExtraTables.insert_id_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[insert_id_table_enum] = simple.Table(
                id="#" + insert_id_table_enum.value,
                name=insert_id_table_enum.table_name(self._prefix),
                columns=[insert_id_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["insert_id"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables

    @property
    def has_insert_id(self) -> bool:
        """Whether insert ID tables are to be used (`bool`)."""
        if self._has_insert_id is None:
            self._has_insert_id = self._use_insert_id and self._check_insert_id()
        return self._has_insert_id

    def _check_insert_id(self) -> bool:
        """Check whether database has tables for tracking insert IDs."""
        table_name = ExtraTables.DiaInsertId.table_name(self._prefix)
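        # system_schema.tables is the built-in Cassandra (3.0+) keyspace
        # metadata table listing every table in the cluster, so an existence
        # check is a simple count.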
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, simple.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of the known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `simple.Column` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of column names for a table as defined in the APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

    def makeSchema(self, drop: bool = False, part_range: tuple[int, int] | None = None) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop tables before creating new ones.
        part_range : `tuple` [`int`, `int`] or `None`
            Start and end partition number for time partitions; the end is
            not inclusive. Used to create per-partition DiaObject, DiaSource,
            and DiaForcedSource tables. If `None` then per-partition tables
            are not created.
        """
        # Try to create keyspace if it does not exist.
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3}"
        )
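        # Note: SimpleStrategy with a replication factor of 3 is only suited
        # to single-datacenter clusters; a multi-datacenter deployment would
        # normally use NetworkTopologyStrategy instead.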
        self._session.execute(query)

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_insert_id = None

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]
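                # For example, part_range=(10, 12) yields "<table>_10" and
                # "<table>_11"; the end of the range is exclusive.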

        if drop:
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return the list of column definitions for a table.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        table_schema = self._table_schema(table_name)

        # Must have partition columns and clustering columns.
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # All columns.
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # Primary key definition.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
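        # CQL requires a composite partition key to be parenthesized as a
        # group, e.g. PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaObjectId").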
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs

    def _table_schema(self, table: ApdbTables | ExtraTables) -> simple.Table:
        """Return schema definition for a table."""
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema