# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from typing import List, Mapping, Optional, TYPE_CHECKING, Tuple, Union

from .apdbSchema import ApdbSchema, ApdbTables, ColumnDef, IndexDef, IndexType, TableDef

if TYPE_CHECKING:
    import cassandra.cluster


_LOG = logging.getLogger(__name__)


@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by Cassandra implementation."""

    DiaSourceToPartition = "DiaSourceToPartition"
    "Maps diaSourceId to its partition values (pixel and time)."

    def table_name(self, prefix: str = "") -> str:
        """Return full table name."""
        return prefix + self.value
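
    # A minimal usage sketch (illustrative; the "test_" prefix is an
    # assumption, not a value defined by this module):
    #
    #     ExtraTables.DiaSourceToPartition.table_name("test_")
    #     # -> "test_DiaSourceToPartition"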


class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Name of the keyspace that holds all APDB tables.
    schema_file : `str`
        Name of the YAML schema file.
    extra_schema_file : `str`, optional
        Name of the YAML schema file with extra column definitions.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If True then the schema will have a separate table for each time
        partition.
    """

    _type_map = dict(DOUBLE="DOUBLE",
                     FLOAT="FLOAT",
                     DATETIME="TIMESTAMP",
                     BIGINT="BIGINT",
                     INTEGER="INT",
                     INT="INT",
                     TINYINT="TINYINT",
                     BLOB="BLOB",
                     CHAR="TEXT",
                     BOOL="BOOLEAN")
    """Map YAML column types to Cassandra."""

    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(self, session: cassandra.cluster.Session, keyspace: str, schema_file: str,
                 extra_schema_file: Optional[str] = None, prefix: str = "",
                 time_partition_tables: bool = False):

        super().__init__(schema_file, extra_schema_file)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables

        # Add columns and index for partitioning.
        self._ignore_tables = []
        for table, tableDef in self.tableSchemas.items():
            columns = []
            add_columns = True
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning.
                columns = ["apdb_part"]
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    columns = ["apdb_part"]
                else:
                    columns = ["apdb_part", "apdb_time_part"]
            elif table is ApdbTables.SSObject:
                # For SSObject there is no natural partition key, but we have
                # to partition it because there are too many of them.
                # Partition on its primary key (and drop the separate primary
                # key index).
                columns = ["ssObjectId"]
                tableDef.indices = [
                    index for index in tableDef.indices if index.type is not IndexType.PRIMARY
                ]
                add_columns = False
            else:
                # TODO: It is not clear yet how other tables can be
                # partitioned; skip them for now.
                self._ignore_tables.append(table)
                add_columns = False

            if add_columns:
                # Prepend partitioning columns to the column list.
                columnDefs = [
                    ColumnDef(name=name, type="BIGINT", nullable=False) for name in columns
                ]
                tableDef.columns = columnDefs + tableDef.columns

            # Make a partitioning index.
            if columns:
                index = IndexDef(name=f"Part_{tableDef.name}", type=IndexType.PARTITION, columns=columns)
                tableDef.indices.append(index)

        self._extra_tables = self._extraTableSchema()
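
    # A minimal construction sketch (illustrative; the contact point,
    # keyspace name, and schema file name are assumptions):
    #
    #     import cassandra.cluster
    #     cluster = cassandra.cluster.Cluster(["127.0.0.1"])
    #     schema = ApdbCassandraSchema(session=cluster.connect(),
    #                                  keyspace="apdb",
    #                                  schema_file="apdb-schema.yaml",
    #                                  time_partition_tables=True)
    #     schema.makeSchema(part_range=(100, 110))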

    def _extraTableSchema(self) -> Mapping[ExtraTables, TableDef]:
        """Generate schema for extra tables."""
        return {
            ExtraTables.DiaSourceToPartition: TableDef(
                name=ExtraTables.DiaSourceToPartition.value,
                columns=[
                    ColumnDef(name="diaSourceId", type="BIGINT", nullable=False),
                    ColumnDef(name="apdb_part", type="BIGINT", nullable=False),
                    ColumnDef(name="apdb_time_part", type="INT", nullable=False),
                ],
                indices=[
                    IndexDef(
                        name=f"Part_{ExtraTables.DiaSourceToPartition.value}",
                        type=IndexType.PARTITION,
                        columns=["diaSourceId"],
                    ),
                ],
            ),
        }

    def tableName(self, table_name: Union[ApdbTables, ExtraTables]) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def getColumnMap(self, table_name: Union[ApdbTables, ExtraTables]) -> Mapping[str, ColumnDef]:
        """Return mapping of column names to column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of the known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `ColumnDef` instances.
        """
        if isinstance(table_name, ApdbTables):
            table_schema = self.tableSchemas[table_name]
        else:
            table_schema = self._extra_tables[table_name]
        cmap = {column.name: column for column in table_schema.columns}
        return cmap
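
    # Illustrative use, grounded in the DiaSourceToPartition definition
    # above:
    #
    #     schema.getColumnMap(ExtraTables.DiaSourceToPartition)["apdb_time_part"].type
    #     # -> "INT"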

    def partitionColumns(self, table_name: Union[ApdbTables, ExtraTables]) -> List[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        if isinstance(table_name, ApdbTables):
            table_schema = self.tableSchemas[table_name]
        else:
            table_schema = self._extra_tables[table_name]
        for index in table_schema.indices:
            if index.type is IndexType.PARTITION:
                # There can be only one partitioning index (possibly with
                # several columns).
                return index.columns
        return []

    def clusteringColumns(self, table_name: Union[ApdbTables, ExtraTables]) -> List[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        if isinstance(table_name, ApdbTables):
            table_schema = self.tableSchemas[table_name]
        else:
            table_schema = self._extra_tables[table_name]
        for index in table_schema.indices:
            if index.type is IndexType.PRIMARY:
                return index.columns
        return []
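
    # Illustrative use, again based on the DiaSourceToPartition definition:
    #
    #     schema.partitionColumns(ExtraTables.DiaSourceToPartition)
    #     # -> ["diaSourceId"]
    #     schema.clusteringColumns(ExtraTables.DiaSourceToPartition)
    #     # -> [] (that table defines no PRIMARY index)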

    def makeSchema(self, drop: bool = False, part_range: Optional[Tuple[int, int]] = None) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop the tables before creating new ones.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions; the end is
            not inclusive. Used to create per-partition DiaObject, DiaSource,
            and DiaForcedSource tables. If `None` then per-partition tables
            are not created.
        """
        for table in self.tableSchemas:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)

    def _makeTableSchema(
        self,
        table: Union[ApdbTables, ExtraTables],
        drop: bool = False,
        part_range: Optional[Tuple[int, int]] = None
    ) -> None:
        if table in self._ignore_tables:
            _LOG.debug("Skipping schema for table %s", table)
            return
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]

        if drop:
            queries = [
                f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list
            ]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)
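
    # For a time-partitioned table and part_range=(100, 102) this issues
    # statements of the following form; the keyspace name and the column
    # list are illustrative assumptions:
    #
    #     CREATE TABLE IF NOT EXISTS "apdb"."DiaSource_100" ("apdb_part" BIGINT, ...)
    #     CREATE TABLE IF NOT EXISTS "apdb"."DiaSource_101" ("apdb_part" BIGINT, ...)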

    def _tableColumns(self, table_name: Union[ApdbTables, ExtraTables]) -> List[str]:
        """Return a list of column definitions for a table.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        if isinstance(table_name, ApdbTables):
            table_schema = self.tableSchemas[table_name]
        else:
            table_schema = self._extra_tables[table_name]

        # The table must have partition columns; clustering columns are
        # optional.
        part_columns = []
        clust_columns = []
        index_columns = set()
        for index in table_schema.indices:
            if index.type is IndexType.PARTITION:
                part_columns = index.columns
            elif index.type is IndexType.PRIMARY:
                clust_columns = index.columns
            index_columns.update(index.columns)
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # All columns.
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.type]
            column_defs.append(f'"{column.name}" {ctype}')

        # Primary key definition: a composite partition key is parenthesized,
        # clustering columns follow it.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs
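
    # A worked sketch of the primary key assembly above (column names are
    # illustrative): with part_columns = ["apdb_part", "apdb_time_part"] and
    # clust_columns = ["diaSourceId"], the composite partition key is wrapped
    # in parentheses and the final element of column_defs becomes
    #
    #     PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaSourceId")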