Coverage for python/lsst/dax/apdb/apdbCassandraSchema.py: 11%

# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import itertools
import logging
from typing import List, Mapping, Optional, TYPE_CHECKING, Tuple

from .apdbSchema import ApdbSchema, ApdbTables, ColumnDef, IndexDef, IndexType

if TYPE_CHECKING:
    import cassandra.cluster

_LOG = logging.getLogger(__name__)


class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    schema_file : `str`
        Name of the YAML schema file.
    extra_schema_file : `str`, optional
        Name of the YAML schema file with extra column definitions.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    packing : `str`
        Type of packing to apply to columns; the string "none" disables
        packing, any other value enables it.
    time_partition_tables : `bool`
        If True then the schema will have a separate table for each time
        partition.
    """

    _type_map = dict(DOUBLE="DOUBLE",
                     FLOAT="FLOAT",
                     DATETIME="TIMESTAMP",
                     BIGINT="BIGINT",
                     INTEGER="INT",
                     INT="INT",
                     TINYINT="TINYINT",
                     BLOB="BLOB",
                     CHAR="TEXT",
                     BOOL="BOOLEAN")
    """Map YAML column types to Cassandra types."""

    def __init__(self, session: cassandra.cluster.Session, schema_file: str,
                 extra_schema_file: Optional[str] = None, prefix: str = "",
                 packing: str = "none", time_partition_tables: bool = False):

        super().__init__(schema_file, extra_schema_file)

        self._session = session
        self._prefix = prefix
        self._packing = packing

        # Add columns and an index for partitioning.
        self._ignore_tables = []
        for table, tableDef in self.tableSchemas.items():
            columns = []
            if table is ApdbTables.DiaObjectLast:
                # DiaObjectLast does not need temporal partitioning.
                columns = ["apdb_part"]
            elif table in (ApdbTables.DiaObject, ApdbTables.DiaSource, ApdbTables.DiaForcedSource):
                # These three tables can use either pure spatial or combined
                # spatial/temporal partitioning.
                if time_partition_tables:
                    columns = ["apdb_part"]
                else:
                    columns = ["apdb_part", "apdb_time_part"]
            else:
                # TODO: not yet clear how other tables can be partitioned.
                self._ignore_tables.append(table)

            # Prepend the partitioning columns to the table's column list.
            columnDefs = [ColumnDef(name=name,
                                    type="BIGINT",
                                    nullable=False,
                                    default=None,
                                    description="",
                                    unit=None,
                                    ucd=None) for name in columns]
            tableDef.columns = columnDefs + tableDef.columns

            # Make a partitioning index.
            index = IndexDef(name=f"Part_{tableDef.name}", type=IndexType.PARTITION, columns=columns)
            tableDef.indices.append(index)

        # Collect columns to be packed into a BLOB; index columns are never
        # packed.
        self._packed_columns = {}
        if self._packing != "none":
            for table, tableDef in self.tableSchemas.items():
                index_columns = set(itertools.chain.from_iterable(
                    index.columns for index in tableDef.indices
                ))
                columnDefs = [column for column in tableDef.columns if column.name not in index_columns]
                self._packed_columns[table] = columnDefs

    def tableName(self, table_name: ApdbTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)
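
    # For example, with prefix="apdb_test_" one would expect the DiaObject
    # table to be named "apdb_test_DiaObject" (assuming ApdbTables.table_name()
    # simply prepends the prefix; its exact behavior is defined in
    # apdbSchema.py).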

    def getColumnMap(self, table_name: ApdbTables) -> Mapping[str, ColumnDef]:
        """Return mapping of column names to column definitions.

        Parameters
        ----------
        table_name : `ApdbTables`
            One of the known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `ColumnDef` instances.
        """
        table = self.tableSchemas[table_name]
        cmap = {column.name: column for column in table.columns}
        return cmap

    def partitionColumns(self, table_name: ApdbTables) -> List[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self.tableSchemas[table_name]
        for index in table_schema.indices:
            if index.type is IndexType.PARTITION:
                # There should be just one partitioning index (possibly with
                # multiple columns).
                return index.columns
        return []

    def clusteringColumns(self, table_name: ApdbTables) -> List[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self.tableSchemas[table_name]
        for index in table_schema.indices:
            if index.type is IndexType.PRIMARY:
                return index.columns
        return []
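
    # As an illustration: for the DiaObject table with combined
    # spatial/temporal partitioning (time_partition_tables=False),
    # partitionColumns() returns ["apdb_part", "apdb_time_part"] (the
    # columns added in __init__), while clusteringColumns() returns
    # whatever columns the YAML schema declares in its PRIMARY index.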

    def makeSchema(self, drop: bool = False, part_range: Optional[Tuple[int, int]] = None) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop tables before creating new ones.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions, end is not
            inclusive. Used to create per-partition DiaObject, DiaSource, and
            DiaForcedSource tables. If `None` then per-partition tables are
            not created.
        """
        for table in self.tableSchemas:
            if table in self._ignore_tables:
                _LOG.debug("Skipping schema for table %s", table)
                continue
            _LOG.debug("Making table %s", table)

            fullTable = table.table_name(self._prefix)

            table_list = [fullTable]
            if part_range is not None:
                if table in (ApdbTables.DiaSource, ApdbTables.DiaForcedSource, ApdbTables.DiaObject):
                    partitions = range(*part_range)
                    table_list = [f"{fullTable}_{part}" for part in partitions]

            if drop:
                queries = [f'DROP TABLE IF EXISTS "{table_name}"' for table_name in table_list]
                futures = [self._session.execute_async(query, timeout=None) for query in queries]
                for future in futures:
                    _LOG.debug("wait for query: %s", future.query)
                    future.result()
                    _LOG.debug("query finished: %s", future.query)

            queries = []
            for table_name in table_list:
                if_not_exists = "" if drop else "IF NOT EXISTS"
                columns = ", ".join(self._tableColumns(table))
                query = f'CREATE TABLE {if_not_exists} "{table_name}" ({columns})'
                _LOG.debug("query: %s", query)
                queries.append(query)
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)
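
    # Illustrative only: with an empty prefix and part_range=(600, 602),
    # this would issue queries along the lines of
    #
    #     CREATE TABLE IF NOT EXISTS "DiaObject_600" (...)
    #     CREATE TABLE IF NOT EXISTS "DiaObject_601" (...)
    #
    # where the column list comes from _tableColumns() below; the exact
    # table names depend on ApdbTables.table_name().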

    def _tableColumns(self, table_name: ApdbTables) -> List[str]:
        """Return the list of column definitions for a table.

        Parameters
        ----------
        table_name : `ApdbTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        table_schema = self.tableSchemas[table_name]

        # Must have both partition columns and clustering columns.
        part_columns = []
        clust_columns = []
        index_columns = set()
        for index in table_schema.indices:
            if index.type is IndexType.PARTITION:
                part_columns = index.columns
            elif index.type is IndexType.PRIMARY:
                clust_columns = index.columns
            index_columns.update(index.columns)
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")
        if not clust_columns:
            raise ValueError(f"Table {table_name} configuration is missing primary index")

        # All regular columns.
        column_defs = []
        for column in table_schema.columns:
            if self._packing != "none" and column.name not in index_columns:
                # When packing is enabled all non-index columns are replaced
                # by a single BLOB.
                continue
            ctype = self._type_map[column.type]
            column_defs.append(f'"{column.name}" {ctype}')

        # Packed content goes into a single BLOB column.
        if self._packing != "none":
            column_defs.append('"apdb_packed" blob')

        # Primary key definition.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs
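
    # For a hypothetical table partitioned on ("apdb_part", "apdb_time_part")
    # and clustered on "diaObjectId", the result would look roughly like
    #
    #     ['"apdb_part" BIGINT', '"apdb_time_part" BIGINT',
    #      '"diaObjectId" BIGINT', ...,
    #      'PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaObjectId")']
    #
    # The extra parentheses around the partition columns make them a single
    # composite Cassandra partition key.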

    def packedColumns(self, table_name: ApdbTables) -> List[ColumnDef]:
        """Return the list of columns that are packed into a BLOB.

        Parameters
        ----------
        table_name : `ApdbTables`
            Name of the table.

        Returns
        -------
        columns : `list` [ `ColumnDef` ]
            List of column definitions. An empty list is returned if packing
            is not configured.
        """
        return self._packed_columns.get(table_name, [])
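
    # A minimal sketch of how a caller might use this (illustrative; the
    # actual serialization format used by the APDB implementation is not
    # defined in this module, and "record" is a hypothetical dict of column
    # values):
    #
    #     import pickle
    #     columns = schema.packedColumns(ApdbTables.DiaSource)
    #     packed = pickle.dumps({col.name: record[col.name] for col in columns})
    #     # "packed" would then be stored in the "apdb_packed" BLOB column.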