# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "literal",
    "pandas_dataframe_factory",
    "quote_id",
    "raw_data_factory",
    "select_concurrent",
]

import logging
from datetime import datetime, timedelta
from typing import Any, List, Tuple, Union

import numpy as np
import pandas

# If cassandra-driver is not installed the module can still be imported,
# but nothing in it will work.
try:
    from cassandra.cluster import EXEC_PROFILE_DEFAULT, Session
    from cassandra.concurrent import execute_concurrent

    CASSANDRA_IMPORTED = True
except ImportError:
    CASSANDRA_IMPORTED = False

_LOG = logging.getLogger(__name__)


if CASSANDRA_IMPORTED:

    class SessionWrapper:
        """Special wrapper class to work around an ``execute_concurrent()``
        limitation: it does not allow a non-default execution profile.

        An instance of this class can be passed to ``execute_concurrent()``
        instead of a `Session` instance. The class implements the small set
        of methods that ``execute_concurrent()`` needs. When
        ``execute_concurrent()`` is fixed to accept execution profiles, this
        wrapper can be dropped.
        """

        def __init__(
            self, session: Session, execution_profile: Any = EXEC_PROFILE_DEFAULT
        ):
            self._session = session
            self._execution_profile = execution_profile

        def execute_async(
            self,
            *args: Any,
            execution_profile: Any = EXEC_PROFILE_DEFAULT,
            **kwargs: Any,
        ) -> Any:
            # An explicitly passed profile overrides the wrapper's default.
            if execution_profile is EXEC_PROFILE_DEFAULT:
                execution_profile = self._execution_profile
            return self._session.execute_async(
                *args, execution_profile=execution_profile, **kwargs
            )

        def submit(self, *args: Any, **kwargs: Any) -> Any:
            # Forward the driver-internal ``submit()`` method, which
            # ``execute_concurrent()`` also calls on the session it is given.
            return self._session.submit(*args, **kwargs)
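
    # A minimal usage sketch (illustrative only; assumes a connected
    # ``cassandra.cluster.Cluster`` instance ``cluster`` and a registered
    # execution profile named "read_tuples", neither of which is part of
    # this module):
    #
    #     session = cluster.connect("apdb")
    #     wrapped = SessionWrapper(session, execution_profile="read_tuples")
    #     execute_concurrent(wrapped, [(statement, params)], concurrency=100)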


def pandas_dataframe_factory(
    colnames: List[str], rows: List[Tuple]
) -> pandas.DataFrame:
    """Special non-standard row factory that creates a pandas DataFrame from
    a Cassandra result set.

    Parameters
    ----------
    colnames : `list` [ `str` ]
        Names of the columns.
    rows : `list` of `tuple`
        Result rows.

    Returns
    -------
    catalog : `pandas.DataFrame`
        DataFrame with the result set.

    Notes
    -----
    When this method is used as a row factory for Cassandra, the resulting
    DataFrame has to be accessed in a non-standard way, via the
    ``ResultSet._current_rows`` attribute.
    """
    return pandas.DataFrame.from_records(rows, columns=colnames)
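
# A sketch of how this row factory might be wired into the driver
# (illustrative; the profile name "pandas_read" and keyspace are
# assumptions, not part of this module):
#
#     from cassandra.cluster import Cluster, ExecutionProfile
#
#     profile = ExecutionProfile(row_factory=pandas_dataframe_factory)
#     cluster = Cluster(execution_profiles={"pandas_read": profile})
#     session = cluster.connect("apdb")
#     result = session.execute(query, execution_profile="pandas_read")
#     df = result._current_rows  # DataFrame, per the Notes above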


def raw_data_factory(
    colnames: List[str], rows: List[Tuple]
) -> Tuple[List[str], List[Tuple]]:
    """Special non-standard row factory that makes a 2-element tuple
    containing unmodified data: the list of column names and the list of
    rows.

    Parameters
    ----------
    colnames : `list` [ `str` ]
        Names of the columns.
    rows : `list` of `tuple`
        Result rows.

    Returns
    -------
    colnames : `list` [ `str` ]
        Names of the columns.
    rows : `list` of `tuple`
        Result rows.

    Notes
    -----
    When this method is used as a row factory for Cassandra, the resulting
    2-element tuple has to be accessed in a non-standard way, via the
    ``ResultSet._current_rows`` attribute. This factory is used to build
    pandas DataFrames in the `select_concurrent` method.
    """
    return (colnames, rows)
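
# Direct behavior sketch with made-up values: the factory simply bundles
# its inputs into a tuple.
#
#     >>> raw_data_factory(["id", "flux"], [(1, 0.5), (2, 0.7)])
#     (['id', 'flux'], [(1, 0.5), (2, 0.7)])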


def select_concurrent(
    session: Session, statements: List[Tuple], execution_profile: str, concurrency: int
) -> Union[pandas.DataFrame, List]:
    """Execute a bunch of queries concurrently and merge their results into
    a single result.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Session to use for executing the queries.
    statements : `list` [ `tuple` ]
        List of statements and their parameters, passed directly to
        ``execute_concurrent()``.
    execution_profile : `str`
        Execution profile name.
    concurrency : `int`
        Concurrency level, passed directly to ``execute_concurrent()``.

    Returns
    -------
    result
        Combined result of multiple statements; the type of the result
        depends on the row factory defined in the execution profile. If the
        row factory is one of `pandas_dataframe_factory` or
        `raw_data_factory`, a pandas DataFrame is created from the combined
        result. Otherwise a list of rows is returned, with the type of each
        row determined by the row factory.

    Notes
    -----
    This method can raise any exception that is raised by one of the provided
    statements.
    """
    session_wrap = SessionWrapper(session, execution_profile)
    results = execute_concurrent(
        session_wrap,
        statements,
        results_generator=True,
        raise_on_first_error=False,
        concurrency=concurrency,
    )

    ep = session.get_execution_profile(execution_profile)
    if ep.row_factory is raw_data_factory:

        # Collect rows into a single list and build a DataFrame out of that.
        _LOG.debug("making pandas data frame out of rows/columns")
        columns: Any = None
        rows = []
        for success, result in results:
            if success:
                result = result._current_rows
                if columns is None:
                    columns = result[0]
                elif columns != result[0]:
                    _LOG.error(
                        "different columns returned by queries: %s and %s",
                        columns,
                        result[0],
                    )
                    raise ValueError(
                        f"different columns returned by queries: {columns} and {result[0]}"
                    )
                rows += result[1]
            else:
                _LOG.error("error returned by query: %s", result)
                raise result
        catalog = pandas_dataframe_factory(columns, rows)
        _LOG.debug("pandas catalog shape: %s", catalog.shape)
        return catalog
    elif ep.row_factory is pandas_dataframe_factory:

        # Merge multiple DataFrames into one.
        _LOG.debug("making pandas data frame out of set of data frames")
        dataframes = []
        for success, result in results:
            if success:
                dataframes.append(result._current_rows)
            else:
                _LOG.error("error returned by query: %s", result)
                raise result
        # Concatenate all frames.
        if len(dataframes) == 1:
            catalog = dataframes[0]
        else:
            catalog = pandas.concat(dataframes)
        _LOG.debug("pandas catalog shape: %s", catalog.shape)
        return catalog
    else:

        # Just concatenate all rows into a single collection.
        rows = []
        for success, result in results:
            if success:
                rows.extend(result)
            else:
                _LOG.error("error returned by query: %s", result)
                raise result
        _LOG.debug("number of rows: %s", len(rows))
        return rows
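
# A usage sketch for ``select_concurrent`` (illustrative; the ``prepared``
# statement, ``partitions`` values, and "pandas_read" profile name are
# assumptions):
#
#     statements = [(prepared, (part,)) for part in partitions]
#     catalog = select_concurrent(
#         session, statements, execution_profile="pandas_read", concurrency=100
#     )
#     # With a pandas or raw row factory in the profile, ``catalog`` is a
#     # single DataFrame combining all per-partition results.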


def literal(v: Any) -> Any:
    """Transform an object into a value suitable for a query.

    `datetime` values are converted to milliseconds since the Unix epoch
    (with one-second precision), non-finite floating point values are
    replaced with `None`, and all other values are returned unchanged.
    """
    if v is None:
        pass
    elif isinstance(v, datetime):
        v = int((v - datetime(1970, 1, 1)) / timedelta(seconds=1)) * 1000
    elif isinstance(v, (bytes, str)):
        pass
    else:
        try:
            if not np.isfinite(v):
                v = None
        except TypeError:
            pass
    return v
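
# Behavior sketch with made-up values:
#
#     >>> literal(datetime(2020, 1, 1))   # milliseconds since epoch
#     1577836800000
#     >>> literal(float("nan")) is None   # non-finite floats become None
#     True
#     >>> literal("text")                 # strings pass through unchanged
#     'text'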


def quote_id(columnName: str) -> str:
    """Smart quoting for column names. Lower-case names are not quoted."""
    if not columnName.islower():
        columnName = '"' + columnName + '"'
    return columnName
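
# Behavior sketch (the column names are illustrative):
#
#     >>> quote_id("visit")        # all lower-case, left as-is
#     'visit'
#     >>> quote_id("diaObjectId")  # mixed case, quoted for Cassandra
#     '"diaObjectId"'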