Coverage for python/lsst/dax/apdb/apdbCassandra.py: 17%

525 statements  

coverage.py v7.4.1, created at 2024-02-03 10:51 +0000

1# This file is part of dax_apdb. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ["ApdbCassandraConfig", "ApdbCassandra"] 

25 

26import logging 

27import uuid 

28from collections.abc import Iterable, Iterator, Mapping, Set 

29from typing import TYPE_CHECKING, Any, cast 

30 

31import numpy as np 

32import pandas 

33 

34# If cassandra-driver is not there the module can still be imported 

35# but ApdbCassandra cannot be instantiated. 

36try: 

37 import cassandra 

38 import cassandra.query 

39 from cassandra.auth import AuthProvider, PlainTextAuthProvider 

40 from cassandra.cluster import EXEC_PROFILE_DEFAULT, Cluster, ExecutionProfile 

41 from cassandra.policies import AddressTranslator, RoundRobinPolicy, WhiteListRoundRobinPolicy 

42 

43 CASSANDRA_IMPORTED = True 

44except ImportError: 

45 CASSANDRA_IMPORTED = False 

46 

47import felis.types 

48import lsst.daf.base as dafBase 

49from felis.simple import Table 

50from lsst import sphgeom 

51from lsst.pex.config import ChoiceField, Field, ListField 

52from lsst.utils.db_auth import DbAuth, DbAuthNotFoundError 

53from lsst.utils.iteration import chunk_iterable 

54 

55from .apdb import Apdb, ApdbConfig, ApdbInsertId, ApdbTableData 

56from .apdbCassandraSchema import ApdbCassandraSchema, ExtraTables 

57from .apdbMetadataCassandra import ApdbMetadataCassandra 

58from .apdbSchema import ApdbTables 

59from .cassandra_utils import ( 

60 ApdbCassandraTableData, 

61 PreparedStatementCache, 

62 literal, 

63 pandas_dataframe_factory, 

64 quote_id, 

65 raw_data_factory, 

66 select_concurrent, 

67) 

68from .pixelization import Pixelization 

69from .timer import Timer 

70from .versionTuple import IncompatibleVersionError, VersionTuple 

71 

72if TYPE_CHECKING:    72 ↛ 73: line 72 didn't jump to line 73, because the condition on line 72 was never true

73 from .apdbMetadata import ApdbMetadata 

74 

75_LOG = logging.getLogger(__name__) 

76 

77VERSION = VersionTuple(0, 1, 0) 

78"""Version for the code defined in this module. This needs to be updated 

79(following compatibility rules) when schema produced by this code changes. 

80""" 

81 

82# Copied from daf_butler. 

83DB_AUTH_ENVVAR = "LSST_DB_AUTH" 

84"""Default name of the environmental variable that will be used to locate DB 

85credentials configuration file. """ 

86 

87DB_AUTH_PATH = "~/.lsst/db-auth.yaml" 

88"""Default path at which it is expected that DB credentials are found.""" 

89 

90 

91class CassandraMissingError(Exception): 

92 def __init__(self) -> None: 

93 super().__init__("cassandra-driver module cannot be imported") 

94 

95 

96class ApdbCassandraConfig(ApdbConfig): 

97 """Configuration class for Cassandra-based APDB implementation.""" 

98 

99 contact_points = ListField[str]( 

100 doc="The list of contact points to try connecting for cluster discovery.", default=["127.0.0.1"] 

101 ) 

102 private_ips = ListField[str](doc="List of internal IP addresses for contact_points.", default=[]) 

103 port = Field[int](doc="Port number to connect to.", default=9042) 

104 keyspace = Field[str](doc="Default keyspace for operations.", default="apdb") 

105 username = Field[str]( 

106 doc=f"Cassandra user name, if empty then {DB_AUTH_PATH} has to provide it with password.", 

107 default="", 

108 ) 

109 read_consistency = Field[str]( 

110 doc="Name for consistency level of read operations, default: QUORUM, can be ONE.", default="QUORUM" 

111 ) 

112 write_consistency = Field[str]( 

113 doc="Name for consistency level of write operations, default: QUORUM, can be ONE.", default="QUORUM" 

114 ) 

115 read_timeout = Field[float](doc="Timeout in seconds for read operations.", default=120.0) 

116 write_timeout = Field[float](doc="Timeout in seconds for write operations.", default=10.0) 

117 remove_timeout = Field[float](doc="Timeout in seconds for remove operations.", default=600.0) 

118 read_concurrency = Field[int](doc="Concurrency level for read operations.", default=500) 

119 protocol_version = Field[int]( 

120 doc="Cassandra protocol version to use, default is V4", 

121 default=cassandra.ProtocolVersion.V4 if CASSANDRA_IMPORTED else 0, 

122 ) 

123 dia_object_columns = ListField[str]( 

124 doc="List of columns to read from DiaObject[Last], by default read all columns", default=[] 

125 ) 

126 prefix = Field[str](doc="Prefix to add to table names", default="") 

127 part_pixelization = ChoiceField[str]( 

128 allowed=dict(htm="HTM pixelization", q3c="Q3C pixelization", mq3c="MQ3C pixelization"), 

129 doc="Pixelization used for partitioning index.", 

130 default="mq3c", 

131 ) 

132 part_pix_level = Field[int](doc="Pixelization level used for partitioning index.", default=10) 

133 part_pix_max_ranges = Field[int](doc="Max number of ranges in pixelization envelope", default=64) 

134 ra_dec_columns = ListField[str](default=["ra", "dec"], doc="Names of ra/dec columns in DiaObject table") 

135 timer = Field[bool](doc="If True then print/log timing information", default=False) 

136 time_partition_tables = Field[bool]( 

137 doc="Use per-partition tables for sources instead of partitioning by time", default=True 

138 ) 

139 time_partition_days = Field[int]( 

140 doc=( 

141 "Time partitioning granularity in days, this value must not be changed after database is " 

142 "initialized" 

143 ), 

144 default=30, 

145 ) 

146 time_partition_start = Field[str]( 

147 doc=( 

148 "Starting time for per-partition tables, in yyyy-mm-ddThh:mm:ss format, in TAI. " 

149 "This is used only when time_partition_tables is True." 

150 ), 

151 default="2018-12-01T00:00:00", 

152 ) 

153 time_partition_end = Field[str]( 

154 doc=( 

155 "Ending time for per-partition tables, in yyyy-mm-ddThh:mm:ss format, in TAI. " 

156 "This is used only when time_partition_tables is True." 

157 ), 

158 default="2030-01-01T00:00:00", 

159 ) 

160 query_per_time_part = Field[bool]( 

161 default=False, 

162 doc=( 

163 "If True then build separate query for each time partition, otherwise build one single query. " 

164 "This is only used when time_partition_tables is False in schema config." 

165 ), 

166 ) 

167 query_per_spatial_part = Field[bool]( 

168 default=False, 

169 doc="If True then build one query per spatial partition, otherwise build single query.", 

170 ) 

171 use_insert_id_skips_diaobjects = Field[bool]( 

172 default=False, 

173 doc=( 

174 "If True then do not store DiaObjects when use_insert_id is True " 

175 "(DiaObjectsInsertId has the same data)." 

176 ), 

177 ) 
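# Illustrative sketch, not part of the original source: one way the fields
# above might be set from Python (host names and keyspace are placeholders;
# any of the defaults shown in the class definition can be left untouched).
example_config = ApdbCassandraConfig()
example_config.contact_points = ["cassandra-1.example.org", "cassandra-2.example.org"]
example_config.keyspace = "apdb"
example_config.part_pixelization = "mq3c"   # one of htm/q3c/mq3c per the ChoiceField above
example_config.time_partition_tables = True
example_config.read_consistency = "QUORUM"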

178 

179 

180if CASSANDRA_IMPORTED:    180 ↛ 195: line 180 didn't jump to line 195, because the condition on line 180 was never false

181 

182 class _AddressTranslator(AddressTranslator): 

183 """Translate internal IP address to external. 

184 

185 Only used for docker-based setup, not a viable long-term solution. 

186 """ 

187 

188 def __init__(self, public_ips: list[str], private_ips: list[str]): 

189 self._map = dict((k, v) for k, v in zip(private_ips, public_ips)) 

190 

191 def translate(self, private_ip: str) -> str: 

192 return self._map.get(private_ip, private_ip) 

193 

194 

195def _quote_column(name: str) -> str: 

196 """Quote column name""" 

197 if name.islower(): 

198 return name 

199 else: 

200 return f'"{name}"' 
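# Behaviour of the helper above, illustrated (these lines are not in the
# original source): all-lowercase names are left as-is, anything with upper
# case is double-quoted for Cassandra.
#   _quote_column("ra")          -> ra
#   _quote_column("diaObjectId") -> "diaObjectId"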

201 

202 

203class ApdbCassandra(Apdb): 

204 """Implementation of APDB database on to of Apache Cassandra. 

205 

206 The implementation is configured via standard ``pex_config`` mechanism 

207 using the `ApdbCassandraConfig` configuration class. For examples of 

208 different configurations, check the config/ folder. 

209 

210 Parameters 

211 ---------- 

212 config : `ApdbCassandraConfig` 

213 Configuration object. 

214 """ 

215 

216 metadataSchemaVersionKey = "version:schema" 

217 """Name of the metadata key to store schema version number.""" 

218 

219 metadataCodeVersionKey = "version:ApdbCassandra" 

220 """Name of the metadata key to store code version number.""" 

221 

222 partition_zero_epoch = dafBase.DateTime(1970, 1, 1, 0, 0, 0, dafBase.DateTime.TAI) 

223 """Start time for partition 0, this should never be changed.""" 

224 

225 def __init__(self, config: ApdbCassandraConfig): 

226 if not CASSANDRA_IMPORTED: 

227 raise CassandraMissingError() 

228 

229 config.validate() 

230 self.config = config 

231 

232 _LOG.debug("ApdbCassandra Configuration:") 

233 for key, value in self.config.items(): 

234 _LOG.debug(" %s: %s", key, value) 

235 

236 self._pixelization = Pixelization( 

237 config.part_pixelization, config.part_pix_level, config.part_pix_max_ranges 

238 ) 

239 

240 addressTranslator: AddressTranslator | None = None 

241 if config.private_ips: 

242 addressTranslator = _AddressTranslator(list(config.contact_points), list(config.private_ips)) 

243 

244 self._keyspace = config.keyspace 

245 

246 self._cluster = Cluster( 

247 execution_profiles=self._makeProfiles(config), 

248 contact_points=self.config.contact_points, 

249 port=self.config.port, 

250 address_translator=addressTranslator, 

251 protocol_version=self.config.protocol_version, 

252 auth_provider=self._make_auth_provider(config), 

253 ) 

254 self._session = self._cluster.connect() 

255 # Disable result paging 

256 self._session.default_fetch_size = None 

257 

258 self._schema = ApdbCassandraSchema( 

259 session=self._session, 

260 keyspace=self._keyspace, 

261 schema_file=self.config.schema_file, 

262 schema_name=self.config.schema_name, 

263 prefix=self.config.prefix, 

264 time_partition_tables=self.config.time_partition_tables, 

265 use_insert_id=self.config.use_insert_id, 

266 ) 

267 self._partition_zero_epoch_mjd = self.partition_zero_epoch.get(system=dafBase.DateTime.MJD) 

268 

269 self._metadata: ApdbMetadataCassandra | None = None 

270 if not self._schema.empty(): 

271 self._metadata = ApdbMetadataCassandra(self._session, self._schema, self.config) 

272 self._versionCheck(self._metadata) 

273 

274 # Cache for prepared statements 

275 self._preparer = PreparedStatementCache(self._session) 

276 

277 def __del__(self) -> None: 

278 if hasattr(self, "_cluster"): 

279 self._cluster.shutdown() 

280 

281 def _make_auth_provider(self, config: ApdbCassandraConfig) -> AuthProvider | None: 

282 """Make Cassandra authentication provider instance.""" 

283 try: 

284 dbauth = DbAuth(DB_AUTH_PATH, DB_AUTH_ENVVAR) 

285 except DbAuthNotFoundError: 

286 # Credentials file doesn't exist, use anonymous login. 

287 return None 

288 

289 empty_username = True 

290 # Try every contact point in turn. 

291 for hostname in config.contact_points: 

292 try: 

293 username, password = dbauth.getAuth( 

294 "cassandra", config.username, hostname, config.port, config.keyspace 

295 ) 

296 if not username: 

297 # Password without user name, try next hostname, but give 

298 # warning later if no better match is found. 

299 empty_username = True 

300 else: 

301 return PlainTextAuthProvider(username=username, password=password) 

302 except DbAuthNotFoundError: 

303 pass 

304 

305 if empty_username: 

306 _LOG.warning( 

307 f"Credentials file ({DB_AUTH_PATH} or ${DB_AUTH_ENVVAR}) provided password but not " 

308 f"user name, anonymous Cassandra logon will be attempted." 

309 ) 

310 

311 return None 
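# Illustrative note, not part of the original source: `DbAuth` reads the
# credentials file named above (~/.lsst/db-auth.yaml by default). The entry
# layout below is an assumption based on lsst.utils.db_auth conventions, with
# placeholder host and user values:
#
#   - url: cassandra://cassandra-1.example.org:9042/apdb
#     username: apdb_user
#     password: "not-a-real-password"
#
# With a matching entry, getAuth("cassandra", ...) above resolves a
# username/password pair and a PlainTextAuthProvider is returned.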

312 

313 def _versionCheck(self, metadata: ApdbMetadataCassandra) -> None: 

314 """Check schema version compatibility.""" 

315 

316 def _get_version(key: str, default: VersionTuple) -> VersionTuple: 

317 """Retrieve version number from given metadata key.""" 

318 if metadata.table_exists(): 

319 version_str = metadata.get(key) 

320 if version_str is None: 

321 # Should not happen with existing metadata table. 

322 raise RuntimeError(f"Version key {key!r} does not exist in metadata table.") 

323 return VersionTuple.fromString(version_str) 

324 return default 

325 

326 # For old databases where metadata table does not exist we assume that 

327 # version of both code and schema is 0.1.0. 

328 initial_version = VersionTuple(0, 1, 0) 

329 db_schema_version = _get_version(self.metadataSchemaVersionKey, initial_version) 

330 db_code_version = _get_version(self.metadataCodeVersionKey, initial_version) 

331 

332 # For now there is no way to make read-only APDB instances, assume that 

333 # any access can do updates. 

334 if not self._schema.schemaVersion().checkCompatibility(db_schema_version, True): 

335 raise IncompatibleVersionError( 

336 f"Configured schema version {self._schema.schemaVersion()} " 

337 f"is not compatible with database version {db_schema_version}" 

338 ) 

339 if not self.apdbImplementationVersion().checkCompatibility(db_code_version, True): 

340 raise IncompatibleVersionError( 

341 f"Current code version {self.apdbImplementationVersion()} " 

342 f"is not compatible with database version {db_code_version}" 

343 ) 

344 

345 @classmethod 

346 def apdbImplementationVersion(cls) -> VersionTuple: 

347 # Docstring inherited from base class. 

348 return VERSION 

349 

350 def apdbSchemaVersion(self) -> VersionTuple: 

351 # Docstring inherited from base class. 

352 return self._schema.schemaVersion() 

353 

354 def tableDef(self, table: ApdbTables) -> Table | None: 

355 # docstring is inherited from a base class 

356 return self._schema.tableSchemas.get(table) 

357 

358 def makeSchema(self, drop: bool = False) -> None: 

359 # docstring is inherited from a base class 

360 

361 if self.config.time_partition_tables: 

362 time_partition_start = dafBase.DateTime(self.config.time_partition_start, dafBase.DateTime.TAI) 

363 time_partition_end = dafBase.DateTime(self.config.time_partition_end, dafBase.DateTime.TAI) 

364 part_range = ( 

365 self._time_partition(time_partition_start), 

366 self._time_partition(time_partition_end) + 1, 

367 ) 

368 self._schema.makeSchema(drop=drop, part_range=part_range) 

369 else: 

370 self._schema.makeSchema(drop=drop) 

371 

372 # Reset metadata after schema initialization. 

373 self._metadata = ApdbMetadataCassandra(self._session, self._schema, self.config) 

374 

375 # Fill version numbers, but only if they are not defined. 

376 if self._metadata.table_exists(): 

377 if self._metadata.get(self.metadataSchemaVersionKey) is None: 

378 self._metadata.set(self.metadataSchemaVersionKey, str(self._schema.schemaVersion())) 

379 if self._metadata.get(self.metadataCodeVersionKey) is None: 

380 self._metadata.set(self.metadataCodeVersionKey, str(self.apdbImplementationVersion())) 
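# Usage sketch, not part of the original source: creating the schema for a
# freshly configured instance (names are placeholders).
#
#   apdb = ApdbCassandra(example_config)
#   apdb.makeSchema(drop=False)   # creates tables and records the version metadata above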

381 

382 def getDiaObjects(self, region: sphgeom.Region) -> pandas.DataFrame: 

383 # docstring is inherited from a base class 

384 

385 sp_where = self._spatial_where(region) 

386 _LOG.debug("getDiaObjects: #partitions: %s", len(sp_where)) 

387 

388 # We need to exclude extra partitioning columns from result. 

389 column_names = self._schema.apdbColumnNames(ApdbTables.DiaObjectLast) 

390 what = ",".join(_quote_column(column) for column in column_names) 

391 

392 table_name = self._schema.tableName(ApdbTables.DiaObjectLast) 

393 query = f'SELECT {what} from "{self._keyspace}"."{table_name}"' 

394 statements: list[tuple] = [] 

395 for where, params in sp_where: 

396 full_query = f"{query} WHERE {where}" 

397 if params: 

398 statement = self._preparer.prepare(full_query) 

399 else: 

400 # If there are no params then it is likely that query has a 

401 # bunch of literals rendered already, no point trying to 

402 # prepare it because it's not reusable. 

403 statement = cassandra.query.SimpleStatement(full_query) 

404 statements.append((statement, params)) 

405 _LOG.debug("getDiaObjects: #queries: %s", len(statements)) 

406 

407 with Timer("DiaObject select", self.config.timer): 

408 objects = cast( 

409 pandas.DataFrame, 

410 select_concurrent( 

411 self._session, statements, "read_pandas_multi", self.config.read_concurrency 

412 ), 

413 ) 

414 

415 _LOG.debug("found %s DiaObjects", objects.shape[0]) 

416 return objects 
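# Usage sketch, not part of the original source: querying the method above
# with a small circular region (pointing and radius are placeholders).
#
#   center = sphgeom.UnitVector3d(sphgeom.LonLat.fromDegrees(35.0, -4.2))
#   region = sphgeom.Circle(center, sphgeom.Angle.fromDegrees(1.75 / 60.0))
#   objects = apdb.getDiaObjects(region)   # pandas.DataFrame of DiaObjectLast rows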

417 

418 def getDiaSources( 

419 self, region: sphgeom.Region, object_ids: Iterable[int] | None, visit_time: dafBase.DateTime 

420 ) -> pandas.DataFrame | None: 

421 # docstring is inherited from a base class 

422 months = self.config.read_sources_months 

423 if months == 0: 

424 return None 

425 mjd_end = visit_time.get(system=dafBase.DateTime.MJD) 

426 mjd_start = mjd_end - months * 30 

427 

428 return self._getSources(region, object_ids, mjd_start, mjd_end, ApdbTables.DiaSource) 

429 

430 def getDiaForcedSources( 

431 self, region: sphgeom.Region, object_ids: Iterable[int] | None, visit_time: dafBase.DateTime 

432 ) -> pandas.DataFrame | None: 

433 # docstring is inherited from a base class 

434 months = self.config.read_forced_sources_months 

435 if months == 0: 

436 return None 

437 mjd_end = visit_time.get(system=dafBase.DateTime.MJD) 

438 mjd_start = mjd_end - months * 30 

439 

440 return self._getSources(region, object_ids, mjd_start, mjd_end, ApdbTables.DiaForcedSource) 

441 

442 def containsVisitDetector(self, visit: int, detector: int) -> bool: 

443 # docstring is inherited from a base class 

444 raise NotImplementedError() 

445 

446 def getInsertIds(self) -> list[ApdbInsertId] | None: 

447 # docstring is inherited from a base class 

448 if not self._schema.has_insert_id: 

449 return None 

450 

451 # everything goes into a single partition 

452 partition = 0 

453 

454 table_name = self._schema.tableName(ExtraTables.DiaInsertId) 

455 query = f'SELECT insert_time, insert_id FROM "{self._keyspace}"."{table_name}" WHERE partition = ?' 

456 

457 result = self._session.execute( 

458 self._preparer.prepare(query), 

459 (partition,), 

460 timeout=self.config.read_timeout, 

461 execution_profile="read_tuples", 

462 ) 

463 # order by insert_time 

464 rows = sorted(result) 

465 return [ 

466 ApdbInsertId(id=row[1], insert_time=dafBase.DateTime(int(row[0].timestamp() * 1e9))) 

467 for row in rows 

468 ] 

469 

470 def deleteInsertIds(self, ids: Iterable[ApdbInsertId]) -> None: 

471 # docstring is inherited from a base class 

472 if not self._schema.has_insert_id: 

473 raise ValueError("APDB is not configured for history storage") 

474 

475 all_insert_ids = [id.id for id in ids] 

476 # There is 64k limit on number of markers in Cassandra CQL 

477 for insert_ids in chunk_iterable(all_insert_ids, 20_000): 

478 params = ",".join("?" * len(insert_ids)) 

479 

480 # everything goes into a single partition 

481 partition = 0 

482 

483 table_name = self._schema.tableName(ExtraTables.DiaInsertId) 

484 query = ( 

485 f'DELETE FROM "{self._keyspace}"."{table_name}" ' 

486 f"WHERE partition = ? AND insert_id IN ({params})" 

487 ) 

488 

489 self._session.execute( 

490 self._preparer.prepare(query), 

491 [partition] + list(insert_ids), 

492 timeout=self.config.remove_timeout, 

493 ) 

494 

495 # Also remove those insert_ids from Dia*InsertId tables. 

496 for table in ( 

497 ExtraTables.DiaObjectInsertId, 

498 ExtraTables.DiaSourceInsertId, 

499 ExtraTables.DiaForcedSourceInsertId, 

500 ): 

501 table_name = self._schema.tableName(table) 

502 query = f'DELETE FROM "{self._keyspace}"."{table_name}" WHERE insert_id IN ({params})' 

503 self._session.execute( 

504 self._preparer.prepare(query), 

505 insert_ids, 

506 timeout=self.config.remove_timeout, 

507 ) 

508 

509 def getDiaObjectsHistory(self, ids: Iterable[ApdbInsertId]) -> ApdbTableData: 

510 # docstring is inherited from a base class 

511 return self._get_history(ExtraTables.DiaObjectInsertId, ids) 

512 

513 def getDiaSourcesHistory(self, ids: Iterable[ApdbInsertId]) -> ApdbTableData: 

514 # docstring is inherited from a base class 

515 return self._get_history(ExtraTables.DiaSourceInsertId, ids) 

516 

517 def getDiaForcedSourcesHistory(self, ids: Iterable[ApdbInsertId]) -> ApdbTableData: 

518 # docstring is inherited from a base class 

519 return self._get_history(ExtraTables.DiaForcedSourceInsertId, ids) 

520 

521 def getSSObjects(self) -> pandas.DataFrame: 

522 # docstring is inherited from a base class 

523 tableName = self._schema.tableName(ApdbTables.SSObject) 

524 query = f'SELECT * from "{self._keyspace}"."{tableName}"' 

525 

526 objects = None 

527 with Timer("SSObject select", self.config.timer): 

528 result = self._session.execute(query, execution_profile="read_pandas") 

529 objects = result._current_rows 

530 

531 _LOG.debug("found %s DiaObjects", objects.shape[0]) 

532 return objects 

533 

534 def store( 

535 self, 

536 visit_time: dafBase.DateTime, 

537 objects: pandas.DataFrame, 

538 sources: pandas.DataFrame | None = None, 

539 forced_sources: pandas.DataFrame | None = None, 

540 ) -> None: 

541 # docstring is inherited from a base class 

542 

543 insert_id: ApdbInsertId | None = None 

544 if self._schema.has_insert_id: 

545 insert_id = ApdbInsertId.new_insert_id(visit_time) 

546 self._storeInsertId(insert_id, visit_time) 

547 

548 # fill region partition column for DiaObjects 

549 objects = self._add_obj_part(objects) 

550 self._storeDiaObjects(objects, visit_time, insert_id) 

551 

552 if sources is not None: 

553 # copy apdb_part column from DiaObjects to DiaSources 

554 sources = self._add_src_part(sources, objects) 

555 self._storeDiaSources(ApdbTables.DiaSource, sources, visit_time, insert_id) 

556 self._storeDiaSourcesPartitions(sources, visit_time, insert_id) 

557 

558 if forced_sources is not None: 

559 forced_sources = self._add_fsrc_part(forced_sources, objects) 

560 self._storeDiaSources(ApdbTables.DiaForcedSource, forced_sources, visit_time, insert_id) 
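# Usage sketch, not part of the original source: `objects`, `sources` and
# `forced_sources` are pandas DataFrames following the APDB schema; the
# partitioning code above needs the configured ra/dec columns on DiaObjects
# and a `diaObjectId` column on both source catalogs.
#
#   visit_time = dafBase.DateTime("2023-03-01T06:15:00", dafBase.DateTime.TAI)
#   apdb.store(visit_time, objects, sources=sources, forced_sources=forced_sources)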

561 

562 def storeSSObjects(self, objects: pandas.DataFrame) -> None: 

563 # docstring is inherited from a base class 

564 self._storeObjectsPandas(objects, ApdbTables.SSObject) 

565 

566 def reassignDiaSources(self, idMap: Mapping[int, int]) -> None: 

567 # docstring is inherited from a base class 

568 

569 # To update a record we need to know its exact primary key (including 

570 # partition key) so we start by querying for diaSourceId to find the 

571 # primary keys. 

572 

573 table_name = self._schema.tableName(ExtraTables.DiaSourceToPartition) 

574 # split it into 1k IDs per query 

575 selects: list[tuple] = [] 

576 for ids in chunk_iterable(idMap.keys(), 1_000): 

577 ids_str = ",".join(str(item) for item in ids) 

578 selects.append( 

579 ( 

580 ( 

581 'SELECT "diaSourceId", "apdb_part", "apdb_time_part", "insert_id" ' 

582 f'FROM "{self._keyspace}"."{table_name}" WHERE "diaSourceId" IN ({ids_str})' 

583 ), 

584 {}, 

585 ) 

586 ) 

587 

588 # No need for DataFrame here, read data as tuples. 

589 result = cast( 

590 list[tuple[int, int, int, uuid.UUID | None]], 

591 select_concurrent(self._session, selects, "read_tuples", self.config.read_concurrency), 

592 ) 

593 

594 # Make mapping from source ID to its partition. 

595 id2partitions: dict[int, tuple[int, int]] = {} 

596 id2insert_id: dict[int, uuid.UUID] = {} 

597 for row in result: 

598 id2partitions[row[0]] = row[1:3] 

599 if row[3] is not None: 

600 id2insert_id[row[0]] = row[3] 

601 

602 # make sure we know partitions for each ID 

603 if set(id2partitions) != set(idMap): 

604 missing = ",".join(str(item) for item in set(idMap) - set(id2partitions)) 

605 raise ValueError(f"Following DiaSource IDs do not exist in the database: {missing}") 

606 

607 # Reassign in standard tables 

608 queries = cassandra.query.BatchStatement() 

609 table_name = self._schema.tableName(ApdbTables.DiaSource) 

610 for diaSourceId, ssObjectId in idMap.items(): 

611 apdb_part, apdb_time_part = id2partitions[diaSourceId] 

612 values: tuple 

613 if self.config.time_partition_tables: 

614 query = ( 

615 f'UPDATE "{self._keyspace}"."{table_name}_{apdb_time_part}"' 

616 ' SET "ssObjectId" = ?, "diaObjectId" = NULL' 

617 ' WHERE "apdb_part" = ? AND "diaSourceId" = ?' 

618 ) 

619 values = (ssObjectId, apdb_part, diaSourceId) 

620 else: 

621 query = ( 

622 f'UPDATE "{self._keyspace}"."{table_name}"' 

623 ' SET "ssObjectId" = ?, "diaObjectId" = NULL' 

624 ' WHERE "apdb_part" = ? AND "apdb_time_part" = ? AND "diaSourceId" = ?' 

625 ) 

626 values = (ssObjectId, apdb_part, apdb_time_part, diaSourceId) 

627 queries.add(self._preparer.prepare(query), values) 

628 

629 # Reassign in history tables, only if history is enabled 

630 if id2insert_id: 

631 # Filter out insert ids that have been deleted already. There is a 

632 # potential race with concurrent removal of insert IDs, but it 

633 # should be handled by WHERE in UPDATE. 

634 known_ids = set() 

635 if insert_ids := self.getInsertIds(): 

636 known_ids = set(insert_id.id for insert_id in insert_ids) 

637 id2insert_id = {key: value for key, value in id2insert_id.items() if value in known_ids} 

638 if id2insert_id: 

639 table_name = self._schema.tableName(ExtraTables.DiaSourceInsertId) 

640 for diaSourceId, ssObjectId in idMap.items(): 

641 if insert_id := id2insert_id.get(diaSourceId): 

642 query = ( 

643 f'UPDATE "{self._keyspace}"."{table_name}" ' 

644 ' SET "ssObjectId" = ?, "diaObjectId" = NULL ' 

645 'WHERE "insert_id" = ? AND "diaSourceId" = ?' 

646 ) 

647 values = (ssObjectId, insert_id, diaSourceId) 

648 queries.add(self._preparer.prepare(query), values) 

649 

650 _LOG.debug("%s: will update %d records", table_name, len(idMap)) 

651 with Timer(table_name + " update", self.config.timer): 

652 self._session.execute(queries, execution_profile="write") 

653 

654 def dailyJob(self) -> None: 

655 # docstring is inherited from a base class 

656 pass 

657 

658 def countUnassociatedObjects(self) -> int: 

659 # docstring is inherited from a base class 

660 

661 # It's too inefficient to implement it for Cassandra in current schema. 

662 raise NotImplementedError() 

663 

664 @property 

665 def metadata(self) -> ApdbMetadata: 

666 # docstring is inherited from a base class 

667 if self._metadata is None: 

668 raise RuntimeError("Database schema was not initialized.") 

669 return self._metadata 

670 

671 def _makeProfiles(self, config: ApdbCassandraConfig) -> Mapping[Any, ExecutionProfile]: 

672 """Make all execution profiles used in the code.""" 

673 if config.private_ips: 

674 loadBalancePolicy = WhiteListRoundRobinPolicy(hosts=config.contact_points) 

675 else: 

676 loadBalancePolicy = RoundRobinPolicy() 

677 

678 read_tuples_profile = ExecutionProfile( 

679 consistency_level=getattr(cassandra.ConsistencyLevel, config.read_consistency), 

680 request_timeout=config.read_timeout, 

681 row_factory=cassandra.query.tuple_factory, 

682 load_balancing_policy=loadBalancePolicy, 

683 ) 

684 read_pandas_profile = ExecutionProfile( 

685 consistency_level=getattr(cassandra.ConsistencyLevel, config.read_consistency), 

686 request_timeout=config.read_timeout, 

687 row_factory=pandas_dataframe_factory, 

688 load_balancing_policy=loadBalancePolicy, 

689 ) 

690 read_raw_profile = ExecutionProfile( 

691 consistency_level=getattr(cassandra.ConsistencyLevel, config.read_consistency), 

692 request_timeout=config.read_timeout, 

693 row_factory=raw_data_factory, 

694 load_balancing_policy=loadBalancePolicy, 

695 ) 

696 # Profile to use with select_concurrent to return pandas data frame 

697 read_pandas_multi_profile = ExecutionProfile( 

698 consistency_level=getattr(cassandra.ConsistencyLevel, config.read_consistency), 

699 request_timeout=config.read_timeout, 

700 row_factory=pandas_dataframe_factory, 

701 load_balancing_policy=loadBalancePolicy, 

702 ) 

703 # Profile to use with select_concurrent to return raw data (columns and 

704 # rows) 

705 read_raw_multi_profile = ExecutionProfile( 

706 consistency_level=getattr(cassandra.ConsistencyLevel, config.read_consistency), 

707 request_timeout=config.read_timeout, 

708 row_factory=raw_data_factory, 

709 load_balancing_policy=loadBalancePolicy, 

710 ) 

711 write_profile = ExecutionProfile( 

712 consistency_level=getattr(cassandra.ConsistencyLevel, config.write_consistency), 

713 request_timeout=config.write_timeout, 

714 load_balancing_policy=loadBalancePolicy, 

715 ) 

716 # To replace default DCAwareRoundRobinPolicy 

717 default_profile = ExecutionProfile( 

718 load_balancing_policy=loadBalancePolicy, 

719 ) 

720 return { 

721 "read_tuples": read_tuples_profile, 

722 "read_pandas": read_pandas_profile, 

723 "read_raw": read_raw_profile, 

724 "read_pandas_multi": read_pandas_multi_profile, 

725 "read_raw_multi": read_raw_multi_profile, 

726 "write": write_profile, 

727 EXEC_PROFILE_DEFAULT: default_profile, 

728 } 
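# Illustrative note, not part of the original source: the keys returned above
# are the names the rest of this class passes as ``execution_profile=...``,
# for example
#
#   self._session.execute(query, execution_profile="read_pandas")
#
# while EXEC_PROFILE_DEFAULT replaces the driver's default profile.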

729 

730 def _getSources( 

731 self, 

732 region: sphgeom.Region, 

733 object_ids: Iterable[int] | None, 

734 mjd_start: float, 

735 mjd_end: float, 

736 table_name: ApdbTables, 

737 ) -> pandas.DataFrame: 

738 """Return catalog of DiaSource instances given set of DiaObject IDs. 

739 

740 Parameters 

741 ---------- 

742 region : `lsst.sphgeom.Region` 

743 Spherical region. 

744 object_ids : iterable [`int`], or `None` 

745 Collection of DiaObject IDs; if `None` then sources are not filtered by object ID. 

746 mjd_start : `float` 

747 Lower bound of time interval. 

748 mjd_end : `float` 

749 Upper bound of time interval. 

750 table_name : `ApdbTables` 

751 Name of the table. 

752 

753 Returns 

754 ------- 

755 catalog : `pandas.DataFrame` 

756 Catalog containing DiaSource records. Empty catalog is returned if 

757 ``object_ids`` is empty. 

758 """ 

759 object_id_set: Set[int] = set() 

760 if object_ids is not None: 

761 object_id_set = set(object_ids) 

762 if len(object_id_set) == 0: 

763 return self._make_empty_catalog(table_name) 

764 

765 sp_where = self._spatial_where(region) 

766 tables, temporal_where = self._temporal_where(table_name, mjd_start, mjd_end) 

767 

768 # We need to exclude extra partitioning columns from result. 

769 column_names = self._schema.apdbColumnNames(table_name) 

770 what = ",".join(_quote_column(column) for column in column_names) 

771 

772 # Build all queries 

773 statements: list[tuple] = [] 

774 for table in tables: 

775 prefix = f'SELECT {what} from "{self._keyspace}"."{table}"' 

776 statements += list(self._combine_where(prefix, sp_where, temporal_where)) 

777 _LOG.debug("_getSources %s: #queries: %s", table_name, len(statements)) 

778 

779 with Timer(table_name.name + " select", self.config.timer): 

780 catalog = cast( 

781 pandas.DataFrame, 

782 select_concurrent( 

783 self._session, statements, "read_pandas_multi", self.config.read_concurrency 

784 ), 

785 ) 

786 

787 # filter by given object IDs 

788 if len(object_id_set) > 0: 

789 catalog = cast(pandas.DataFrame, catalog[catalog["diaObjectId"].isin(object_id_set)]) 

790 

791 # precise filtering on midpointMjdTai 

792 catalog = cast(pandas.DataFrame, catalog[catalog["midpointMjdTai"] > mjd_start]) 

793 

794 _LOG.debug("found %d %ss", catalog.shape[0], table_name.name) 

795 return catalog 

796 

797 def _get_history(self, table: ExtraTables, ids: Iterable[ApdbInsertId]) -> ApdbTableData: 

798 """Return records from a particular table given set of insert IDs.""" 

799 if not self._schema.has_insert_id: 

800 raise ValueError("APDB is not configured for history retrieval") 

801 

802 insert_ids = [id.id for id in ids] 

803 params = ",".join("?" * len(insert_ids)) 

804 

805 table_name = self._schema.tableName(table) 

806 # I know that history table schema has only regular APDB columns plus 

807 # an insert_id column, and this is exactly what we need to return from 

808 # this method, so selecting a star is fine here. 

809 query = f'SELECT * FROM "{self._keyspace}"."{table_name}" WHERE insert_id IN ({params})' 

810 statement = self._preparer.prepare(query) 

811 

812 with Timer("DiaObject history", self.config.timer): 

813 result = self._session.execute(statement, insert_ids, execution_profile="read_raw") 

814 table_data = cast(ApdbCassandraTableData, result._current_rows) 

815 return table_data 

816 

817 def _storeInsertId(self, insert_id: ApdbInsertId, visit_time: dafBase.DateTime) -> None: 

818 # Cassandra timestamp uses milliseconds since epoch 

819 timestamp = insert_id.insert_time.nsecs() // 1_000_000 

820 

821 # everything goes into a single partition 

822 partition = 0 

823 

824 table_name = self._schema.tableName(ExtraTables.DiaInsertId) 

825 query = ( 

826 f'INSERT INTO "{self._keyspace}"."{table_name}" (partition, insert_id, insert_time) ' 

827 "VALUES (?, ?, ?)" 

828 ) 

829 

830 self._session.execute( 

831 self._preparer.prepare(query), 

832 (partition, insert_id.id, timestamp), 

833 timeout=self.config.write_timeout, 

834 execution_profile="write", 

835 ) 

836 

837 def _storeDiaObjects( 

838 self, objs: pandas.DataFrame, visit_time: dafBase.DateTime, insert_id: ApdbInsertId | None 

839 ) -> None: 

840 """Store catalog of DiaObjects from current visit. 

841 

842 Parameters 

843 ---------- 

844 objs : `pandas.DataFrame` 

845 Catalog with DiaObject records 

846 visit_time : `lsst.daf.base.DateTime` 

847 Time of the current visit. 

848 """ 

849 visit_time_dt = visit_time.toPython() 

850 extra_columns = dict(lastNonForcedSource=visit_time_dt) 

851 self._storeObjectsPandas(objs, ApdbTables.DiaObjectLast, extra_columns=extra_columns) 

852 

853 extra_columns["validityStart"] = visit_time_dt 

854 time_part: int | None = self._time_partition(visit_time) 

855 if not self.config.time_partition_tables: 

856 extra_columns["apdb_time_part"] = time_part 

857 time_part = None 

858 

859 # Only store DiaObjects if not storing insert_ids or explicitly 

860 # configured to always store them 

861 if insert_id is None or not self.config.use_insert_id_skips_diaobjects: 

862 self._storeObjectsPandas( 

863 objs, ApdbTables.DiaObject, extra_columns=extra_columns, time_part=time_part 

864 ) 

865 

866 if insert_id is not None: 

867 extra_columns = dict(insert_id=insert_id.id, validityStart=visit_time_dt) 

868 self._storeObjectsPandas(objs, ExtraTables.DiaObjectInsertId, extra_columns=extra_columns) 

869 

870 def _storeDiaSources( 

871 self, 

872 table_name: ApdbTables, 

873 sources: pandas.DataFrame, 

874 visit_time: dafBase.DateTime, 

875 insert_id: ApdbInsertId | None, 

876 ) -> None: 

877 """Store catalog of DIASources or DIAForcedSources from current visit. 

878 

879 Parameters 

880 ---------- 

881 sources : `pandas.DataFrame` 

882 Catalog containing DiaSource records 

883 visit_time : `lsst.daf.base.DateTime` 

884 Time of the current visit. 

885 """ 

886 time_part: int | None = self._time_partition(visit_time) 

887 extra_columns: dict[str, Any] = {} 

888 if not self.config.time_partition_tables: 

889 extra_columns["apdb_time_part"] = time_part 

890 time_part = None 

891 

892 self._storeObjectsPandas(sources, table_name, extra_columns=extra_columns, time_part=time_part) 

893 

894 if insert_id is not None: 

895 extra_columns = dict(insert_id=insert_id.id) 

896 if table_name is ApdbTables.DiaSource: 

897 extra_table = ExtraTables.DiaSourceInsertId 

898 else: 

899 extra_table = ExtraTables.DiaForcedSourceInsertId 

900 self._storeObjectsPandas(sources, extra_table, extra_columns=extra_columns) 

901 

902 def _storeDiaSourcesPartitions( 

903 self, sources: pandas.DataFrame, visit_time: dafBase.DateTime, insert_id: ApdbInsertId | None 

904 ) -> None: 

905 """Store mapping of diaSourceId to its partitioning values. 

906 

907 Parameters 

908 ---------- 

909 sources : `pandas.DataFrame` 

910 Catalog containing DiaSource records 

911 visit_time : `lsst.daf.base.DateTime` 

912 Time of the current visit. 

913 """ 

914 id_map = cast(pandas.DataFrame, sources[["diaSourceId", "apdb_part"]]) 

915 extra_columns = { 

916 "apdb_time_part": self._time_partition(visit_time), 

917 "insert_id": insert_id.id if insert_id is not None else None, 

918 } 

919 

920 self._storeObjectsPandas( 

921 id_map, ExtraTables.DiaSourceToPartition, extra_columns=extra_columns, time_part=None 

922 ) 

923 

924 def _storeObjectsPandas( 

925 self, 

926 records: pandas.DataFrame, 

927 table_name: ApdbTables | ExtraTables, 

928 extra_columns: Mapping | None = None, 

929 time_part: int | None = None, 

930 ) -> None: 

931 """Store generic objects. 

932 

933 Takes a Pandas catalog and stores its records in a table. 

934 

935 Parameters 

936 ---------- 

937 records : `pandas.DataFrame` 

938 Catalog containing object records 

939 table_name : `ApdbTables` or `ExtraTables` 

940 Name of the table as defined in APDB schema. 

941 extra_columns : `dict`, optional 

942 Mapping (column_name, column_value) which gives fixed values for 

943 columns in each row, overrides values in ``records`` if matching 

944 columns exist there. 

945 time_part : `int`, optional 

946 If not `None` then insert into a per-partition table. 

947 

948 Notes 

949 ----- 

950 If the Pandas catalog contains additional columns not defined in the table 

951 schema, they are ignored. The catalog does not have to contain all columns 

952 defined in a table, but partition and clustering keys must be present 

953 either in the catalog or in ``extra_columns``. 

954 """ 

955 # use extra columns if specified 

956 if extra_columns is None: 

957 extra_columns = {} 

958 extra_fields = list(extra_columns.keys()) 

959 

960 # Fields that will come from dataframe. 

961 df_fields = [column for column in records.columns if column not in extra_fields] 

962 

963 column_map = self._schema.getColumnMap(table_name) 

964 # list of columns (as in felis schema) 

965 fields = [column_map[field].name for field in df_fields if field in column_map] 

966 fields += extra_fields 

967 

968 # check that all partitioning and clustering columns are defined 

969 required_columns = self._schema.partitionColumns(table_name) + self._schema.clusteringColumns( 

970 table_name 

971 ) 

972 missing_columns = [column for column in required_columns if column not in fields] 

973 if missing_columns: 

974 raise ValueError(f"Primary key columns are missing from catalog: {missing_columns}") 

975 

976 qfields = [quote_id(field) for field in fields] 

977 qfields_str = ",".join(qfields) 

978 

979 with Timer(table_name.name + " query build", self.config.timer): 

980 table = self._schema.tableName(table_name) 

981 if time_part is not None: 

982 table = f"{table}_{time_part}" 

983 

984 holders = ",".join(["?"] * len(qfields)) 

985 query = f'INSERT INTO "{self._keyspace}"."{table}" ({qfields_str}) VALUES ({holders})' 

986 statement = self._preparer.prepare(query) 

987 queries = cassandra.query.BatchStatement() 

988 for rec in records.itertuples(index=False): 

989 values = [] 

990 for field in df_fields: 

991 if field not in column_map: 

992 continue 

993 value = getattr(rec, field) 

994 if column_map[field].datatype is felis.types.Timestamp: 

995 if isinstance(value, pandas.Timestamp): 

996 value = literal(value.to_pydatetime()) 

997 else: 

998 # Assume it's seconds since epoch, Cassandra 

999 # datetime is in milliseconds 

1000 value = int(value * 1000) 

1001 values.append(literal(value)) 

1002 for field in extra_fields: 

1003 value = extra_columns[field] 

1004 values.append(literal(value)) 

1005 queries.add(statement, values) 

1006 

1007 _LOG.debug("%s: will store %d records", self._schema.tableName(table_name), records.shape[0]) 

1008 with Timer(table_name.name + " insert", self.config.timer): 

1009 self._session.execute(queries, timeout=self.config.write_timeout, execution_profile="write") 

1010 

1011 def _add_obj_part(self, df: pandas.DataFrame) -> pandas.DataFrame: 

1012 """Calculate spatial partition for each record and add it to a 

1013 DataFrame. 

1014 

1015 Notes 

1016 ----- 

1017 This overrides any existing column in a DataFrame with the same name 

1018 (apdb_part). Original DataFrame is not changed, copy of a DataFrame is 

1019 returned. 

1020 """ 

1021 # calculate HTM index for every DiaObject 

1022 apdb_part = np.zeros(df.shape[0], dtype=np.int64) 

1023 ra_col, dec_col = self.config.ra_dec_columns 

1024 for i, (ra, dec) in enumerate(zip(df[ra_col], df[dec_col])): 

1025 uv3d = sphgeom.UnitVector3d(sphgeom.LonLat.fromDegrees(ra, dec)) 

1026 idx = self._pixelization.pixel(uv3d) 

1027 apdb_part[i] = idx 

1028 df = df.copy() 

1029 df["apdb_part"] = apdb_part 

1030 return df 

1031 

1032 def _add_src_part(self, sources: pandas.DataFrame, objs: pandas.DataFrame) -> pandas.DataFrame: 

1033 """Add apdb_part column to DiaSource catalog. 

1034 

1035 Notes 

1036 ----- 

1037 This method copies apdb_part value from a matching DiaObject record. 

1038 DiaObject catalog needs to have an apdb_part column filled by 

1039 ``_add_obj_part`` method and DiaSource records need to be 

1040 associated with DiaObjects via the ``diaObjectId`` column. 

1041 

1042 This overrides any existing column in a DataFrame with the same name 

1043 (apdb_part). Original DataFrame is not changed, copy of a DataFrame is 

1044 returned. 

1045 """ 

1046 pixel_id_map: dict[int, int] = { 

1047 diaObjectId: apdb_part for diaObjectId, apdb_part in zip(objs["diaObjectId"], objs["apdb_part"]) 

1048 } 

1049 apdb_part = np.zeros(sources.shape[0], dtype=np.int64) 

1050 ra_col, dec_col = self.config.ra_dec_columns 

1051 for i, (diaObjId, ra, dec) in enumerate( 

1052 zip(sources["diaObjectId"], sources[ra_col], sources[dec_col]) 

1053 ): 

1054 if diaObjId == 0: 

1055 # DiaSources associated with SolarSystemObjects do not have an 

1056 # associated DiaObject hence we skip them and set partition 

1057 # based on its own ra/dec 

1058 uv3d = sphgeom.UnitVector3d(sphgeom.LonLat.fromDegrees(ra, dec)) 

1059 idx = self._pixelization.pixel(uv3d) 

1060 apdb_part[i] = idx 

1061 else: 

1062 apdb_part[i] = pixel_id_map[diaObjId] 

1063 sources = sources.copy() 

1064 sources["apdb_part"] = apdb_part 

1065 return sources 

1066 

1067 def _add_fsrc_part(self, sources: pandas.DataFrame, objs: pandas.DataFrame) -> pandas.DataFrame: 

1068 """Add apdb_part column to DiaForcedSource catalog. 

1069 

1070 Notes 

1071 ----- 

1072 This method copies apdb_part value from a matching DiaObject record. 

1073 DiaObject catalog needs to have an apdb_part column filled by 

1074 ``_add_obj_part`` method and DiaForcedSource records need to be 

1075 associated with DiaObjects via the ``diaObjectId`` column. 

1076 

1077 This overrides any existing column in a DataFrame with the same name 

1078 (apdb_part). Original DataFrame is not changed, copy of a DataFrame is 

1079 returned. 

1080 """ 

1081 pixel_id_map: dict[int, int] = { 

1082 diaObjectId: apdb_part for diaObjectId, apdb_part in zip(objs["diaObjectId"], objs["apdb_part"]) 

1083 } 

1084 apdb_part = np.zeros(sources.shape[0], dtype=np.int64) 

1085 for i, diaObjId in enumerate(sources["diaObjectId"]): 

1086 apdb_part[i] = pixel_id_map[diaObjId] 

1087 sources = sources.copy() 

1088 sources["apdb_part"] = apdb_part 

1089 return sources 

1090 

1091 def _time_partition(self, time: float | dafBase.DateTime) -> int: 

1092 """Calculate time partiton number for a given time. 

1093 

1094 Parameters 

1095 ---------- 

1096 time : `float` or `lsst.daf.base.DateTime` 

1097 Time for which to calculate the partition number, either as an MJD 

1098 value (`float`) or as an `lsst.daf.base.DateTime`. 

1099 

1100 Returns 

1101 ------- 

1102 partition : `int` 

1103 Partition number for a given time. 

1104 """ 

1105 if isinstance(time, dafBase.DateTime): 

1106 mjd = time.get(system=dafBase.DateTime.MJD) 

1107 else: 

1108 mjd = time 

1109 days_since_epoch = mjd - self._partition_zero_epoch_mjd 

1110 partition = int(days_since_epoch) // self.config.time_partition_days 

1111 return partition 
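# Worked example, not part of the original source, using the defaults above:
# partition_zero_epoch is 1970-01-01 TAI (MJD 40587) and time_partition_days
# is 30, so for a visit at MJD 58453 (2018-12-01):
#
#   days_since_epoch = 58453 - 40587    # 17866
#   partition = 17866 // 30             # 595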

1112 

1113 def _make_empty_catalog(self, table_name: ApdbTables) -> pandas.DataFrame: 

1114 """Make an empty catalog for a table with a given name. 

1115 

1116 Parameters 

1117 ---------- 

1118 table_name : `ApdbTables` 

1119 Name of the table. 

1120 

1121 Returns 

1122 ------- 

1123 catalog : `pandas.DataFrame` 

1124 An empty catalog. 

1125 """ 

1126 table = self._schema.tableSchemas[table_name] 

1127 

1128 data = { 

1129 columnDef.name: pandas.Series(dtype=self._schema.column_dtype(columnDef.datatype)) 

1130 for columnDef in table.columns 

1131 } 

1132 return pandas.DataFrame(data) 

1133 

1134 def _combine_where( 

1135 self, 

1136 prefix: str, 

1137 where1: list[tuple[str, tuple]], 

1138 where2: list[tuple[str, tuple]], 

1139 suffix: str | None = None, 

1140 ) -> Iterator[tuple[cassandra.query.Statement, tuple]]: 

1141 """Make cartesian product of two parts of WHERE clause into a series 

1142 of statements to execute. 

1143 

1144 Parameters 

1145 ---------- 

1146 prefix : `str` 

1147 Initial statement prefix that comes before WHERE clause, e.g. 

1148 "SELECT * from Table" 

1149 """ 

1150 # If lists are empty use special sentinels. 

1151 if not where1: 

1152 where1 = [("", ())] 

1153 if not where2: 

1154 where2 = [("", ())] 

1155 

1156 for expr1, params1 in where1: 

1157 for expr2, params2 in where2: 

1158 full_query = prefix 

1159 wheres = [] 

1160 if expr1: 

1161 wheres.append(expr1) 

1162 if expr2: 

1163 wheres.append(expr2) 

1164 if wheres: 

1165 full_query += " WHERE " + " AND ".join(wheres) 

1166 if suffix: 

1167 full_query += " " + suffix 

1168 params = params1 + params2 

1169 if params: 

1170 statement = self._preparer.prepare(full_query) 

1171 else: 

1172 # If there are no params then it is likely that query 

1173 # has a bunch of literals rendered already, no point 

1174 # trying to prepare it. 

1175 statement = cassandra.query.SimpleStatement(full_query) 

1176 yield (statement, params) 
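# Illustrative example, not part of the original source: with
#   where1 = [('"apdb_part" = ?', (10,)), ('"apdb_part" = ?', (11,))]
#   where2 = [('"apdb_time_part" IN (595,596)', ())]
# the generator yields two prepared statements of the form
#   <prefix> WHERE "apdb_part" = ? AND "apdb_time_part" IN (595,596)
# paired with parameter tuples (10,) and (11,) respectively.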

1177 

1178 def _spatial_where( 

1179 self, region: sphgeom.Region | None, use_ranges: bool = False 

1180 ) -> list[tuple[str, tuple]]: 

1181 """Generate expressions for spatial part of WHERE clause. 

1182 

1183 Parameters 

1184 ---------- 

1185 region : `sphgeom.Region` 

1186 Spatial region for query results. 

1187 use_ranges : `bool` 

1188 If True then use pixel ranges ("apdb_part >= p1 AND apdb_part <= 

1189 p2") instead of exact list of pixels. Should be set to True for 

1190 large regions covering very many pixels. 

1191 

1192 Returns 

1193 ------- 

1194 expressions : `list` [ `tuple` ] 

1195 Empty list is returned if ``region`` is `None`, otherwise a list 

1196 of one or more (expression, parameters) tuples 

1197 """ 

1198 if region is None: 

1199 return [] 

1200 if use_ranges: 

1201 pixel_ranges = self._pixelization.envelope(region) 

1202 expressions: list[tuple[str, tuple]] = [] 

1203 for lower, upper in pixel_ranges: 

1204 upper -= 1 

1205 if lower == upper: 

1206 expressions.append(('"apdb_part" = ?', (lower,))) 

1207 else: 

1208 expressions.append(('"apdb_part" >= ? AND "apdb_part" <= ?', (lower, upper))) 

1209 return expressions 

1210 else: 

1211 pixels = self._pixelization.pixels(region) 

1212 if self.config.query_per_spatial_part: 

1213 return [('"apdb_part" = ?', (pixel,)) for pixel in pixels] 

1214 else: 

1215 pixels_str = ",".join([str(pix) for pix in pixels]) 

1216 return [(f'"apdb_part" IN ({pixels_str})', ())] 
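# Illustrative example, not part of the original source: for a region whose
# envelope maps to pixels [10, 11, 12], the default branch returns
#   [('"apdb_part" IN (10,11,12)', ())]
# while query_per_spatial_part=True returns one tuple per pixel:
#   [('"apdb_part" = ?', (10,)), ('"apdb_part" = ?', (11,)), ('"apdb_part" = ?', (12,))]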

1217 

1218 def _temporal_where( 

1219 self, 

1220 table: ApdbTables, 

1221 start_time: float | dafBase.DateTime, 

1222 end_time: float | dafBase.DateTime, 

1223 query_per_time_part: bool | None = None, 

1224 ) -> tuple[list[str], list[tuple[str, tuple]]]: 

1225 """Generate table names and expressions for temporal part of WHERE 

1226 clauses. 

1227 

1228 Parameters 

1229 ---------- 

1230 table : `ApdbTables` 

1231 Table to select from. 

1232 start_time : `dafBase.DateTime` or `float` 

1233 Starting DateTime or MJD value of the time range. 

1234 end_time : `dafBase.DateTime` or `float` 

1235 Ending DateTime or MJD value of the time range. 

1236 query_per_time_part : `bool`, optional 

1237 If None then use ``query_per_time_part`` from configuration. 

1238 

1239 Returns 

1240 ------- 

1241 tables : `list` [ `str` ] 

1242 List of the table names to query. 

1243 expressions : `list` [ `tuple` ] 

1244 A list of zero or more (expression, parameters) tuples. 

1245 """ 

1246 tables: list[str] 

1247 temporal_where: list[tuple[str, tuple]] = [] 

1248 table_name = self._schema.tableName(table) 

1249 time_part_start = self._time_partition(start_time) 

1250 time_part_end = self._time_partition(end_time) 

1251 time_parts = list(range(time_part_start, time_part_end + 1)) 

1252 if self.config.time_partition_tables: 

1253 tables = [f"{table_name}_{part}" for part in time_parts] 

1254 else: 

1255 tables = [table_name] 

1256 if query_per_time_part is None: 

1257 query_per_time_part = self.config.query_per_time_part 

1258 if query_per_time_part: 

1259 temporal_where = [('"apdb_time_part" = ?', (time_part,)) for time_part in time_parts] 

1260 else: 

1261 time_part_list = ",".join([str(part) for part in time_parts]) 

1262 temporal_where = [(f'"apdb_time_part" IN ({time_part_list})', ())] 

1263 

1264 return tables, temporal_where
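# Illustrative example, not part of the original source: for a DiaSource query
# spanning time partitions 595..596 (table names shown without any configured
# prefix), time_partition_tables=True yields
#   tables = ["DiaSource_595", "DiaSource_596"], temporal_where = []
# while time_partition_tables=False with query_per_time_part=False yields
#   tables = ["DiaSource"]
#   temporal_where = [('"apdb_time_part" IN (595,596)', ())]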