Coverage for python/lsst/dax/apdb/apdb.py: 10%

# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
22"""Module defining Apdb class and related methods.
23"""
25__all__ = ["ApdbConfig", "Apdb", "Visit"]

from collections import namedtuple
from contextlib import contextmanager
from datetime import datetime
import logging
import numpy as np
import os
import pandas

import lsst.geom as geom
import lsst.afw.table as afwTable
import lsst.pex.config as pexConfig
from lsst.pex.config import Field, ChoiceField, ListField
import sqlalchemy
from sqlalchemy import (func, sql)
from sqlalchemy.pool import NullPool
from . import timer, apdbSchema


_LOG = logging.getLogger(__name__.partition(".")[2])  # strip leading "lsst."


class Timer(object):
    """Timer class defining context manager which tracks execution timing.

    Typical use:

        with Timer("timer_name"):
            do_something

    On exit from the block it will print elapsed time.

    See also :py:mod:`timer` module.
    """

    def __init__(self, name, do_logging=True, log_before_cursor_execute=False):
        self._log_before_cursor_execute = log_before_cursor_execute
        self._do_logging = do_logging
        self._timer1 = timer.Timer(name)
        self._timer2 = timer.Timer(name + " (before/after cursor)")

    def __enter__(self):
        """
        Enter context, start timer
        """
#        event.listen(engine.Engine, "before_cursor_execute", self._start_timer)
#        event.listen(engine.Engine, "after_cursor_execute", self._stop_timer)
        self._timer1.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Exit context, stop and dump timer
        """
        if exc_type is None:
            self._timer1.stop()
            if self._do_logging:
                self._timer1.dump()
#        event.remove(engine.Engine, "before_cursor_execute", self._start_timer)
#        event.remove(engine.Engine, "after_cursor_execute", self._stop_timer)
        return False

    def _start_timer(self, conn, cursor, statement, parameters, context, executemany):
        """Start counting"""
        if self._log_before_cursor_execute:
            _LOG.info("before_cursor_execute")
        self._timer2.start()

    def _stop_timer(self, conn, cursor, statement, parameters, context, executemany):
        """Stop counting"""
        self._timer2.stop()
        if self._do_logging:
            self._timer2.dump()


def _split(seq, nItems):
    """Split a sequence into smaller sequences"""
    seq = list(seq)
    while seq:
        yield seq[:nItems]
        del seq[:nItems]
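
# For example (a minimal sketch of _split behavior):
#
#     >>> list(_split(range(5), 2))
#     [[0, 1], [2, 3], [4]]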


# Information about single visit
Visit = namedtuple('Visit', 'visitId visitTime lastObjectId lastSourceId')


@contextmanager
def _ansi_session(engine):
    """Returns a connection, makes sure that ANSI mode is set for MySQL
    """
    with engine.begin() as conn:
        if engine.name == 'mysql':
            conn.execute(sql.text("SET SESSION SQL_MODE = 'ANSI'"))
        yield conn
    return


def _data_file_name(basename):
    """Return path name of a data file.
    """
    return os.path.join("${DAX_APDB_DIR}", "data", basename)
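
# For example, _data_file_name("apdb-schema.yaml") produces
# "${DAX_APDB_DIR}/data/apdb-schema.yaml"; the ${DAX_APDB_DIR} placeholder is
# left in the string for later expansion rather than being resolved here.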


class ApdbConfig(pexConfig.Config):

    db_url = Field(dtype=str, doc="SQLAlchemy database connection URI")
    isolation_level = ChoiceField(dtype=str,
                                  doc="Transaction isolation level",
                                  allowed={"READ_COMMITTED": "Read committed",
                                           "READ_UNCOMMITTED": "Read uncommitted",
                                           "REPEATABLE_READ": "Repeatable read",
                                           "SERIALIZABLE": "Serializable"},
                                  default="READ_COMMITTED",
                                  optional=True)
    connection_pool = Field(dtype=bool,
                            doc=("If False then disable SQLAlchemy connection pool. "
                                 "Do not use connection pool when forking."),
                            default=True)
    connection_timeout = Field(dtype=float,
                               doc="Maximum time to wait for a database lock to be released before "
                                   "exiting. Defaults to SQLAlchemy defaults if not set.",
                               default=None,
                               optional=True)
    sql_echo = Field(dtype=bool,
                     doc="If True then pass SQLAlchemy echo option.",
                     default=False)
    dia_object_index = ChoiceField(dtype=str,
                                   doc="Indexing mode for DiaObject table",
                                   allowed={'baseline': "Index defined in baseline schema",
                                            'pix_id_iov': "(pixelId, objectId, iovStart) PK",
                                            'last_object_table': "Separate DiaObjectLast table"},
                                   default='baseline')
    dia_object_nightly = Field(dtype=bool,
                               doc="Use separate nightly table for DiaObject",
                               default=False)
    read_sources_months = Field(dtype=int,
                                doc="Number of months of history to read from DiaSource",
                                default=12)
    read_forced_sources_months = Field(dtype=int,
                                       doc="Number of months of history to read from DiaForcedSource",
                                       default=12)
    dia_object_columns = ListField(dtype=str,
                                   doc="List of columns to read from DiaObject, by default read all columns",
                                   default=[])
    object_last_replace = Field(dtype=bool,
                                doc="If True (default) then use \"upsert\" for DiaObjectsLast table",
                                default=True)
    schema_file = Field(dtype=str,
                        doc="Location of (YAML) configuration file with standard schema",
                        default=_data_file_name("apdb-schema.yaml"))
    extra_schema_file = Field(dtype=str,
                              doc="Location of (YAML) configuration file with extra schema",
                              default=_data_file_name("apdb-schema-extra.yaml"))
    column_map = Field(dtype=str,
                       doc="Location of (YAML) configuration file with column mapping",
                       default=_data_file_name("apdb-afw-map.yaml"))
    prefix = Field(dtype=str,
                   doc="Prefix to add to table names and index names",
                   default="")
    explain = Field(dtype=bool,
                    doc="If True then run EXPLAIN SQL command on each executed query",
                    default=False)
    timer = Field(dtype=bool,
                  doc="If True then print/log timing information",
                  default=False)
    diaobject_index_hint = Field(dtype=str,
                                 doc="Name of the index to use with Oracle index hint",
                                 default=None,
                                 optional=True)
    dynamic_sampling_hint = Field(dtype=int,
                                  doc="If non-zero then use dynamic_sampling hint",
                                  default=0)
    cardinality_hint = Field(dtype=int,
                             doc="If non-zero then use cardinality hint",
                             default=0)

    def validate(self):
        super().validate()
        if self.isolation_level == "READ_COMMITTED" and self.db_url.startswith("sqlite"):
            raise ValueError("Attempting to run Apdb with SQLITE and isolation level 'READ_COMMITTED'. "
                             "Use 'READ_UNCOMMITTED' instead.")


class Apdb(object):
    """Interface to L1 database, hides all database access details.

    The implementation is configured via standard ``pex_config`` mechanism
    using `ApdbConfig` configuration class. For an example of different
    configurations check the config/ folder.

    Parameters
    ----------
    config : `ApdbConfig`
    afw_schemas : `dict`, optional
        Dictionary with table name for a key and `afw.table.Schema`
        for a value. Columns in schema will be added to standard
        APDB schema.
    """

    def __init__(self, config, afw_schemas=None):

        self.config = config

        # logging.getLogger('sqlalchemy').setLevel(logging.INFO)
        _LOG.debug("APDB Configuration:")
        _LOG.debug("    dia_object_index: %s", self.config.dia_object_index)
        _LOG.debug("    dia_object_nightly: %s", self.config.dia_object_nightly)
        _LOG.debug("    read_sources_months: %s", self.config.read_sources_months)
        _LOG.debug("    read_forced_sources_months: %s", self.config.read_forced_sources_months)
        _LOG.debug("    dia_object_columns: %s", self.config.dia_object_columns)
        _LOG.debug("    object_last_replace: %s", self.config.object_last_replace)
        _LOG.debug("    schema_file: %s", self.config.schema_file)
        _LOG.debug("    extra_schema_file: %s", self.config.extra_schema_file)
        _LOG.debug("    column_map: %s", self.config.column_map)
        _LOG.debug("    schema prefix: %s", self.config.prefix)

        # engine is reused between multiple processes, make sure that we don't
        # share connections by disabling pool (by using NullPool class)
        kw = dict(echo=self.config.sql_echo)
        conn_args = dict()
        if not self.config.connection_pool:
            kw.update(poolclass=NullPool)
        if self.config.isolation_level is not None:
            kw.update(isolation_level=self.config.isolation_level)
        if self.config.connection_timeout is not None:
            if self.config.db_url.startswith("sqlite"):
                conn_args.update(timeout=self.config.connection_timeout)
            elif self.config.db_url.startswith(("postgresql", "mysql")):
                conn_args.update(connect_timeout=self.config.connection_timeout)
        kw.update(connect_args=conn_args)
        self._engine = sqlalchemy.create_engine(self.config.db_url, **kw)

        self._schema = apdbSchema.ApdbSchema(engine=self._engine,
                                             dia_object_index=self.config.dia_object_index,
                                             dia_object_nightly=self.config.dia_object_nightly,
                                             schema_file=self.config.schema_file,
                                             extra_schema_file=self.config.extra_schema_file,
                                             column_map=self.config.column_map,
                                             afw_schemas=afw_schemas,
                                             prefix=self.config.prefix)
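
    # A typical usage sketch (assumes the hypothetical ``config`` built in the
    # module-level example above; makeSchema() and lastVisit() are defined
    # below in this class):
    #
    #     apdb = Apdb(config)
    #     apdb.makeSchema(drop=False)
    #     visit = apdb.lastVisit()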

    def lastVisit(self):
        """Returns last visit information or `None` if visits table is empty.

        Visits table is used by ap_proto to track visit information; it is
        not a part of the regular APDB schema.

        Returns
        -------
        visit : `Visit` or `None`
            Last stored visit info or `None` if there was nothing stored yet.
        """

        with self._engine.begin() as conn:

            stmnt = sql.select([sql.func.max(self._schema.visits.c.visitId),
                                sql.func.max(self._schema.visits.c.visitTime)])
            res = conn.execute(stmnt)
            row = res.fetchone()
            if row[0] is None:
                return None

            visitId = row[0]
            visitTime = row[1]
            _LOG.info("lastVisit: visitId: %s visitTime: %s (%s)", visitId,
                      visitTime, type(visitTime))

            # get max IDs from corresponding tables
            stmnt = sql.select([sql.func.max(self._schema.objects.c.diaObjectId)])
            lastObjectId = conn.scalar(stmnt)
            stmnt = sql.select([sql.func.max(self._schema.sources.c.diaSourceId)])
            lastSourceId = conn.scalar(stmnt)

            return Visit(visitId=visitId, visitTime=visitTime,
                         lastObjectId=lastObjectId, lastSourceId=lastSourceId)

    def saveVisit(self, visitId, visitTime):
        """Store visit information.

        This method is only used by ``ap_proto`` script from ``l1dbproto``
        and is not intended for production pipelines.

        Parameters
        ----------
        visitId : `int`
            Visit identifier
        visitTime : `datetime.datetime`
            Visit timestamp.
        """

        ins = self._schema.visits.insert().values(visitId=visitId,
                                                  visitTime=visitTime)
        self._engine.execute(ins)
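
    # Round-trip sketch for the two visit methods (the visit ID and timestamp
    # are hypothetical):
    #
    #     from datetime import datetime
    #     apdb.saveVisit(42, datetime.utcnow())
    #     visit = apdb.lastVisit()  # Visit(visitId=42, visitTime=..., ...)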

    def tableRowCount(self):
        """Returns dictionary with the table names and row counts.

        Used by ``ap_proto`` to keep track of the size of the database tables.
        Depending on database technology this could be an expensive operation.

        Returns
        -------
        row_counts : `dict`
            Dict where key is a table name and value is a row count.
        """
        res = {}
        tables = [self._schema.objects, self._schema.sources, self._schema.forcedSources]
        if self.config.dia_object_index == 'last_object_table':
            tables.append(self._schema.objects_last)
        for table in tables:
            stmt = sql.select([func.count()]).select_from(table)
            count = self._engine.scalar(stmt)
            res[table.name] = count

        return res
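
    # The returned mapping looks like this (row counts are hypothetical and
    # table names assume the default empty ``prefix``):
    #
    #     {"DiaObject": 12345, "DiaSource": 67890, "DiaForcedSource": 13579}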

    def getDiaObjects(self, pixel_ranges, return_pandas=False):
        """Returns catalog of DiaObject instances from given region.

        Objects are searched based on pixelization index and region is
        determined by the set of indices. There is no assumption on a
        particular type of index, the client is responsible for consistency
        when calculating pixelization indices.

        This method returns :doc:`/modules/lsst.afw.table/index` catalog with schema determined by
        the schema of APDB table. Re-mapping of the column names is done for
        some columns (based on column map passed to constructor) but types
        or units are not changed.

        Returns only the last version of each DiaObject.

        Parameters
        ----------
        pixel_ranges : `list` of `tuple`
            Sequence of ranges, range is a tuple (minPixelID, maxPixelID).
            This defines the set of pixel indices to be included in result.
        return_pandas : `bool`
            Return a `pandas.DataFrame` instead of
            `lsst.afw.table.SourceCatalog`.

        Returns
        -------
        catalog : `lsst.afw.table.SourceCatalog` or `pandas.DataFrame`
            Catalog containing DiaObject records.
        """

        # decide what columns we need
        if self.config.dia_object_index == 'last_object_table':
            table = self._schema.objects_last
        else:
            table = self._schema.objects
        if not self.config.dia_object_columns:
            query = table.select()
        else:
            columns = [table.c[col] for col in self.config.dia_object_columns]
            query = sql.select(columns)

        if self.config.diaobject_index_hint:
            val = self.config.diaobject_index_hint
            query = query.with_hint(table, 'index_rs_asc(%(name)s "{}")'.format(val))
        if self.config.dynamic_sampling_hint > 0:
            val = self.config.dynamic_sampling_hint
            query = query.with_hint(table, 'dynamic_sampling(%(name)s {})'.format(val))
        if self.config.cardinality_hint > 0:
            val = self.config.cardinality_hint
            query = query.with_hint(table, 'FIRST_ROWS_1 cardinality(%(name)s {})'.format(val))

        # build selection
        exprlist = []
        for low, upper in pixel_ranges:
            upper -= 1
            if low == upper:
                exprlist.append(table.c.pixelId == low)
            else:
                exprlist.append(sql.expression.between(table.c.pixelId, low, upper))
        query = query.where(sql.expression.or_(*exprlist))

        # select latest version of objects
        if self.config.dia_object_index != 'last_object_table':
            query = query.where(table.c.validityEnd == None)  # noqa: E711

        _LOG.debug("query: %s", query)

        if self.config.explain:
            # run the same query with explain
            self._explain(query, self._engine)

        # execute select
        with Timer('DiaObject select', self.config.timer):
            with self._engine.begin() as conn:
                if return_pandas:
                    objects = pandas.read_sql_query(query, conn)
                else:
                    res = conn.execute(query)
                    objects = self._convertResult(res, "DiaObject")
        _LOG.debug("found %s DiaObjects", len(objects))
        return objects
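
    # Note that each range is treated as half-open, [minPixelID, maxPixelID),
    # since the code above subtracts one from the upper bound before building
    # an inclusive BETWEEN. A usage sketch with hypothetical pixelization IDs:
    #
    #     ranges = [(672546, 672548), (672550, 672551)]
    #     objects = apdb.getDiaObjects(ranges, return_pandas=True)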

    def getDiaSourcesInRegion(self, pixel_ranges, dt, return_pandas=False):
        """Returns catalog of DiaSource instances from given region.

        Sources are searched based on pixelization index and region is
        determined by the set of indices. There is no assumption on a
        particular type of index, the client is responsible for consistency
        when calculating pixelization indices.

        This method returns :doc:`/modules/lsst.afw.table/index` catalog with schema determined by
        the schema of APDB table. Re-mapping of the column names is done for
        some columns (based on column map passed to constructor) but types or
        units are not changed.

        Parameters
        ----------
        pixel_ranges : `list` of `tuple`
            Sequence of ranges, range is a tuple (minPixelID, maxPixelID).
            This defines the set of pixel indices to be included in result.
        dt : `datetime.datetime`
            Time of the current visit.
        return_pandas : `bool`
            Return a `pandas.DataFrame` instead of
            `lsst.afw.table.SourceCatalog`.

        Returns
        -------
        catalog : `lsst.afw.table.SourceCatalog`, `pandas.DataFrame`, or `None`
            Catalog containing DiaSource records. `None` is returned if
            ``read_sources_months`` configuration parameter is set to 0.
        """

        if self.config.read_sources_months == 0:
            _LOG.info("Skip DiaSources fetching")
            return None

        table = self._schema.sources
        query = table.select()

        # build selection
        exprlist = []
        for low, upper in pixel_ranges:
            upper -= 1
            if low == upper:
                exprlist.append(table.c.pixelId == low)
            else:
                exprlist.append(sql.expression.between(table.c.pixelId, low, upper))
        query = query.where(sql.expression.or_(*exprlist))

        # execute select
        with Timer('DiaSource select', self.config.timer):
            with _ansi_session(self._engine) as conn:
                if return_pandas:
                    sources = pandas.read_sql_query(query, conn)
                else:
                    res = conn.execute(query)
                    sources = self._convertResult(res, "DiaSource")
        _LOG.debug("found %s DiaSources", len(sources))
        return sources

    def getDiaSources(self, object_ids, dt, return_pandas=False):
        """Returns catalog of DiaSource instances given set of DiaObject IDs.

        This method returns :doc:`/modules/lsst.afw.table/index` catalog with schema determined by
        the schema of APDB table. Re-mapping of the column names is done for
        some columns (based on column map passed to constructor) but types or
        units are not changed.

        Parameters
        ----------
        object_ids :
            Collection of DiaObject IDs
        dt : `datetime.datetime`
            Time of the current visit.
        return_pandas : `bool`
            Return a `pandas.DataFrame` instead of
            `lsst.afw.table.SourceCatalog`.

        Returns
        -------
        catalog : `lsst.afw.table.SourceCatalog`, `pandas.DataFrame`, or `None`
            Catalog containing DiaSource records. `None` is returned if
            ``read_sources_months`` configuration parameter is set to 0 or
            when ``object_ids`` is empty.
        """

        if self.config.read_sources_months == 0:
            _LOG.info("Skip DiaSources fetching")
            return None

        if len(object_ids) <= 0:
            _LOG.info("Skip DiaSources fetching - no Objects")
            # this should create a catalog, but the list of columns may be empty
            return None

        table = self._schema.sources
        sources = None
        with Timer('DiaSource select', self.config.timer):
            with _ansi_session(self._engine) as conn:
                for ids in _split(sorted(object_ids), 1000):
                    query = 'SELECT * FROM "' + table.name + '" WHERE '

                    # select by object id
                    ids = ",".join(str(id) for id in ids)
                    query += '"diaObjectId" IN (' + ids + ') '

                    # execute select
                    if return_pandas:
                        df = pandas.read_sql_query(sql.text(query), conn)
                        if sources is None:
                            sources = df
                        else:
                            sources = sources.append(df)
                    else:
                        res = conn.execute(sql.text(query))
                        sources = self._convertResult(res, "DiaSource", sources)

        _LOG.debug("found %s DiaSources", len(sources))
        return sources
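
    # Usage sketch (the object IDs and ``visit_time`` are hypothetical);
    # queries are issued in chunks of 1000 IDs and the results concatenated:
    #
    #     sources = apdb.getDiaSources([1234, 5678], visit_time,
    #                                  return_pandas=True)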

    def getDiaForcedSources(self, object_ids, dt, return_pandas=False):
        """Returns catalog of DiaForcedSource instances matching given
        DiaObjects.

        This method returns :doc:`/modules/lsst.afw.table/index` catalog with schema determined by
        the schema of L1 database table. Re-mapping of the column names may
        be done for some columns (based on column map passed to constructor)
        but types or units are not changed.

        Parameters
        ----------
        object_ids :
            Collection of DiaObject IDs
        dt : `datetime.datetime`
            Time of the current visit.
        return_pandas : `bool`
            Return a `pandas.DataFrame` instead of
            `lsst.afw.table.SourceCatalog`.

        Returns
        -------
        catalog : `lsst.afw.table.SourceCatalog`, `pandas.DataFrame`, or `None`
            Catalog containing DiaForcedSource records. `None` is returned if
            ``read_forced_sources_months`` configuration parameter is set to 0
            or when ``object_ids`` is empty.
        """

        if self.config.read_forced_sources_months == 0:
            _LOG.info("Skip DiaForcedSources fetching")
            return None

        if len(object_ids) <= 0:
            _LOG.info("Skip DiaForcedSources fetching - no Objects")
            # this should create a catalog, but the list of columns may be empty
            return None

        table = self._schema.forcedSources
        sources = None

        with Timer('DiaForcedSource select', self.config.timer):
            with _ansi_session(self._engine) as conn:
                for ids in _split(sorted(object_ids), 1000):

                    query = 'SELECT * FROM "' + table.name + '" WHERE '

                    # select by object id
                    ids = ",".join(str(id) for id in ids)
                    query += '"diaObjectId" IN (' + ids + ') '

                    # execute select
                    if return_pandas:
                        df = pandas.read_sql_query(sql.text(query), conn)
                        if sources is None:
                            sources = df
                        else:
                            sources = sources.append(df)
                    else:
                        res = conn.execute(sql.text(query))
                        sources = self._convertResult(res, "DiaForcedSource", sources)

        _LOG.debug("found %s DiaForcedSources", len(sources))
        return sources

    def storeDiaObjects(self, objs, dt):
        """Store catalog of DiaObjects from current visit.

        This method takes :doc:`/modules/lsst.afw.table/index` catalog, its schema must be
        compatible with the schema of APDB table:

          - column names must correspond to database table columns
          - some column names are re-mapped based on column map passed to
            constructor
          - types and units of the columns must match database definitions,
            no unit conversion is performed presently
          - columns that have default values in database schema can be
            omitted from afw schema
          - this method knows how to fill interval-related columns
            (validityStart, validityEnd); they do not need to appear in
            afw schema

        Parameters
        ----------
        objs : `lsst.afw.table.BaseCatalog` or `pandas.DataFrame`
            Catalog with DiaObject records
        dt : `datetime.datetime`
            Time of the visit
        """

        if isinstance(objs, pandas.DataFrame):
            ids = sorted(objs['diaObjectId'])
        else:
            ids = sorted([obj['id'] for obj in objs])
        _LOG.debug("first object ID: %d", ids[0])

        # NOTE: workaround for sqlite, need this here to avoid
        # "database is locked" error.
        table = self._schema.objects

        # everything to be done in single transaction
        with _ansi_session(self._engine) as conn:

            ids = ",".join(str(id) for id in ids)

            if self.config.dia_object_index == 'last_object_table':

                # insert and replace all records in LAST table, mysql and postgres have
                # non-standard features (handled in _storeObjectsAfw)
                table = self._schema.objects_last
                do_replace = self.config.object_last_replace
                # If the input data is of type Pandas, we drop the previous
                # objects regardless of the do_replace setting due to how
                # Pandas inserts objects.
                if not do_replace or isinstance(objs, pandas.DataFrame):
                    query = 'DELETE FROM "' + table.name + '" '
                    query += 'WHERE "diaObjectId" IN (' + ids + ') '

                    if self.config.explain:
                        # run the same query with explain
                        self._explain(query, conn)

                    with Timer(table.name + ' delete', self.config.timer):
                        res = conn.execute(sql.text(query))
                    _LOG.debug("deleted %s objects", res.rowcount)

                extra_columns = dict(lastNonForcedSource=dt)
                if isinstance(objs, pandas.DataFrame):
                    with Timer("DiaObjectLast insert", self.config.timer):
                        for col, data in extra_columns.items():
                            objs[col] = data
                        objs.to_sql("DiaObjectLast", conn, if_exists='append',
                                    index=False)
                else:
                    self._storeObjectsAfw(objs, conn, table, "DiaObjectLast",
                                          replace=do_replace,
                                          extra_columns=extra_columns)

            else:

                # truncate existing validity intervals
                table = self._schema.objects
                query = 'UPDATE "' + table.name + '" '
                query += "SET \"validityEnd\" = '" + str(dt) + "' "
                query += 'WHERE "diaObjectId" IN (' + ids + ') '
                query += 'AND "validityEnd" IS NULL'

                # _LOG.debug("query: %s", query)

                if self.config.explain:
                    # run the same query with explain
                    self._explain(query, conn)

                with Timer(table.name + ' truncate', self.config.timer):
                    res = conn.execute(sql.text(query))
                _LOG.debug("truncated %s intervals", res.rowcount)

            # insert new versions
            if self.config.dia_object_nightly:
                table = self._schema.objects_nightly
            else:
                table = self._schema.objects
            extra_columns = dict(lastNonForcedSource=dt, validityStart=dt,
                                 validityEnd=None)
            if isinstance(objs, pandas.DataFrame):
                with Timer("DiaObject insert", self.config.timer):
                    for col, data in extra_columns.items():
                        objs[col] = data
                    objs.to_sql("DiaObject", conn, if_exists='append',
                                index=False)
            else:
                self._storeObjectsAfw(objs, conn, table, "DiaObject",
                                      extra_columns=extra_columns)
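
    # Store sketch (``objects_df`` and ``visit_time`` are hypothetical): a
    # DataFrame whose columns match the DiaObject schema, plus the required
    # "diaObjectId" column, can be stored directly; validityStart/validityEnd
    # are filled in by this method:
    #
    #     apdb.storeDiaObjects(objects_df, visit_time)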

    def storeDiaSources(self, sources):
        """Store catalog of DIASources from current visit.

        This method takes :doc:`/modules/lsst.afw.table/index` catalog, its schema must be
        compatible with the schema of L1 database table:

          - column names must correspond to database table columns
          - some column names may be re-mapped based on column map passed to
            constructor
          - types and units of the columns must match database definitions,
            no unit conversion is performed presently
          - columns that have default values in database schema can be
            omitted from afw schema

        Parameters
        ----------
        sources : `lsst.afw.table.BaseCatalog` or `pandas.DataFrame`
            Catalog containing DiaSource records
        """

        # everything to be done in single transaction
        with _ansi_session(self._engine) as conn:

            if isinstance(sources, pandas.DataFrame):
                with Timer("DiaSource insert", self.config.timer):
                    sources.to_sql("DiaSource", conn, if_exists='append',
                                   index=False)
            else:
                table = self._schema.sources
                self._storeObjectsAfw(sources, conn, table, "DiaSource")

    def storeDiaForcedSources(self, sources):
        """Store a set of DIAForcedSources from current visit.

        This method takes :doc:`/modules/lsst.afw.table/index` catalog, its schema must be
        compatible with the schema of L1 database table:

          - column names must correspond to database table columns
          - some column names may be re-mapped based on column map passed to
            constructor
          - types and units of the columns must match database definitions,
            no unit conversion is performed presently
          - columns that have default values in database schema can be
            omitted from afw schema

        Parameters
        ----------
        sources : `lsst.afw.table.BaseCatalog` or `pandas.DataFrame`
            Catalog containing DiaForcedSource records
        """

        # everything to be done in single transaction
        with _ansi_session(self._engine) as conn:

            if isinstance(sources, pandas.DataFrame):
                with Timer("DiaForcedSource insert", self.config.timer):
                    sources.to_sql("DiaForcedSource", conn, if_exists='append',
                                   index=False)
            else:
                table = self._schema.forcedSources
                self._storeObjectsAfw(sources, conn, table, "DiaForcedSource")

    def countUnassociatedObjects(self):
        """Return the number of DiaObjects that have only one DiaSource
        associated with them.

        Used as part of ap_verify metrics.

        Returns
        -------
        count : `int`
            Number of DiaObjects with exactly one associated DiaSource.
        """
        # Retrieve the DiaObject table.
        table = self._schema.objects

        # Construct the sql statement.
        stmt = sql.select([func.count()]).select_from(table).where(table.c.nDiaSources == 1)
        stmt = stmt.where(table.c.validityEnd == None)  # noqa: E711

        # Return the count.
        count = self._engine.scalar(stmt)

        return count

    def isVisitProcessed(self, visitInfo):
        """Test whether data from an image has been loaded into the database.

        Used as part of ap_verify metrics.

        Parameters
        ----------
        visitInfo : `lsst.afw.image.VisitInfo`
            The metadata for the image of interest.

        Returns
        -------
        isProcessed : `bool`
            `True` if the data are present, `False` otherwise.
        """
        id = visitInfo.getExposureId()
        table = self._schema.sources
        idField = table.c.ccdVisitId

        # Hopefully faster than SELECT DISTINCT
        query = sql.select([idField]).select_from(table) \
            .where(idField == id).limit(1)

        return self._engine.scalar(query) is not None

    def dailyJob(self):
        """Implement daily activities like cleanup/vacuum.

        What should be done during daily cleanup is determined by
        configuration/schema.
        """

        # move data from DiaObjectNightly into DiaObject
        if self.config.dia_object_nightly:
            with _ansi_session(self._engine) as conn:
                query = 'INSERT INTO "' + self._schema.objects.name + '" '
                query += 'SELECT * FROM "' + self._schema.objects_nightly.name + '"'
                with Timer('DiaObjectNightly copy', self.config.timer):
                    conn.execute(sql.text(query))

                query = 'DELETE FROM "' + self._schema.objects_nightly.name + '"'
                with Timer('DiaObjectNightly delete', self.config.timer):
                    conn.execute(sql.text(query))

        if self._engine.name == 'postgresql':

            # do VACUUM on all tables
            _LOG.info("Running VACUUM on all tables")
            connection = self._engine.raw_connection()
            ISOLATION_LEVEL_AUTOCOMMIT = 0
            connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
            cursor = connection.cursor()
            cursor.execute("VACUUM ANALYSE")

    def makeSchema(self, drop=False, mysql_engine='InnoDB', oracle_tablespace=None, oracle_iot=False):
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If True then drop tables before creating new ones.
        mysql_engine : `str`, optional
            Name of the MySQL engine to use for new tables.
        oracle_tablespace : `str`, optional
            Name of Oracle tablespace.
        oracle_iot : `bool`, optional
            Make Index-organized DiaObjectLast table.
        """
        self._schema.makeSchema(drop=drop, mysql_engine=mysql_engine,
                                oracle_tablespace=oracle_tablespace,
                                oracle_iot=oracle_iot)

    def _explain(self, query, conn):
        """Run the query with explain
        """

        _LOG.info("explain for query: %s...", query[:64])

        if conn.engine.name == 'mysql':
            query = "EXPLAIN EXTENDED " + query
        else:
            query = "EXPLAIN " + query

        res = conn.execute(sql.text(query))
        if res.returns_rows:
            _LOG.info("explain: %s", res.keys())
            for row in res:
                _LOG.info("explain: %s", row)
        else:
            _LOG.info("EXPLAIN returned nothing")

    def _storeObjectsAfw(self, objects, conn, table, schema_table_name,
                         replace=False, extra_columns=None):
        """Generic store method.

        Takes catalog of records and stores a bunch of objects in a table.

        Parameters
        ----------
        objects : `lsst.afw.table.BaseCatalog`
            Catalog containing object records
        conn :
            Database connection
        table : `sqlalchemy.Table`
            Database table
        schema_table_name : `str`
            Name of the table to be used for finding table schema.
        replace : `boolean`
            If `True` then use replace instead of INSERT (should be more efficient)
        extra_columns : `dict`, optional
            Mapping (column_name, column_value) which gives column values to add
            to every row, only if column is missing in catalog records.
        """

        def quoteValue(v):
            """Quote and escape values"""
            if v is None:
                v = "NULL"
            elif isinstance(v, datetime):
                v = "'" + str(v) + "'"
            elif isinstance(v, str):
                # we don't expect nasty stuff in strings
                v = "'" + v + "'"
            elif isinstance(v, geom.Angle):
                v = v.asDegrees()
                if np.isfinite(v):
                    v = str(v)
                else:
                    v = "NULL"
            else:
                if np.isfinite(v):
                    v = str(v)
                else:
                    v = "NULL"
            return v

        def quoteId(columnName):
            """Smart quoting for column names.
            Lower-case names are not quoted.
            """
            if not columnName.islower():
                columnName = '"' + columnName + '"'
            return columnName

        if conn.engine.name == "oracle":
            return self._storeObjectsAfwOracle(objects, conn, table,
                                               schema_table_name, replace,
                                               extra_columns)

        schema = objects.getSchema()
        # use extra columns if specified
        extra_fields = list((extra_columns or {}).keys())

        afw_fields = [field.getName() for key, field in schema
                      if field.getName() not in extra_fields]

        column_map = self._schema.getAfwColumns(schema_table_name)
        # list of columns (as in cat schema)
        fields = [column_map[field].name for field in afw_fields if field in column_map]

        if replace and conn.engine.name in ('mysql', 'sqlite'):
            query = 'REPLACE INTO '
        else:
            query = 'INSERT INTO '
        qfields = [quoteId(field) for field in fields + extra_fields]
        query += quoteId(table.name) + ' (' + ','.join(qfields) + ') ' + 'VALUES '

        values = []
        for rec in objects:
            row = []
            for field in afw_fields:
                if field not in column_map:
                    continue
                value = rec[field]
                if column_map[field].type == "DATETIME" and \
                        np.isfinite(value):
                    # convert seconds into datetime
                    value = datetime.utcfromtimestamp(value)
                row.append(quoteValue(value))
            for field in extra_fields:
                row.append(quoteValue(extra_columns[field]))
            values.append('(' + ','.join(row) + ')')

        if self.config.explain:
            # run the same query with explain, only give it one row of data
            self._explain(query + values[0], conn)

        query += ','.join(values)

        if replace and conn.engine.name == 'postgresql':
            # This depends on the fact that "replace" can only be true for
            # the DiaObjectLast table.
            pks = ('pixelId', 'diaObjectId')
            query += " ON CONFLICT (\"{}\", \"{}\") DO UPDATE SET ".format(*pks)
            fields = [column_map[field].name for field in afw_fields if field in column_map]
            fields = ['"{0}" = EXCLUDED."{0}"'.format(field)
                      for field in fields if field not in pks]
            query += ', '.join(fields)

        # _LOG.debug("query: %s", query)
        _LOG.info("%s: will store %d records", table.name, len(objects))
        with Timer(table.name + ' insert', self.config.timer):
            res = conn.execute(sql.text(query))
        _LOG.debug("inserted %s intervals", res.rowcount)

    def _storeObjectsAfwOracle(self, objects, conn, table, schema_table_name,
                               replace=False, extra_columns=None):
        """Store method for Oracle.

        Takes catalog of records and stores a bunch of objects in a table.

        Parameters
        ----------
        objects : `lsst.afw.table.BaseCatalog`
            Catalog containing object records
        conn :
            Database connection
        table : `sqlalchemy.Table`
            Database table
        schema_table_name : `str`
            Name of the table to be used for finding table schema.
        replace : `boolean`
            If `True` then use replace instead of INSERT (should be more efficient)
        extra_columns : `dict`, optional
            Mapping (column_name, column_value) which gives column values to add
            to every row, only if column is missing in catalog records.
        """

        def quoteId(columnName):
            """Smart quoting for column names.
            Lower-case names are not quoted (Oracle backend needs them unquoted).
            """
            if not columnName.islower():
                columnName = '"' + columnName + '"'
            return columnName

        schema = objects.getSchema()

        # use extra columns as overrides always.
        extra_fields = list((extra_columns or {}).keys())

        afw_fields = [field.getName() for key, field in schema
                      if field.getName() not in extra_fields]
        # _LOG.info("afw_fields: %s", afw_fields)

        column_map = self._schema.getAfwColumns(schema_table_name)
        # _LOG.info("column_map: %s", column_map)

        # list of columns (as in cat schema)
        fields = [column_map[field].name for field in afw_fields
                  if field in column_map]
        # _LOG.info("fields: %s", fields)

        qfields = [quoteId(field) for field in fields + extra_fields]

        if not replace:
            vals = [":col{}".format(i) for i in range(len(fields))]
            vals += [":extcol{}".format(i) for i in range(len(extra_fields))]
            query = 'INSERT INTO ' + quoteId(table.name)
            query += ' (' + ','.join(qfields) + ') VALUES'
            query += ' (' + ','.join(vals) + ')'
        else:
            qvals = [":col{} {}".format(i, quoteId(field)) for i, field in enumerate(fields)]
            qvals += [":extcol{} {}".format(i, quoteId(field)) for i, field in enumerate(extra_fields)]
            pks = ('pixelId', 'diaObjectId')
            onexpr = ["SRC.{col} = DST.{col}".format(col=quoteId(col)) for col in pks]
            setexpr = ["DST.{col} = SRC.{col}".format(col=quoteId(col))
                       for col in fields + extra_fields if col not in pks]
            vals = ["SRC.{col}".format(col=quoteId(col)) for col in fields + extra_fields]
            query = "MERGE INTO {} DST ".format(quoteId(table.name))
            query += "USING (SELECT {} FROM DUAL) SRC ".format(", ".join(qvals))
            query += "ON ({}) ".format(" AND ".join(onexpr))
            query += "WHEN MATCHED THEN UPDATE SET {} ".format(" ,".join(setexpr))
            query += "WHEN NOT MATCHED THEN INSERT "
            query += "({}) VALUES ({})".format(','.join(qfields), ','.join(vals))
        # _LOG.info("query: %s", query)

        values = []
        for rec in objects:
            row = {}
            col = 0
            for field in afw_fields:
                if field not in column_map:
                    continue
                value = rec[field]
                if column_map[field].type == "DATETIME" and not np.isnan(value):
                    # convert seconds into datetime
                    value = datetime.utcfromtimestamp(value)
                elif isinstance(value, geom.Angle):
                    value = str(value.asDegrees())
                elif not np.isfinite(value):
                    value = None
                row["col{}".format(col)] = value
                col += 1
            for i, field in enumerate(extra_fields):
                row["extcol{}".format(i)] = extra_columns[field]
            values.append(row)

        # _LOG.debug("query: %s", query)
        _LOG.info("%s: will store %d records", table.name, len(objects))
        with Timer(table.name + ' insert', self.config.timer):
            res = conn.execute(sql.text(query), values)
        _LOG.debug("inserted %s intervals", res.rowcount)

    def _convertResult(self, res, table_name, catalog=None):
        """Convert result set into output catalog.

        Parameters
        ----------
        res : `sqlalchemy.ResultProxy`
            SQLAlchemy result set returned by query.
        table_name : `str`
            Name of the table.
        catalog : `lsst.afw.table.BaseCatalog`
            If not None then extend existing catalog

        Returns
        -------
        catalog : `lsst.afw.table.SourceCatalog`
            If ``catalog`` is None then new instance is returned, otherwise
            ``catalog`` is updated and returned.
        """
        # make catalog schema
        columns = res.keys()
        schema, col_map = self._schema.getAfwSchema(table_name, columns)
        if catalog is None:
            _LOG.debug("_convertResult: schema: %s", schema)
            _LOG.debug("_convertResult: col_map: %s", col_map)
            catalog = afwTable.SourceCatalog(schema)

        # fill catalog
        for row in res:
            record = catalog.addNew()
            for col, value in row.items():
                # some columns may exist in database but not included in afw schema
                col = col_map.get(col)
                if col is not None:
                    if isinstance(value, datetime):
                        # convert datetime to number of seconds
                        value = int((value - datetime.utcfromtimestamp(0)).total_seconds())
                    elif col.getTypeString() == 'Angle' and value is not None:
                        value = value * geom.degrees
                    if value is not None:
                        record.set(col, value)

        return catalog
1145 return catalog