Coverage for python/lsst/sims/catalogs/db/dbConnection.py : 78%

# 2017 March 9
# str_cast exists because numpy.dtype does
# not like unicode-like things as the names
# of columns.  Unfortunately, in python 2,
# builtins.str looks unicode-like.  We will
# use str_cast in python 2 to maintain
# both python 3 compatibility and our use of
# numpy dtype
from past.builtins import str as past_str
str_cast = past_str
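# A minimal illustration (not from the module itself) of the problem
# str_cast solves: in python 2, numpy.dtype rejects unicode field names,
# so names are cast with str_cast before the dtype is built.  The field
# names below are illustrative.
import numpy
names = [str_cast(name) for name in [u'ra', u'decl']]
dt = numpy.dtype([(name, float) for name in names])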
Table, event, text)
# The documentation at http://docs.sqlalchemy.org/en/rel_0_7/core/types.html#sqlalchemy.types.Numeric
# suggests using the cdecimal module.  Since it is not standard, import decimal.
# TODO: test for cdecimal and use it if it exists.
""" A function to return the value of pi. This is needed for adding PI() to sqlite databases """
""" A database event listener which will define the math functions necessary for evaluating the Haversine function in sqlite databases (where they are not otherwise defined)
see: http://docs.sqlalchemy.org/en/latest/core/events.html """
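# A minimal sketch of such a listener, assuming a bare sqlalchemy
# engine.  The listener name and the exact set of registered functions
# are illustrative; the module's real listener may differ.
import math
from sqlalchemy import create_engine, event, text

engine = create_engine('sqlite://')

@event.listens_for(engine, 'connect')
def _declare_math_functions(dbapi_connection, connection_record):
    # sqlite lacks these functions by default; register them on every
    # new DBAPI connection so SQL such as SIN(x) evaluates correctly
    dbapi_connection.create_function('pi', 0, lambda: math.pi)
    dbapi_connection.create_function('sin', 1, math.sin)
    dbapi_connection.create_function('cos', 1, math.cos)
    dbapi_connection.create_function('asin', 1, math.asin)
    dbapi_connection.create_function('sqrt', 1, math.sqrt)
    dbapi_connection.create_function('power', 2, math.pow)

with engine.connect() as conn:
    print(conn.execute(text("SELECT PI()")).scalar())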
# ------------------------------------------------------------
# Iterator for database chunks
"""Iterator for query chunks"""
# arbitrarySQL exists in case a CatalogDBObject calls
# get_arbitrary_chunk_iterator; in that case, we need to
# be able to tell this object to call _postprocess_arbitrary_results,
# rather than _postprocess_results
else:
    raise StopIteration
else:
""" This is a class that will hold the engine, session, and metadata for a DBObject. This will allow multiple DBObjects to share the same sqlalchemy connection, when appropriate. """
""" @param [in] database is the name of the database file being connected to
@param [in] driver is the dialect of the database (e.g. 'sqlite', 'mssql', etc.)
@param [in] host is the URL of the remote host, if appropriate
@param [in] port is the port on the remote host to connect to, if appropriate
@param [in] verbose is a boolean controlling sqlalchemy's verbosity """
# DbAuth will not look up hosts that are None, '' or 0
try:
    authDict = {'username': DbAuth.username(self._host, str(self._port)),
                'password': DbAuth.password(self._host, str(self._port))}
except Exception:
    if self._driver == 'mssql+pymssql':
        print("\nFor more information on database authentication using the db-auth.paf"
              " policy file see: "
              "https://confluence.lsstcorp.org/display/SIM/Accessing+the+UW+CATSIM+Database\n")
    raise
    dbUrl = url.URL(self._driver,
                    host=self._host,
                    port=self._port,
                    database=self._database,
                    **authDict)
else:
    dbUrl = url.URL(self._driver,
                    database=self._database)
bind=self._engine))
"""Validate connection parameters
- Check if user passed dbAddress instead of an database. Convert and warn. - Check that required connection paramters are present - Replace default host/port if driver is 'sqlite' """
"Attempting to convert to database, driver, host, " "and port parameters. Any usernames and passwords are ignored and must " "be in the db-auth.paf policy file. "%(self.database), FutureWarning)
raise AttributeError("%s has no attribute 'driver'. "%(self.__class__.__name__) + errMessage)
raise AttributeError("%s has no attribute 'database'. "%(self.__class__.__name__) + errMessage) raise AttributeError("%s.database is None. "%(self.__class__.__name__) + errMessage)
# When passed an sqlite database, override the default host/port
       (str(self._driver) == str(other._driver)) and \
       (str(self._host) == str(other._host)) and \
       (str(self._port) == str(other._port))
def engine(self):
def session(self):
def metadata(self):
def database(self):
def driver(self):
def host(self):
def port(self):
def verbose(self):
             connection=None, cache_connection=True):
    """
    Initialize DBObject.
@param [in] database is the name of the database file being connected to
@param [in] driver is the dialect of the database (e.g. 'sqlite', 'mssql', etc.)
@param [in] host is the URL of the remote host, if appropriate
@param [in] port is the port on the remote host to connect to, if appropriate
@param [in] verbose is a boolean controlling sqlalchemy's verbosity (default False)
@param [in] connection is an optional instance of DBConnection, in the
event that this DBObject can share a database connection with another
DBObject.  This is only necessary (or even possible) in a few
specialized cases and should be used carefully.
@param [in] cache_connection is a boolean.  If True, DBObject will use
a cache of DBConnections (if available) to get the connection to this
database.
"""
# this is a cache for the query, so that any one query
# does not have to guess dtype multiple times
# Explicit constructor to DBObject preferred
                       driver=driver, host=host, port=port, verbose=verbose)
use_cache=cache_connection)
else:
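# A hedged usage sketch of the connection sharing described above:
# construct one DBConnection and hand it to multiple DBObjects through
# the `connection` kwarg.  The file name is illustrative, and passing
# the database parameters alongside `connection` is an assumption.
shared = DBConnection(database='data.db', driver='sqlite',
                      host=None, port=None, verbose=False)
db1 = DBObject(database='data.db', driver='sqlite', connection=shared)
db2 = DBObject(database='data.db', driver='sqlite', connection=shared)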
""" Search self._connection_cache (if it exists; it won't for DBObject, but will for CatalogDBObject) for a DBConnection matching the specified parameters. If it exists, return it. If not, open a connection to the specified database, add it to the cache, and return the connection.
Parameters
----------
database is the name of the database file being connected to
driver is the dialect of the database (e.g. 'sqlite', 'mssql', etc.)
host is the URL of the remote host, if appropriate
port is the port on the remote host to connect to, if appropriate
use_cache is a boolean specifying whether or not we try to use the
cache of database connections (you don't want to if opening many
connections in many threads).
"""
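# A minimal sketch of the cache lookup described above, assuming the
# cache is a dict keyed on the connection parameters.  The key format
# and attribute handling are illustrative, not the module's exact code.
def _get_connection(self, database, driver, host, port, use_cache=True):
    key = (database, driver, host, port)
    cache = getattr(self, '_connection_cache', None)
    if use_cache and cache is not None:
        if key not in cache:
            cache[key] = DBConnection(database=database, driver=driver,
                                      host=host, port=port)
        return cache[key]
    # no cache available (plain DBObject) or caching disabled
    return DBConnection(database=database, driver=driver,
                        host=host, port=port)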
"""Return a list of the names of the tables in the database"""
""" Return a list of the names of the columns in the specified table. If no table is specified, return a dict of lists. The dict will be keyed to the table names. The lists will be of the column names in that table """ return [] else: else:
""" Make final modifications to a set of data before returning it to the user
**Parameters**
* results : a structured array constructed from the result set from a query
**Returns**
* results : a potentially modified structured array. The default is to do nothing.
"""
""" This wrapper exists so that a ChunkIterator built from a DBObject can have the same API as a ChunkIterator built from a CatalogDBObject """ return self._postprocess_arbitrary_results(results)
""" Determine the dtype from the data. Store it in a global variable so we do not have to repeat on every chunk. """
# We are going to detect the dtype by reading in a single row
# of data with np.genfromtxt.  To do this, we must pass the
# row as a string delimited by a specified character.  Here we
# select a character that does not occur anywhere in the data.
raise RuntimeError("DBObject could not detect the dtype of your return rows\n" "Please specify a dtype with the 'dtype' kwarg.")
else:
return numpy.recarray((0,), dtype=self.dtype)
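# A hedged, self-contained illustration of the dtype-detection trick
# described above: serialize one row with a delimiter absent from the
# data and let numpy.genfromtxt guess the column types.  The row and
# the candidate delimiters are illustrative.
import io
import numpy

row = (1, 2.5, 'star')
candidates = [',', ';', '|', '%', '$']
delim = next(c for c in candidates
             if not any(c in str(val) for val in row))
line = delim.join(str(val) for val in row)
guessed = numpy.genfromtxt(io.StringIO(line), delimiter=delim,
                           dtype=None, encoding='utf-8')
print(guessed.dtype)  # e.g. [('f0', '<i8'), ('f1', '<f8'), ('f2', '<U4')]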
""" Executes an arbitrary query. Returns a recarray of the results.
dtype will be the dtype of the output recarray.  If it is None, then
the code will guess the datatype and assign generic names to the
columns.
"""
""" This wrapper exists so that CatalogDBObjects can refer to get_arbitrary_chunk_iterator and DBObjects can refer to get_chunk_iterator """
""" Take an arbitrary, user-specified query and return a ChunkIterator that executes that query
dtype will tell the ChunkIterator what datatype to expect for this query. This information gets passed to _postprocess_results.
If 'None', then _postprocess_results will just guess the datatype
and return generic names for the columns.
"""
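# A hedged usage sketch of the arbitrary-query API described above.
# The table/column names are illustrative, and the chunk_size keyword
# on get_arbitrary_chunk_iterator is an assumption.
db = DBObject(database='data.db', driver='sqlite')
results = db.execute_arbitrary('SELECT id, ra, decl FROM stars')
for chunk in db.get_arbitrary_chunk_iterator('SELECT id, ra, decl FROM stars',
                                             chunk_size=10000):
    process(chunk)  # `process` stands in for user code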
"""Meta class for registering new objects.
When any new type of object class is created, this registers it in a
`registry` class attribute, available to all derived instance
catalogs.
"""
# check if attribute objid is specified.
# If not, create a default
    warnings.warn("registry class attribute should not be "
                  "overridden in InstanceCatalog classes. "
                  "Proceed with caution")
# check if 'registry' is specified.
# if not, then this is the base class: add the registry
else:
    # add this class to the registry
                  'This will override previous definition on line %i of %s' % (srcline, srcfile))
# check if the list of unique ids is specified
# if not, then this is the base class: add the list
else:
    pass
          '\nOutput object ids may not be unique.\nThis may not be a problem if you do not ' +
          'want globally unique id values')
else:
dbObjects = cls.registry.keys()
outstr = "++++++++++++++++++++++++++++++++++++++++++++++\n" + \
         "Registered object types are:\n"
for dbObject in dbObjects:
    outstr += "%s\n" % (dbObject)
outstr += "\n\n"
outstr += "To query the possible column names do:\n"
outstr += "$> CatalogDBObject.from_objid([name]).show_mapped_columns()\n"
outstr += "+++++++++++++++++++++++++++++++++++++++++++++"
return outstr
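# A minimal, self-contained sketch of the registry pattern described
# above; the real metaclass performs additional checks (duplicate ids,
# warnings about overridden attributes) that are omitted here.
class RegistryMeta(type):
    def __init__(cls, name, bases, dct):
        super(RegistryMeta, cls).__init__(name, bases, dct)
        if not hasattr(cls, 'registry'):
            # base class: create the shared registry
            cls.registry = {}
        elif getattr(cls, 'objid', None) is not None:
            # derived class: register itself under its objid
            cls.registry[cls.objid] = cls

class BaseDBObject(object, metaclass=RegistryMeta):
    objid = None

class StarObject(BaseDBObject):
    objid = 'star'

assert BaseDBObject.registry['star'] is StarObject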
"""Database Object base class
"""
# Provide information on whether this object should be tested in the unit tests
#: Mapping of DDL types to python types.  Strings are assumed to be 256 characters;
#: this can be overridden by modifying the dbTypeMap or by making a custom columns
#: list.
#: numpy doesn't know how to convert decimal.Decimal types, so I changed this to float.
#: TODO this doesn't seem to make a difference, but make sure.
             'NUMERIC': (float,), 'SMALLINT': (int,), 'TINYINT': (int,),
             'VARCHAR': (str, 256), 'TEXT': (str, 256), 'CLOB': (str, 256),
             'NVARCHAR': (str, 256), 'NCLOB': (str, 256), 'NTEXT': (str, 256),
             'CHAR': (str, 1), 'INT': (int,), 'REAL': (float,),
             'DOUBLE': (float,), 'STRING': (str, 256),
             'DOUBLE_PRECISION': (float,), 'DECIMAL': (float,)}
def from_objid(cls, objid, *args, **kwargs):
    """Given a string objid, return an instance of
    the appropriate CatalogDBObject class.
    """
        raise RuntimeError('Attempting to construct an object that does not exist')
table=None, objid=None, idColKey=None, connection=None, cache_connection=True):
raise ValueError("Double-specified tableid in CatalogDBObject:" " once in class definition, once in __init__")
self.tableid = table
raise ValueError("Double-specified objid in CatalogDBObject:" " once in class definition, once in __init__")
self.objid = objid
raise ValueError("Double-specified idColKey in CatalogDBObject:" " once in class definition, once in __init__")
self.idColKey = idColKey
self.idColKey = self.getIdColKey()
msg = ("CatalogDBObject must be subclassed, and "
       "define objid, tableid and idColKey. You are missing: ")
if self.objid is None:
    msg += "objid, "
if self.tableid is None:
    msg += "tableid, "
if self.idColKey is None:
    msg += "idColKey"
raise ValueError(msg)
warnings.warn("objectTypeId has not " "been set. Input files for phosim are not " "possible.")
verbose=verbose, connection=connection,
cache_connection=cache_connection)  # pass the caller's flag through rather than hard-coding True
except sa_exc.OperationalError as e:
    if self.driver == 'mssql+pymssql':
        message = "\n To connect to the UW CATSIM database: "
        message += " Check that you have valid connection parameters, an open ssh tunnel "
        message += "and that your $HOME/.lsst/db-auth.paf contains the appropriate credentials. "
        message += "Please consult the following link for more information on access: "
        message += " https://confluence.lsstcorp.org/display/SIM/Accessing+the+UW+CATSIM+Database "
    else:
        message = ''
    raise RuntimeError("Failed to connect to %s: sqlalchemy.%s %s"
                       % (self.connection.engine, e.args[0], message))
# Need to do this after the table is instantiated so that
# the default columns can be filled from the table object.
# build column mapping and type mapping dicts from columns
for col in self.columnMap.keys():
    print("%s -- %s" % (col, self.typeMap[col][0].__name__))
for col in self.table.c.keys():
    print("%s -- %s" % (col, self.table.c[col].type.__visit_name__))
except ImportError:
    raise ImportError("sims_catalogs not set up. Cannot get InstanceCatalog from the object.")
return self.objectTypeId
autoload=True)
                       for el in self.columns])
                      for el in self.columns])
else: warnings.warn("Database column, %s, overridden in self.columns... "%(col)+ "Skipping default assignment.") else: if self.verbose: warnings.warn("Can't create default column for %s. There is no mapping "%(col)+ "for type %s. Modify the dbTypeMap, or make a custom columns "%(dbtypestr)+ "list.")
"""Given a list of valid column names, return the query object""" except KeyError: for col in colnames: if col in self.columnMap: continue else: warnings.warn("%s not in columnMap"%(col)) raise ValueError('entries in colnames must be in self.columnMap')
# Get the first query
else:
# Check if the column is a default column (col == val)
# If column is in the table, use it.
else:
    # If not, assume the user specified the column correctly
"""Filter the query by the associated metadata"""
"""Post-process the query results to put them in a structured array.
**Parameters**
* results : a result set as returned by execution of the query
**Returns**
* _final_pass(retresults) : the result of calling the _final_pass
  method on a structured array constructed from the query data.
"""
else:
    return results
dt_list = []
for k in cols:
    sub_list = [past_str(k)]
    if self.typeMap[k][0] is not str:
        for el in self.typeMap[k]:
            sub_list.append(el)
    else:
        sub_list.append(past_str)
        for el in self.typeMap[k][1:]:
            sub_list.append(el)
    dt_list.append(tuple(sub_list))
dtype = numpy.dtype(dt_list)
else:
                      result[colName] if result[colName] or colName not in self.dbDefaultValues
                      else self.dbDefaultValues[colName]
                      for colName in cols])
else:
              obs_metadata=None, constraint=None, limit=None):
    """Execute a query
**Parameters**
* colnames : list or None
  a list of valid column names, corresponding to entries in the
  `columns` class attribute.  If not specified, all columns are queried.

* chunk_size : int (optional)
  if specified, then return an iterator object to query the database,
  each time returning the next `chunk_size` elements.  If not
  specified, all matching results will be returned.

* obs_metadata : object (optional)
  an observation metadata object which has a "filter" method, which
  will add a filter string to the query.

* constraint : str (optional)
  a string which is interpreted as SQL and used as a predicate on the query.

* limit : int (optional)
  limits the number of rows returned by the query.
**Returns**
* result : list or iterator
  If chunk_size is not specified, then result is a list of all items
  which match the specified query.  If chunk_size is specified, then
  result is an iterator over lists of the given size.
"""
query = query.limit(limit)
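# A hedged usage sketch of query_columns as documented above; the
# object id 'star', column names, and constraint are illustrative.
db = CatalogDBObject.from_objid('star')
for chunk in db.query_columns(colnames=['id', 'ra', 'decl'],
                              chunk_size=10000,
                              constraint='ra BETWEEN 0 AND 90'):
    process(chunk)  # `process` stands in for user code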
'''Class to read a file into a database and then query it'''

# Column names to index.  Specify compound indexes using tuples of column names.

             dtype=None, numGuess=1000, delimiter=None,
             verbose=False, idColKey=None, **kwargs):
    """
    Initialize an object for querying databases loaded from a file
Keyword arguments:
@param dataLocatorString: Path to the file to load
@param runtable: The name of the table to create.  If None, a random
       table name will be used.
@param driver: name of database driver (e.g. 'sqlite', 'mssql+pymssql')
@param host: hostname for database connection (None if sqlite)
@param port: port for database connection (None if sqlite)
@param database: name of database (filename if sqlite)
@param dtype: The numpy dtype to use when loading the file.  If None,
       the dtype will be guessed.
@param numGuess: The number of lines to use in guessing the dtype from the file.
@param delimiter: The delimiter to use when parsing the file; the
       default is white space.
@param idColKey: The name of the column that uniquely identifies each
       row in the database
"""
raise ValueError("CatalogDBObject must be subclassed, and " "define objid and tableid and idColKey.")
warnings.warn("objectTypeId has not " "been set. Input files for phosim are not " "possible.")
                          port=self.port, verbose=verbose)
               self.connection.engine, self.connection.metadata,
               numGuess, indexCols=self.indexCols, **kwargs)
else:
    raise ValueError("Could not locate file %s." % (dataLocatorString))
def from_objid(cls, objid, *args, **kwargs):
    """Given a string objid, return an instance of
    the appropriate fileDBObject class.
    """
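# A hedged usage sketch of fileDBObject: load a delimited catalog file
# into an in-memory sqlite table and query it.  The file name, table
# name, and column names are illustrative, and direct instantiation
# (rather than subclassing) is an assumption based on the keyword
# arguments documented above.
fdb = fileDBObject('stars.txt', runtable='stars',
                   driver='sqlite', database=':memory:',
                   idColKey='id')
results = fdb.query_columns(colnames=['id', 'ra', 'decl'])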