Coverage for python/lsst/sims/catalogs/definitions/InstanceCatalog.py : 86%

"""Instance Catalog"""
"""Meta class for registering instance catalogs.
When any new type of instance catalog class is created, this registers it in a `registry` class attribute, available to all derived instance catalogs. """ def convert_to_underscores(name): """convert, e.g. CatalogName to catalog_name"""
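# A minimal sketch of such a conversion using the common two-pass regex;
# the helper name and regex below are illustrative assumptions, not
# necessarily the implementation used in this module:

import re

def _camel_to_underscores(name):
    # split CamelCase words with underscores, then lower-case the result
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

# _camel_to_underscores('CatalogName') -> 'catalog_name'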
# check if attribute catalog_type is specified.
# If not, create a default
warnings.warn("registry class attribute should not be "
              "over-ridden in InstanceCatalog classes. "
              "Proceed with caution")
# check if 'registry' is specified.
# if not, then this is the base class: add the registry
# add this class to the registry
if cls.catalog_type in cls.registry:
    raise ValueError("Catalog Type %s is duplicated" % cls.catalog_type)
# add methods for default columns
lambda self, value=default[1], type=default[2]:
    np.array([value for i in range(len(self._current_chunk))], dtype=type))
# store compound columns and check for collisions
#
# We create a forward and backward mapping.
# The dictionary cls._compound_columns maps the compound column
# name to the multiple individual columns it represents.
# The dictionary cls._compound_column_names maps the individual
# column names to the compound column that contains them
except TypeError:
    raise ValueError("column names in compound "
                     "decorator must be strings")
raise ValueError("column name '%s' in compound getter " "'%s' conflicts with getter '%s'" % (col, key, getter))
raise ValueError("duplicate compound column name: '%s'" % col)
else:
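# For illustration, with a hypothetical compound getter returning
# ('raJ2000', 'decJ2000') under the name 'compound_radec', the forward
# and backward mappings described above would look like (all names here
# are invented for the example):
#
#     cls._compound_columns = {'compound_radec': ('raJ2000', 'decJ2000')}
#     cls._compound_column_names = {'raJ2000': 'compound_radec',
#                                   'decJ2000': 'compound_radec'}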
"""An object used for introspection of the database colums.
This mimics a numpy record array, but when a column is referenced, it logs the reference and returns zeros. """
return 0
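# A minimal sketch of such an introspection object, assuming only the
# behavior described in the docstring (the class name and details below
# are illustrative, not the module's actual implementation):

import numpy as np

class _ZeroLoggingRecord(object):
    """Hypothetical stand-in: log referenced columns, return zeros."""

    def __init__(self, n_rows=1):
        self.referenced_columns = set()
        self._n_rows = n_rows

    def __getitem__(self, column_name):
        # record that the column was asked for, then hand back zeros
        self.referenced_columns.add(column_name)
        return np.zeros(self._n_rows)

    def __len__(self):
        return self._n_rows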
""" Base class for instance catalogs generated by simulations.
Instance catalogs include a dictionary of numpy arrays which contains core data. Additional arrays can be appended as ancillary data.
Catalog types and Object types are defined in the CatalogDescription class catalogType = TRIM, SCIENCE, PHOTCAL, DIASOURCE, MISC, INVALID objectType = Point, Moving, Sersic, Image, Artefact, MISC catalogTable is name of the database table queried dataArray dictionary of numpy arrays of data """
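# A hedged sketch of a derived catalog class; the class name, column
# names, and the 'example_stars' catalog_type are invented for
# illustration only:
#
#     class ExampleStarCatalog(InstanceCatalog):
#         catalog_type = 'example_stars'
#         column_outputs = ['raJ2000', 'decJ2000', 'magNorm']
#         default_columns = [('magNorm', 20.0, float)]
#
# Defining the class is enough to register it: the metaclass above adds
# 'example_stars' to the shared registry automatically.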
# These are the class attributes to be specified in any derived class:
# Note: these columns will be filtered on even if they are not included
# in column_outputs
# cannot_be_null before calculating getter columns
def new_catalog(cls, catalog_type, *args, **kwargs):
    """Return a new catalog of the given catalog type"""
    if catalog_type in cls.registry:
        # look up catalog classes registered by the metaclass
        return cls.registry[catalog_type](*args, **kwargs)
    elif inspect.isclass(catalog_type) and issubclass(catalog_type, InstanceCatalog):
        return catalog_type(*args, **kwargs)
    else:
        raise ValueError("Unrecognized catalog_type: %s"
                         % str(catalog_type))
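# Usage sketch (the catalog_type string and the db_obj / obs_metadata
# objects are hypothetical):
#
#     cat = InstanceCatalog.new_catalog('example_stars', db_obj,
#                                       obs_metadata=obs_metadata)
#
# Passing an InstanceCatalog subclass instead of a string also works,
# via the issubclass() branch above.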
def is_compound_column(cls, column_name):
    """Return true if the given column name is a compound column"""
    if column_name in cls._compound_columns:  # assumed membership test
        return True
    return False
"""Iterate the column names, expanding any compound columns"""
for col in getattr(getattr(self, "get_" + column), '_colnames'):
    yield col
else:
constraint=None, specFileMap=None, cannot_be_null=None):
""" @param [in] db_obj is an instantiation of the CatalogDBObject class, which provide connection to a specific database table
see sims_catalogs_generation/python/lsst/sims/catalogs/generation/db/dbConnection.py
@param [in] obs_metadata is an instantiation of the ObservationMetaData class characterizing a specific telescope observation
see sims_catalogs_generation/python/lsst/sims/catalogs/generation/db/ObservationMetaData.py
@param [in] column_outputs is a list of column names to be output in the catalog. This is optional and will be appended to the list of column_outputs defined in the class definition.
@param [in] cannot_be_null is a list of column names indicating columns which cannot have the values Null, None, or NaN. Rows running afoul of this criterion will not be written by the write_catalog() method (though they may appear in the iterator returned by iter_catalog()). Note: these columns will be filtered on, even if they do not appear in column_outputs.
@param [in] constraint is an optional SQL constraint to be applied to the database query
@param [in] specFileMap is an instantiation of the SpecMap class
(defined in sims_catalogs_measures/python/sims/catalogs/measures/instance/fileMaps.py)
that maps database entries for SED names to actual file paths. If set to None, the class definition of InstanceCatalog ensures that it will be set to defaultSpecMap, which is the correct mapping for the LSST sims_sed_library
"""
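# A hedged instantiation sketch; 'ExampleStarCatalog', 'db', and 'obs'
# are placeholders for a real catalog class, a CatalogDBObject, and an
# ObservationMetaData:
#
#     cat = ExampleStarCatalog(db, obs_metadata=obs,
#                              column_outputs=['raJ2000', 'decJ2000'],
#                              cannot_be_null=['magNorm'],
#                              constraint='rmag < 22.0')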
# this dict will contain information telling the user where the columns
# in the catalog come from
else:
else:
# Because cannot_be_null can both be declared at class definition
# and at instantiation, we need to be able to combine the two inputs
# into something the InstanceCatalog will actually use to filter
# rows. self._cannot_be_null is a member variable that contains
# the contents both of self.cannot_be_null (set at class definition)
# and the cannot_be_null kwarg passed to __init__(). self._cannot_be_null
# is what the catalog actually uses in self._filter_chunk
else:
self.specFileMap = specFileMap
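# A sketch of the combination described in the comment above (assumed
# logic, not verbatim from this module):
#
#     self._cannot_be_null = list(self.cannot_be_null or [])
#     if cannot_be_null is not None:
#         for col in cannot_be_null:
#             if col not in self._cannot_be_null:
#                 self._cannot_be_null.append(col)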
# self._column_origins_switch tells column_by_name to log where it is
# getting the columns in self._column_origins (we only want to do that once)
# now we will create and populate a list containing the names of
# all of the columns which this InstanceCatalog can return.
# Note: this needs to happen before self._check_requirements()
# is called in case any getters depend on the contents of
# _all_available_columns. That way, self._check_requirements()
# can verify that the getter will run the way it is actually
# being called.
self._column_outputs = []
# because asking for a compound_column means asking for
# its individual sub-columns, which means those columns
# will get listed twice in the catalog
for name in self._all_available_columns:
    if name not in self._compound_columns:
        self._column_outputs.append(name)
"""Set the current chunk and clear the column cache""" else:
"""Get the list of columns required to be in the database object."""
# just call the column: this will log queries to the database.
# now do the same thing for columns specified in _cannot_be_null
# (in case the catalog is filtered on columns that are not meant
# to be written to the catalog)
"""Given a column name, return the column data"""
column_name not in self._actually_calculated_columns):
column_name in self._current_chunk.dtype.names):
else:
if self._column_origins_switch:
    self._column_origins[column_name] = 'default column'
return getattr(self, "default_%s"%column_name)(*args, **kwargs)
"""Check whether the supplied db_obj has the necessary column names"""
else:
else:
    # Because some earlier part of the code copies default columns
    # into the same place as columns that exist natively in the
    # database, this is where we have to mark columns that are
    # set by default
"({0})".format(', '.join(nodefault)))
self.print_column_origins()
warnings.warn("Using raw formatting for column '%s' " "with type %s" % (col, chunk_cols[i].dtype))
self.endline)
write_header=True, write_mode='w'):
    """
    Query self.db_obj and write the resulting InstanceCatalog to an ASCII output file
@param [in] filename is the name of the ASCII file to be written
@param [in] chunk_size is an optional parameter telling the catalog to query the database in manageable chunks (in case returning the whole catalog takes too much memory)
@param [in] write_header is a boolean specifying whether or not to add a header to the output catalog (default True)
@param [in] write_mode is 'w' if you want to overwrite the output file or 'a' if you want to append to an existing output file (default: 'w')
"""
write_header=write_header, write_mode=write_mode, obs_metadata=self.obs_metadata, constraint=self.constraint)
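# Usage sketch (the file name and chunk size are arbitrary examples):
#
#     cat.write_catalog('stars.txt', chunk_size=10000)
#     # append a second query to the same file, without a second header
#     cat.write_catalog('stars.txt', write_mode='a', write_header=False)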
write_mode='w', obs_metadata=None, constraint=None):
    """
    This method queries db_obj, and then writes the resulting recarray to the specified ASCII output file.
@param [in] filename is the name of the ASCII file to be written
@param [in] obs_metadata is an ObservationMetaData instantiation characterizing the telescope pointing (optional)
@param [in] constraint is an optional SQL constraint applied to the database query.
@param [in] chunk_size is an optional parameter telling the catalog to query the database in manageable chunks (in case returning the whole catalog takes too much memory)
@param [in] write_header is a boolean specifying whether or not to add a header to the output catalog (default True)
@param [in] write_mode is 'w' if you want to overwrite the output file or 'a' if you want to append to an existing output file (default: 'w')
"""
obs_metadata=obs_metadata, constraint=constraint, chunk_size=chunk_size)
""" This function verifies the catalog's required columns, initializes some member variables that are required for the catalog-writing process. """
""" Update self._current_chunk and self._column_cache to only include the rows specified by good_dexes (which will be a list of indexes). """ # In the event that self._column_cache has already been created, # update the cache so that only valid rows remain therein # this is a sub-column of a compound column; # ignore it, we will update the cache when we come # to the compound column continue else:
""" Take a chunk of database rows and select only those that match the criteria set by self._cannot_be_null. Set self._current_chunk to be the rows that pass this test. Return a numpy array of the indices of those rows relative to the original chunk. """
# go through the database query results and remove all of those
# rows that have already run afoul of self._cannot_be_null
np.logical_and(str_vec != 'nan', str_vec != 'null')))
# If some columns are specified as cannot_be_null, loop over those columns,
# removing rows that run afoul of that criterion from the chunk.
np.logical_and(filter_vals != 'nan', filter_vals != 'null')))
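# A self-contained sketch of the row test suggested by the fragments
# above, assuming the same convention that a row survives only if the
# filtered column is neither 'None', 'nan', nor 'null' (the column
# values here are invented):

import numpy as np

filter_vals = np.array(['1.5', 'None', '2.0', 'nan', 'null'])
good_dexes = np.where(np.logical_and(filter_vals != 'None',
                                     np.logical_and(filter_vals != 'nan',
                                                    filter_vals != 'null')))
# good_dexes[0] is array([0, 2]): only rows with real values remain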
""" write self._current_chunk to the file specified by file_handle """
if col in list_of_transform_keys else self.column_by_name(col) for col in self.iter_column_names()]
# Create the template with the first chunk
# use a generator expression for lines rather than a list
# for memory efficiency
""" This method takes a recarray (usually returned by querying db_obj), and writes it to the catalog. This method also handles any transformation of columns that needs to happen before they are written to the catalog.
@param [in] chunk is the recarray of queried columns to be formatted and written to the catalog.
@param [in] file_handle is a file handle pointing to the file where the catalog is being written. """
""" Iterate over the lines of a catalog.
chunk_size controls the number of rows returned at a time from the database (smaller chunk_size will result in less memory usage but slower performance).
Catalog rows will be returned as lists. """
obs_metadata=self.obs_metadata, constraint=self.constraint, chunk_size=chunk_size)
if col in list_of_transform_keys else self.column_by_name(col) for col in self.iter_column_names()]
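# Usage sketch for iter_catalog; the column order of each row follows
# iter_column_names(), and the unpacking below assumes a catalog whose
# first two columns are positions (an assumption for illustration):
#
#     for row in cat.iter_catalog(chunk_size=10000):
#         ra, dec = row[0], row[1]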
""" Iterate over catalog contents one chunk at a time.
chunk_size controls the number of catalog rows contained in each chunk.
The iterator will return a chunk of the database (a list of lists containing the contents of the database chunk). The first dimension of the chunk corresponds to the columns of the catalog, i.e. chunk[0] is a list containing the 0th column of the catalog.
The iterator will also yield a colMap, which is a dict mapping the names of the columns to their index value in the chunk.
Usage:
for chunk, colMap in cat.iter_catalog_chunks(chunk_size=1000):
    for ix in range(len(chunk[0])):
        print(chunk[0][ix], chunk[1][ix], chunk[2][ix])
will print out the first three columns of the catalog, row by row """
obs_metadata=self.obs_metadata, constraint=self.constraint, chunk_size=chunk_size)
if col in list_of_transform_keys else self.column_by_name(col) for col in self.iter_column_names()]
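# colMap makes each chunk indexable by column name rather than by
# position; for a hypothetical 'raJ2000' column the loop from the
# docstring could instead be written as:
#
#     for chunk, colMap in cat.iter_catalog_chunks(chunk_size=1000):
#         ra_column = chunk[colMap['raJ2000']]
#         for ix in range(len(ra_column)):
#             print(ra_column[ix])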
return self.column_by_name(self.refIdCol)
arr = self.column_by_name(self.refIdCol)
if len(arr) > 0:
    return np.left_shift(self.column_by_name(self.refIdCol), nShift) + \
           self.db_obj.getObjectTypeId()
else:
    return arr
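# A worked example of the encoding above, assuming nShift = 10 and an
# object type id of 25 (both values invented for illustration): the low
# nShift bits hold the object type, the high bits hold the reference id.

import numpy as np

ref_ids = np.array([1, 2, 3])
n_shift = 10
object_type_id = 25
unique_ids = np.left_shift(ref_ids, n_shift) + object_type_id
# unique_ids is array([1049, 2073, 3097])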
""" This method will return the name of the class that first defined the input method.
This is taken verbatim from http://stackoverflow.com/questions/961048/get-class-that-defined-method """
return None
""" Print the origins of the columns in this catalog """
print('\nwhere the columns in ', self.__class__, ' come from')
for column_name in self._column_origins:
    print(column_name, self._column_origins[column_name])
print('\n')