lsst.meas.algorithms  15.0-13-g0ee414d5+5
ingestIndexReferenceTask.py
#
# LSST Data Management System
#
# Copyright 2008-2017 AURA/LSST.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <https://www.lsstcorp.org/LegalNotices/>.
#

__all__ = ["IngestIndexedReferenceConfig", "IngestIndexedReferenceTask", "DatasetConfig"]

import numpy as np

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.geom
import lsst.afw.table as afwTable
from lsst.afw.image import fluxFromABMag, fluxErrFromABMagErr
from .indexerRegistry import IndexerRegistry
from .readTextCatalogTask import ReadTextCatalogTask


class IngestReferenceRunner(pipeBase.TaskRunner):
    """!Task runner for the reference catalog ingester

    Data IDs are ignored so the runner should just run the task on the parsed command.
    """

    def run(self, parsedCmd):
        """!Run the task.

        Several arguments need to be collected to send on to the task methods.

        @param[in] parsedCmd  Parsed command including command line arguments.
        @returns Struct containing the result of the indexing.
        """
        files = parsedCmd.files
        butler = parsedCmd.butler
        task = self.TaskClass(config=self.config, log=self.log, butler=butler)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

        result = task.create_indexed_catalog(files)
        if self.doReturnResults:
            return pipeBase.Struct(
                result=result,
            )


class DatasetConfig(pexConfig.Config):
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        default='cal_ref_cat',
        doc='String to pass to the butler to retrieve persisted files.',
    )
    indexer = IndexerRegistry.makeField(
        default='HTM',
        doc='Name of indexer algorithm to use. Default is HTM',
    )


class IngestIndexedReferenceConfig(pexConfig.Config):
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files. Default is to expect text files.'
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column",
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information. At least one entry is required."
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name, dec_name, and at least one entry in mag_column_list must be supplied.")
        if len(self.mag_err_column_map) > 0 and not len(self.mag_column_list) == len(self.mag_err_column_map):
            raise ValueError("If magnitude errors are provided, all magnitudes must have an error column")

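# An illustrative config override file for this task (the column and dataset
# names below are examples, not stack defaults):
#
#   config.ra_name = "RA_ICRS"
#   config.dec_name = "DE_ICRS"
#   config.mag_column_list = ["g", "r"]
#   config.mag_err_column_map = {"g": "g_err", "r": "r_err"}
#   config.dataset_config.ref_dataset_name = "my_ref_cat"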

class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
    """!Class for both producing indexed reference catalogs and for loading them.

    This implements an indexing scheme based on the hierarchical triangular mesh (HTM).
    Here "indexing" means breaking the catalog into localized chunks called shards;
    each shard contains the entries from the catalog that fall in a single HTM trixel.
    """
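
    # A minimal invocation sketch (repository path, file names, and config
    # values are illustrative; parseAndRun is the CmdLineTask entry point,
    # but the exact command-line syntax may differ between stack versions):
    #
    #   IngestIndexedReferenceTask.parseAndRun(args=[
    #       "/path/to/repo", "cat1.txt", "cat2.txt",
    #       "--config", "ra_name=RA", "dec_name=DEC",
    #       "--config", "mag_column_list=['g', 'r']"])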
    canMultiprocess = False
    ConfigClass = IngestIndexedReferenceConfig
    RunnerClass = IngestReferenceRunner
    _DefaultName = 'IngestIndexedReferenceTask'

    _flags = ['photometric', 'resolved', 'variable']

    @classmethod
    def _makeArgumentParser(cls):
        """Create an argument parser.

        This overrides the original because we need the file arguments.
        """
        parser = pipeBase.InputOnlyArgumentParser(name=cls._DefaultName)
        parser.add_argument("files", nargs="+", help="Names of files to index")
        return parser

    def __init__(self, *args, **kwargs):
        """!Constructor for the HTM indexing engine

        @param[in] butler  dafPersistence.Butler object for reading and writing catalogs
        """
        self.butler = kwargs.pop('butler')
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def create_indexed_catalog(self, files):
        """!Index a set of files comprising a reference catalog.

        Outputs are persisted in the data repository.

        @param[in] files  A list of file names to read.
        """
        rec_num = 0
        first = True
        for filename in files:
            arr = self.file_reader.run(filename)
            index_list = self.indexer.index_points(arr[self.config.ra_name], arr[self.config.dec_name])
            if first:
                schema, key_map = self.make_schema(arr.dtype)
                # persist empty catalog to hold the master schema
                dataId = self.indexer.make_data_id('master_schema',
                                                   self.config.dataset_config.ref_dataset_name)
                self.butler.put(self.get_catalog(dataId, schema), 'ref_cat',
                                dataId=dataId)
                first = False
            pixel_ids = set(index_list)
            for pixel_id in pixel_ids:
                dataId = self.indexer.make_data_id(pixel_id, self.config.dataset_config.ref_dataset_name)
                catalog = self.get_catalog(dataId, schema)
                els = np.where(index_list == pixel_id)
                for row in arr[els]:
                    record = catalog.addNew()
                    rec_num = self._fill_record(record, row, rec_num, key_map)
                self.butler.put(catalog, 'ref_cat', dataId=dataId)
        dataId = self.indexer.make_data_id(None, self.config.dataset_config.ref_dataset_name)
        self.butler.put(self.config.dataset_config, 'ref_cat_config', dataId=dataId)

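    # Output layout sketch (pixel ids are illustrative): for input rows that
    # index into trixels {42, 97}, the loop above writes one "ref_cat"
    # dataset per pixel_id, plus a "master_schema" catalog holding the schema
    # and a "ref_cat_config" dataset recording how the catalog was built.
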
    @staticmethod
    def compute_coord(row, ra_name, dec_name):
        """!Create an ICRS SpherePoint from a np.array row

        @param[in] row  dict-like object with ra/dec info in degrees
        @param[in] ra_name  name of RA key
        @param[in] dec_name  name of Dec key
        @returns ICRS SpherePoint constructed from the RA/Dec values
        """
        return lsst.geom.SpherePoint(row[ra_name], row[dec_name], lsst.geom.degrees)

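    # For example (values are illustrative):
    #   IngestIndexedReferenceTask.compute_coord({"ra": 10.0, "dec": -5.0}, "ra", "dec")
    # returns an lsst.geom.SpherePoint at (10.0, -5.0) degrees in ICRS.
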
    def _set_flags(self, record, row, key_map):
        """!Set the flags for a record. Relies on the _flags class attribute

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing flag info
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _set_mags(self, record, row, key_map):
        """!Set the flux records from the input magnitudes

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing magnitude values
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        for item in self.config.mag_column_list:
            record.set(key_map[item+'_flux'], fluxFromABMag(row[item]))
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                record.set(key_map[err_key+'_fluxSigma'],
                           fluxErrFromABMagErr(row[error_col_name], row[err_key]))

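    # The conversions above follow the standard AB relations (fluxFromABMag
    # and fluxErrFromABMagErr are provided by lsst.afw.image):
    #   flux = referenceFlux * 10**(-0.4 * mag)
    #   fluxErr = 0.4 * ln(10) * flux * magErr
    # where referenceFlux is the AB zero-point flux and the error follows by
    # first-order propagation through the first relation.
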
    def _set_extra(self, record, row, key_map):
        """!Copy the extra column information to the record

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing the column values
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string-like entries,
            # numpy stores them as its own internal type, a numpy.str_
            # object. This is a consequence of how numpy stores
            # string-like objects in fixed-width column arrays. This checks
            # whether any of the values to be added to the catalog are numpy
            # string types and, if so, casts them to a Python str,
            # which is what the C++-backed record objects expect.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(key_map[extra_col], value)

    def _fill_record(self, record, row, rec_num, key_map):
        """!Fill a record to put in the persisted indexed catalogs

        @param[in,out] record  afwTable.SourceRecord in a reference catalog to fill.
        @param[in] row  A row from a numpy array constructed from the input catalogs.
        @param[in] rec_num  Starting integer to increment for the unique id
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        record.setCoord(self.compute_coord(row, self.config.ra_name, self.config.dec_name))
        if self.config.id_name:
            record.setId(row[self.config.id_name])
        else:
            rec_num += 1
            record.setId(rec_num)
        # No parents
        record.setParent(-1)

        self._set_flags(record, row, key_map)
        self._set_mags(record, row, key_map)
        self._set_extra(record, row, key_map)
        return rec_num

    def get_catalog(self, dataId, schema):
        """!Get a catalog from the butler or create it if it doesn't exist

        @param[in] dataId  Identifier for catalog to retrieve
        @param[in] schema  Schema to use in catalog creation if the butler can't get it
        @returns table (an lsst.afw.table.SourceCatalog) for the specified identifier
        """
        if self.butler.datasetExists('ref_cat', dataId=dataId):
            return self.butler.get('ref_cat', dataId=dataId)
        return afwTable.SourceCatalog(schema)

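    # Note on the read-modify-write pattern above: because an existing shard
    # is fetched and appended to, re-running ingestion over the same input
    # files adds duplicate rows to shards already present in the repository.
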
    def make_schema(self, dtype):
        """!Make the schema to use in constructing the persisted catalogs.

        @param[in] dtype  A np.dtype to use in constructing the schema
        @returns a pair of items:
        - The schema for the output source catalog.
        - A map of catalog keys to use in filling the record
        """
        key_map = {}
        mag_column_list = self.config.mag_column_list
        mag_err_column_map = self.config.mag_err_column_map
        if len(mag_err_column_map) > 0 and (
                not len(mag_column_list) == len(mag_err_column_map) or
                not sorted(mag_column_list) == sorted(mag_err_column_map.keys())):
            raise ValueError("Every magnitude column must have a corresponding error column")
        # makes a schema with a coord, id and parent_id
        schema = afwTable.SourceTable.makeMinimalSchema()

        def add_field(name):
            if dtype[name].kind == 'U':
                # dealing with a string-like thing. Need to get type and size.
                at_type = str
                at_size = dtype[name].itemsize
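                # note: numpy unicode itemsize is in bytes, four per
                # character (e.g. np.dtype("U8").itemsize == 32), so the
                # string field is sized generously, by bytes not characters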
                return schema.addField(name, type=at_type, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for item in mag_column_list:
            key_map[item+'_flux'] = schema.addField(item+'_flux', float)
        if len(mag_err_column_map) > 0:
            for err_item in mag_err_column_map.keys():
                key_map[err_item+'_fluxSigma'] = schema.addField(err_item+'_fluxSigma', float)
        for flag in self._flags:
            attr_name = 'is_{}_name'.format(flag)
            if getattr(self.config, attr_name):
                key_map[flag] = schema.addField(flag, 'Flag')
        for col in self.config.extra_col_names:
            key_map[col] = add_field(col)
        return schema, key_map
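
    # Schema sketch (column names are illustrative): given a dtype of
    # [("id", "i8"), ("ra", "f8"), ("dec", "f8"), ("g", "f8")] with
    # mag_column_list = ["g"], the minimal (id, coord, parent) schema gains a
    # "g_flux" field and key_map maps "g_flux" to its schema key; with
    # mag_err_column_map = {"g": "g_err"} a "g_fluxSigma" field is added too.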