lsst.meas.algorithms  14.0-18-gf7dca964+3
ingestIndexReferenceTask.py
#
# LSST Data Management System
#
# Copyright 2008-2017 AURA/LSST.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <https://www.lsstcorp.org/LegalNotices/>.
#
from __future__ import absolute_import, division, print_function

__all__ = ["IngestIndexedReferenceConfig", "IngestIndexedReferenceTask", "DatasetConfig"]

import numpy as np

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.afw.table as afwTable
import lsst.afw.coord as afwCoord
import lsst.afw.geom as afwGeom
from lsst.afw.image import fluxFromABMag, fluxErrFromABMagErr
from .indexerRegistry import IndexerRegistry
from .readTextCatalogTask import ReadTextCatalogTask


class IngestReferenceRunner(pipeBase.TaskRunner):
    """!Task runner for the reference catalog ingester.

    Data IDs are ignored, so the runner simply runs the task on the parsed command.
    """

    def run(self, parsedCmd):
        """!Run the task.

        Several arguments need to be collected to send on to the task methods.

        @param[in] parsedCmd  Parsed command including command line arguments.
        @return Struct containing the result of the indexing.
        """
        files = parsedCmd.files
        butler = parsedCmd.butler
        task = self.TaskClass(config=self.config, log=self.log, butler=butler)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

        result = task.create_indexed_catalog(files)
        if self.doReturnResults:
            return pipeBase.Struct(
                result=result,
            )


class DatasetConfig(pexConfig.Config):
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        default='cal_ref_cat',
        doc='String to pass to the butler to retrieve persisted files.',
    )
    indexer = IndexerRegistry.makeField(
        default='HTM',
        doc='Name of indexer algorithm to use. Default is HTM.',
    )


class IngestIndexedReferenceConfig(pexConfig.Config):
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files. Default is to expect text files.'
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column",
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information. At least one entry is required."
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError("ra_name and dec_name and at least one entry in mag_column_list must be" +
                             " supplied.")
        if len(self.mag_err_column_map) > 0 and not len(self.mag_column_list) == len(self.mag_err_column_map):
            raise ValueError("If magnitude errors are provided, all magnitudes must have an error column")

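# A minimal configuration sketch; the column names below are hypothetical
# and would be adapted to the catalog being ingested, e.g. in a config
# override file passed via --configfile:
#
#     config.ra_name = "ra"
#     config.dec_name = "dec"
#     config.id_name = "obj_id"
#     config.mag_column_list = ["g", "r"]
#     config.mag_err_column_map = {"g": "g_err", "r": "r_err"}
#     config.dataset_config.ref_dataset_name = "my_ref_cat"

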
class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
    """!Class for both producing indexed reference catalogs and for loading them.

    This implements an indexing scheme based on the hierarchical triangular
    mesh (HTM). Here "indexing" means breaking the catalog into localized
    chunks called shards; each shard contains the entries from the catalog
    that fall in a single HTM trixel.
    """
    canMultiprocess = False
    ConfigClass = IngestIndexedReferenceConfig
    RunnerClass = IngestReferenceRunner
    _DefaultName = 'IngestIndexedReferenceTask'

    _flags = ['photometric', 'resolved', 'variable']

    @classmethod
    def _makeArgumentParser(cls):
        """Create an argument parser.

        This overrides the standard parser because we need the file arguments.
        """
        parser = pipeBase.InputOnlyArgumentParser(name=cls._DefaultName)
        parser.add_argument("files", nargs="+", help="Names of files to index")
        return parser

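    # From the command line, the task is normally invoked through its
    # driver script; the script name and arguments here are illustrative,
    # not guaranteed:
    #
    #     ingestReferenceCatalog.py /path/to/repo cat1.txt cat2.txt \
    #         --configfile my_config.py
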
    def __init__(self, *args, **kwargs):
        """!Constructor for the HTM indexing engine.

        @param[in] butler  dafPersistence.Butler object for reading and writing catalogs
        """
        self.butler = kwargs.pop('butler')
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def create_indexed_catalog(self, files):
        """!Index a set of files comprising a reference catalog.

        Outputs are persisted in the data repository.

        @param[in] files  A list of file names to read.
        """
        rec_num = 0
        first = True
        for filename in files:
            arr = self.file_reader.run(filename)
            index_list = self.indexer.index_points(arr[self.config.ra_name], arr[self.config.dec_name])
            if first:
                schema, key_map = self.make_schema(arr.dtype)
                # persist an empty catalog to hold the master schema
                dataId = self.indexer.make_data_id('master_schema',
                                                   self.config.dataset_config.ref_dataset_name)
                self.butler.put(self.get_catalog(dataId, schema), 'ref_cat',
                                dataId=dataId)
                first = False
            pixel_ids = set(index_list)
            for pixel_id in pixel_ids:
                dataId = self.indexer.make_data_id(pixel_id, self.config.dataset_config.ref_dataset_name)
                catalog = self.get_catalog(dataId, schema)
                els = np.where(index_list == pixel_id)
                for row in arr[els]:
                    record = catalog.addNew()
                    rec_num = self._fill_record(record, row, rec_num, key_map)
                self.butler.put(catalog, 'ref_cat', dataId=dataId)
        dataId = self.indexer.make_data_id(None, self.config.dataset_config.ref_dataset_name)
        self.butler.put(self.config.dataset_config, 'ref_cat_config', dataId=dataId)

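    # A sketch of calling this directly (the task runner normally does so;
    # the file names are hypothetical):
    #
    #     task = IngestIndexedReferenceTask(config=config, butler=butler)
    #     task.create_indexed_catalog(["cat_part1.txt", "cat_part2.txt"])
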
    @staticmethod
    def compute_coord(row, ra_name, dec_name):
        """!Create an afwCoord object from a np.array row.

        @param[in] row  dict-like object with RA/Dec info in degrees
        @param[in] ra_name  name of RA key
        @param[in] dec_name  name of Dec key
        @return IcrsCoord object constructed from the RA/Dec values
        """
        return afwCoord.IcrsCoord(row[ra_name]*afwGeom.degrees,
                                  row[dec_name]*afwGeom.degrees)

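    # For example (the values are illustrative):
    #
    #     row = {'ra': 10.0, 'dec': -5.0}
    #     coord = IngestIndexedReferenceTask.compute_coord(row, 'ra', 'dec')
    #     # coord is an afwCoord.IcrsCoord at ICRS (10.0, -5.0) degrees
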
    def _set_flags(self, record, row, key_map):
        """!Set the flags for a record. Relies on the _flags class attribute.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing flag info
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _set_mags(self, record, row, key_map):
        """!Set the flux records from the input magnitudes.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing magnitude values
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        for item in self.config.mag_column_list:
            record.set(key_map[item+'_flux'], fluxFromABMag(row[item]))
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                record.set(key_map[err_key+'_fluxSigma'],
                           fluxErrFromABMagErr(row[error_col_name], row[err_key]))

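    # fluxFromABMag maps an AB magnitude m to a flux proportional to
    # 10**(-0.4*m) (the absolute scale is set by the AB zero point in this
    # afw version), and fluxErrFromABMagErr applies the usual first-order
    # error propagation, sigma_flux ~= flux * (ln(10)/2.5) * sigma_mag.
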
    def _set_extra(self, record, row, key_map):
        """!Copy the extra column information to the record.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing the column values
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string-like entries,
            # numpy stores them as its own internal type, numpy.str_.
            # This is a consequence of how numpy stores string-like objects
            # in fixed-width column arrays. Check whether any of the values
            # to be added to the catalog are numpy string types and, if so,
            # cast them to Python strings, which is what the C++ layer
            # underlying the records expects.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(key_map[extra_col], value)

    def _fill_record(self, record, row, rec_num, key_map):
        """!Fill a record to put in the persisted indexed catalogs.

        @param[in,out] record  afwTable.SourceRecord in a reference catalog to fill.
        @param[in] row  A row from a numpy array constructed from the input catalogs.
        @param[in] rec_num  Starting integer to increment for the unique id
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        record.setCoord(self.compute_coord(row, self.config.ra_name, self.config.dec_name))
        if self.config.id_name:
            record.setId(row[self.config.id_name])
        else:
            rec_num += 1
            record.setId(rec_num)
        # No parents
        record.setParent(-1)

        self._set_flags(record, row, key_map)
        self._set_mags(record, row, key_map)
        self._set_extra(record, row, key_map)
        return rec_num

    def get_catalog(self, dataId, schema):
        """!Get a catalog from the butler or create it if it doesn't exist.

        @param[in] dataId  Identifier for the catalog to retrieve
        @param[in] schema  Schema to use in catalog creation if the butler can't get it
        @return afwTable.SourceCatalog for the specified identifier
        """
        if self.butler.datasetExists('ref_cat', dataId=dataId):
            return self.butler.get('ref_cat', dataId=dataId)
        return afwTable.SourceCatalog(schema)

    def make_schema(self, dtype):
        """!Make the schema to use in constructing the persisted catalogs.

        @param[in] dtype  A np.dtype to use in constructing the schema
        @return a tuple of (the schema for the output source catalog,
            a map of catalog keys to use in filling the record)
        """
        key_map = {}
        mag_column_list = self.config.mag_column_list
        mag_err_column_map = self.config.mag_err_column_map
        if len(mag_err_column_map) > 0 and (
                not len(mag_column_list) == len(mag_err_column_map) or
                not sorted(mag_column_list) == sorted(mag_err_column_map.keys())):
            raise ValueError("Every magnitude column must have a corresponding error column")
        # make a schema with a coord, id and parent_id
        schema = afwTable.SourceTable.makeMinimalSchema()

        def add_field(name):
            if dtype[name].kind == 'U':
                # dealing with a string-like field; need to get the type and size
                at_type = str
                at_size = dtype[name].itemsize
                return schema.addField(name, type=at_type, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for item in mag_column_list:
            key_map[item+'_flux'] = schema.addField(item+'_flux', float)
        if len(mag_err_column_map) > 0:
            for err_item in mag_err_column_map.keys():
                key_map[err_item+'_fluxSigma'] = schema.addField(err_item+'_fluxSigma', float)
        for flag in self._flags:
            attr_name = 'is_{}_name'.format(flag)
            if getattr(self.config, attr_name):
                key_map[flag] = schema.addField(flag, 'Flag')
        for col in self.config.extra_col_names:
            key_map[col] = add_field(col)
        return schema, key_map
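

# A minimal end-to-end sketch using the generic CmdLineTask entry point;
# the repository path, catalog file, and config file names below are
# hypothetical:
#
#     from lsst.meas.algorithms.ingestIndexReferenceTask import IngestIndexedReferenceTask
#     IngestIndexedReferenceTask.parseAndRun(
#         args=["/path/to/repo", "my_catalog.txt", "--configfile", "my_config.py"])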