#
# LSST Data Management System
#
# Copyright 2008-2017 AURA/LSST.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <https://www.lsstcorp.org/LegalNotices/>.
#
from __future__ import absolute_import, division, print_function

__all__ = ["IngestIndexedReferenceConfig", "IngestIndexedReferenceTask", "DatasetConfig"]

import numpy as np

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.afw.table as afwTable
import lsst.afw.coord as afwCoord
import lsst.afw.geom as afwGeom
from lsst.afw.image import fluxFromABMag, fluxErrFromABMagErr
from .indexerRegistry import IndexerRegistry
from .readTextCatalogTask import ReadTextCatalogTask


class IngestReferenceRunner(pipeBase.TaskRunner):
    """!Task runner for the reference catalog ingester.

    Data IDs are ignored, so the runner simply runs the task on the parsed command.
    """

    def run(self, parsedCmd):
        """!Run the task.

        Several arguments need to be collected from the parsed command to send
        on to the task methods.

        @param[in] parsedCmd  Parsed command including command line arguments.
        @return Struct containing the result of the indexing.
        """
        files = parsedCmd.files
        butler = parsedCmd.butler
        task = self.TaskClass(config=self.config, log=self.log, butler=butler)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

        result = task.create_indexed_catalog(files)
        if self.doReturnResults:
            return pipeBase.Struct(
                result=result,
            )


class DatasetConfig(pexConfig.Config):
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        default='cal_ref_cat',
        doc='String to pass to the butler to retrieve persisted files.',
    )
    indexer = IndexerRegistry.makeField(
        default='HTM',
        doc='Name of indexer algorithm to use. Default is HTM.',
    )

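# Example override touching the DatasetConfig fields (values are illustrative,
# not defaults; "HTM" is the default indexer, and `depth` is assumed to be the
# HTM indexer's subdivision parameter in this release):
#
#   config.dataset_config.ref_dataset_name = "my_ref_cat"
#   config.dataset_config.indexer.name = "HTM"
#   config.dataset_config.indexer["HTM"].depth = 8
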
class IngestIndexedReferenceConfig(pexConfig.Config):
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files. Default is to expect text files.',
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column",
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information. At least one entry is required.",
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value).",
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is suitable for photometric calibration (optional).',
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).',
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).',
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).',
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.',
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError("ra_name, dec_name, and at least one entry in mag_column_list "
                             "must be supplied.")
        if self.mag_err_column_map and len(self.mag_column_list) != len(self.mag_err_column_map):
            raise ValueError("If magnitude errors are provided, all magnitudes must have an error column")


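# A minimal override sketch that satisfies validate(); the column names are
# hypothetical placeholders for whatever the input text files actually use:
#
#   config.ra_name = "RA"
#   config.dec_name = "DEC"
#   config.mag_column_list = ["g", "r"]
#   config.mag_err_column_map = {"g": "g_err", "r": "r_err"}
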
class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
    """!Class for both producing indexed reference catalogs and for loading them.

    This implements an indexing scheme based on the hierarchical triangular
    mesh (HTM). Here "indexing" means breaking the catalog into localized
    chunks called shards; each shard contains the catalog entries that fall
    in a single HTM trixel.
    """
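    # Typical invocation through the command-line machinery (a sketch; the
    # repository path and catalog file names are hypothetical):
    #
    #   IngestIndexedReferenceTask.parseAndRun(
    #       args=["/path/to/repo", "refcat1.txt", "refcat2.txt",
    #             "--configfile", "my_ingest_config.py"])
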
    canMultiprocess = False
    ConfigClass = IngestIndexedReferenceConfig
    RunnerClass = IngestReferenceRunner
    _DefaultName = 'IngestIndexedReferenceTask'

    _flags = ['photometric', 'resolved', 'variable']

    @classmethod
    def _makeArgumentParser(cls):
        """Create an argument parser.

        This overrides the base class implementation because we need the file arguments.
        """
        parser = pipeBase.InputOnlyArgumentParser(name=cls._DefaultName)
        parser.add_argument("files", nargs="+", help="Names of files to index")
        return parser

    def __init__(self, *args, **kwargs):
        """!Construct the HTM indexing engine.

        @param[in] butler  dafPersistence.Butler object for reading and writing catalogs
        """
        self.butler = kwargs.pop('butler')
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def create_indexed_catalog(self, files):
        """!Index a set of files comprising a reference catalog.

        Outputs are persisted in the data repository.

        @param[in] files  A list of file names to read.
        """
        rec_num = 0
        first = True
        for filename in files:
            arr = self.file_reader.run(filename)
            index_list = self.indexer.index_points(arr[self.config.ra_name], arr[self.config.dec_name])
            if first:
                schema, key_map = self.make_schema(arr.dtype)
                # persist an empty catalog to hold the master schema
                dataId = self.indexer.make_data_id('master_schema',
                                                   self.config.dataset_config.ref_dataset_name)
                self.butler.put(self.get_catalog(dataId, schema), 'ref_cat', dataId=dataId)
                first = False
            pixel_ids = set(index_list)
            for pixel_id in pixel_ids:
                dataId = self.indexer.make_data_id(pixel_id, self.config.dataset_config.ref_dataset_name)
                catalog = self.get_catalog(dataId, schema)
                els = np.where(index_list == pixel_id)
                for row in arr[els]:
                    record = catalog.addNew()
                    rec_num = self._fill_record(record, row, rec_num, key_map)
                self.butler.put(catalog, 'ref_cat', dataId=dataId)
        dataId = self.indexer.make_data_id(None, self.config.dataset_config.ref_dataset_name)
        self.butler.put(self.config.dataset_config, 'ref_cat_config', dataId=dataId)

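    # create_indexed_catalog above leaves one persisted SourceCatalog per
    # occupied HTM trixel, plus a 'master_schema' catalog and the persisted
    # dataset config. The on-disk layout is controlled by the butler mapper;
    # shown here only schematically:
    #
    #   ref_cats/cal_ref_cat/<pixel_id>.fits
    #   ref_cats/cal_ref_cat/master_schema.fits
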
    @staticmethod
    def compute_coord(row, ra_name, dec_name):
        """!Create an afwCoord object from a numpy array row.

        @param[in] row  dict-like object with RA/Dec info in degrees
        @param[in] ra_name  name of the RA key
        @param[in] dec_name  name of the Dec key
        @return IcrsCoord object constructed from the RA/Dec values
        """
        return afwCoord.IcrsCoord(row[ra_name]*afwGeom.degrees,
                                  row[dec_name]*afwGeom.degrees)

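    # For example (hypothetical column names), compute_coord(row, "RA", "DEC")
    # with row["RA"] == 10.5 and row["DEC"] == -3.25 yields
    # afwCoord.IcrsCoord(10.5*afwGeom.degrees, -3.25*afwGeom.degrees).
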
    def _set_flags(self, record, row, key_map):
        """!Set the flags for a record. Relies on the _flags class attribute.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing flag info
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _set_mags(self, record, row, key_map):
        """!Set the flux records from the input magnitudes.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing magnitude values
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        for item in self.config.mag_column_list:
            record.set(key_map[item+'_flux'], fluxFromABMag(row[item]))
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                record.set(key_map[err_key+'_fluxSigma'],
                           fluxErrFromABMagErr(row[error_col_name], row[err_key]))

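    # For reference: for an AB magnitude m with error dm, the propagated linear
    # flux error computed by fluxErrFromABMagErr(dm, m) is
    #
    #   dF = F * ln(10)/2.5 * dm,   where F = fluxFromABMag(m),
    #
    # i.e. the first-order error propagation of F = c * 10**(-0.4*m).
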
    def _set_extra(self, record, row, key_map):
        """!Copy the extra column information to the record.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing the column values
        @param[in] key_map  Map of catalog keys to use in filling the record
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # String-like entries read from a text file are stored by numpy in
            # its own fixed-width internal type, numpy.str_, a consequence of
            # how numpy stores string-like objects in structured arrays. Cast
            # any such value to a plain Python string, which is what the
            # C++-backed records expect.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(key_map[extra_col], value)

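    # Illustration of the numpy.str_ cast in _set_extra above (the dtype and
    # field name are hypothetical): for
    #   arr = np.array([("a note",)], dtype=[("note", "U16")])
    # the value arr[0]["note"] is a numpy.str_, and str() coerces it to the
    # plain str that record.set() accepts.
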
    def _fill_record(self, record, row, rec_num, key_map):
        """!Fill a record to put in the persisted indexed catalogs.

        @param[in,out] record  afwTable.SourceRecord in a reference catalog to fill
        @param[in] row  A row from a numpy array constructed from the input catalogs
        @param[in] rec_num  Starting integer to increment for the unique id
        @param[in] key_map  Map of catalog keys to use in filling the record
        @return The updated rec_num, for use as the starting value of the next call
        """
        record.setCoord(self.compute_coord(row, self.config.ra_name, self.config.dec_name))
        if self.config.id_name:
            record.setId(row[self.config.id_name])
        else:
            rec_num += 1
            record.setId(rec_num)
        # No parents
        record.setParent(-1)

        self._set_flags(record, row, key_map)
        self._set_mags(record, row, key_map)
        self._set_extra(record, row, key_map)
        return rec_num

    def get_catalog(self, dataId, schema):
        """!Get a catalog from the butler, or create it if it doesn't exist.

        @param[in] dataId  Identifier for the catalog to retrieve
        @param[in] schema  Schema to use in catalog creation if the butler can't get it
        @return afwTable.SourceCatalog for the specified identifier
        """
        if self.butler.datasetExists('ref_cat', dataId=dataId):
            return self.butler.get('ref_cat', dataId=dataId)
        return afwTable.SourceCatalog(schema)

    def make_schema(self, dtype):
        """!Make the schema to use in constructing the persisted catalogs.

        @param[in] dtype  A np.dtype to use in constructing the schema
        @return a pair: the schema for the output source catalog, and a map of
            catalog keys to use in filling the record
        """
        key_map = {}
        mag_column_list = self.config.mag_column_list
        mag_err_column_map = self.config.mag_err_column_map
        if len(mag_err_column_map) > 0 and (
                len(mag_column_list) != len(mag_err_column_map) or
                sorted(mag_column_list) != sorted(mag_err_column_map.keys())):
            raise ValueError("Every magnitude column must have a corresponding error column")
        # make a schema with a coord, id and parent_id
        schema = afwTable.SourceTable.makeMinimalSchema()

        def add_field(name):
            if dtype[name].kind == 'U':
                # dealing with a string-like field; need both type and size
                at_type = str
                at_size = dtype[name].itemsize
                return schema.addField(name, type=at_type, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for item in mag_column_list:
            key_map[item+'_flux'] = schema.addField(item+'_flux', float)
        if len(mag_err_column_map) > 0:
            for err_item in mag_err_column_map.keys():
                key_map[err_item+'_fluxSigma'] = schema.addField(err_item+'_fluxSigma', float)
        for flag in self._flags:
            attr_name = 'is_{}_name'.format(flag)
            if getattr(self.config, attr_name):
                key_map[flag] = schema.addField(flag, 'Flag')
        for col in self.config.extra_col_names:
            key_map[col] = add_field(col)
        return schema, key_map
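

# End-to-end sketch of driving the task from Python rather than the command
# line (repository path, file name, and column names are all hypothetical):
#
#   import lsst.daf.persistence as dafPersistence
#   butler = dafPersistence.Butler("/path/to/repo")
#   config = IngestIndexedReferenceTask.ConfigClass()
#   config.ra_name, config.dec_name = "RA", "DEC"
#   config.mag_column_list = ["g"]
#   task = IngestIndexedReferenceTask(config=config, butler=butler)
#   task.create_indexed_catalog(["my_catalog.txt"])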