lsst.meas.algorithms  14.0-21-ge7d40960+3
ingestIndexReferenceTask.py
#
# LSST Data Management System
#
# Copyright 2008-2017 AURA/LSST.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program.  If not,
# see <https://www.lsstcorp.org/LegalNotices/>.
#
from __future__ import absolute_import, division, print_function

__all__ = ["IngestIndexedReferenceConfig", "IngestIndexedReferenceTask", "DatasetConfig"]

import numpy as np

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.afw.table as afwTable
import lsst.afw.geom as afwGeom
from lsst.afw.image import fluxFromABMag, fluxErrFromABMagErr
from .indexerRegistry import IndexerRegistry
from .readTextCatalogTask import ReadTextCatalogTask

class IngestReferenceRunner(pipeBase.TaskRunner):
    """!Task runner for the reference catalog ingester.

    Data IDs are ignored, so the runner simply runs the task once on the parsed command.
    """

    def run(self, parsedCmd):
        """!Run the task.

        Several arguments need to be collected and passed on to the task methods.

        @param[in] parsedCmd  Parsed command including command-line arguments.
        @return Struct containing the result of the indexing.
        """
        files = parsedCmd.files
        butler = parsedCmd.butler
        task = self.TaskClass(config=self.config, log=self.log, butler=butler)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

        result = task.create_indexed_catalog(files)
        if self.doReturnResults:
            return pipeBase.Struct(
                result=result,
            )


class DatasetConfig(pexConfig.Config):
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        default='cal_ref_cat',
        doc='String to pass to the butler to retrieve persisted files.',
    )
    indexer = IndexerRegistry.makeField(
        default='HTM',
        doc='Name of indexer algorithm to use.  Default is HTM.',
    )

class IngestIndexedReferenceConfig(pexConfig.Config):
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files.  Default is to expect text files.'
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column",
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information.  At least one entry is required."
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name, dec_name, and at least one entry in mag_column_list must be supplied.")
        if self.mag_err_column_map and len(self.mag_column_list) != len(self.mag_err_column_map):
            raise ValueError("If magnitude errors are provided, all magnitudes must have an error column")
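
# A minimal sketch of a config override file for this task.  The column names
# ('RA', 'DEC', 'g', 'r', 'g_err', 'r_err') and the dataset name are
# hypothetical; they must match the input catalog being ingested:
#
#   config.ra_name = 'RA'
#   config.dec_name = 'DEC'
#   config.mag_column_list = ['g', 'r']
#   config.mag_err_column_map = {'g': 'g_err', 'r': 'r_err'}
#   config.dataset_config.ref_dataset_name = 'my_ref_cat'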


class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
    """!Class for both producing indexed reference catalogs and for loading them.

    This implements an indexing scheme based on the hierarchical triangular
    mesh (HTM).  Here "index" means breaking the catalog into localized chunks,
    called shards; each shard contains the catalog entries that fall in a
    single HTM trixel.
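
    A minimal usage sketch; `butler`, `config`, and the input file name here
    are illustrative assumptions, not part of this module:

        task = IngestIndexedReferenceTask(butler=butler, config=config)
        task.create_indexed_catalog(['myCatalog.txt'])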
    """
    canMultiprocess = False
    ConfigClass = IngestIndexedReferenceConfig
    RunnerClass = IngestReferenceRunner
    _DefaultName = 'IngestIndexedReferenceTask'

    _flags = ['photometric', 'resolved', 'variable']

    @classmethod
    def _makeArgumentParser(cls):
        """Create an argument parser.

        This overrides the default parser because the task operates on a list
        of input files rather than on data IDs.
        """
        parser = pipeBase.InputOnlyArgumentParser(name=cls._DefaultName)
        parser.add_argument("files", nargs="+", help="Names of files to index")
        return parser

    def __init__(self, *args, **kwargs):
        """!Constructor for the HTM indexing engine.

        @param[in] butler  dafPersistence.Butler object for reading and writing catalogs;
                           must be supplied as a keyword argument.
        """
        self.butler = kwargs.pop('butler')
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def create_indexed_catalog(self, files):
        """!Index a set of files comprising a reference catalog.

        Outputs are persisted in the data repository.

        @param[in] files  A list of file names to read.
        """
        rec_num = 0
        first = True
        for filename in files:
            arr = self.file_reader.run(filename)
            index_list = self.indexer.index_points(arr[self.config.ra_name], arr[self.config.dec_name])
            if first:
                schema, key_map = self.make_schema(arr.dtype)
                # persist an empty catalog to hold the master schema
                dataId = self.indexer.make_data_id('master_schema',
                                                   self.config.dataset_config.ref_dataset_name)
                self.butler.put(self.get_catalog(dataId, schema), 'ref_cat',
                                dataId=dataId)
                first = False
            pixel_ids = set(index_list)
            for pixel_id in pixel_ids:
                dataId = self.indexer.make_data_id(pixel_id, self.config.dataset_config.ref_dataset_name)
                catalog = self.get_catalog(dataId, schema)
                els = np.where(index_list == pixel_id)
                for row in arr[els]:
                    record = catalog.addNew()
                    rec_num = self._fill_record(record, row, rec_num, key_map)
                self.butler.put(catalog, 'ref_cat', dataId=dataId)
        dataId = self.indexer.make_data_id(None, self.config.dataset_config.ref_dataset_name)
        self.butler.put(self.config.dataset_config, 'ref_cat_config', dataId=dataId)
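        # Net result: one 'ref_cat' shard per populated HTM pixel, an empty
        # 'ref_cat' entry holding the master schema, and a persisted
        # 'ref_cat_config' recording how the catalog was indexed.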

    @staticmethod
    def compute_coord(row, ra_name, dec_name):
        """!Create an ICRS SpherePoint from a row of a np.array.

        @param[in] row  dict-like object with RA/Dec info in degrees
        @param[in] ra_name  name of the RA key
        @param[in] dec_name  name of the Dec key
        @return ICRS SpherePoint constructed from the RA/Dec values
        """
        return afwGeom.SpherePoint(row[ra_name], row[dec_name], afwGeom.degrees)

    def _set_flags(self, record, row, key_map):
        """!Set the flags for a record.  Relies on the _flags class attribute.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing flag info
        @param[in] key_map  map of catalog keys to use in filling the record
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _set_mags(self, record, row, key_map):
        """!Set the flux records from the input magnitudes.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing magnitude values
        @param[in] key_map  map of catalog keys to use in filling the record
        """
        for item in self.config.mag_column_list:
            record.set(key_map[item+'_flux'], fluxFromABMag(row[item]))
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                record.set(key_map[err_key+'_fluxSigma'],
                           fluxErrFromABMagErr(row[error_col_name], row[err_key]))
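        # The AB system defines flux proportional to 10**(-0.4 * mag), so
        # first-order error propagation gives fluxErr = (ln 10 / 2.5) * flux *
        # magErr; that is the quantity fluxErrFromABMagErr computes from the
        # magnitude error and magnitude.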

    def _set_extra(self, record, row, key_map):
        """!Copy the extra column information to the record.

        @param[in,out] record  SourceCatalog record to modify
        @param[in] row  dict-like object containing the column values
        @param[in] key_map  map of catalog keys to use in filling the record
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string-like entries,
            # numpy stores them as its own internal type, numpy.str_,
            # a consequence of how numpy stores string-like objects in
            # fixed-width column arrays.  Check whether each value to be
            # added to the catalog is a numpy string type, and if so cast
            # it to a Python str, which is what the C++-backed records
            # expect.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(key_map[extra_col], value)

    def _fill_record(self, record, row, rec_num, key_map):
        """!Fill a record to put in the persisted indexed catalogs.

        @param[in,out] record  afwTable.SourceRecord in a reference catalog to fill
        @param[in] row  a row from a numpy array constructed from the input catalogs
        @param[in] rec_num  starting integer to increment for the unique id
        @param[in] key_map  map of catalog keys to use in filling the record
        @return the updated rec_num, so ids stay unique across calls
        """
        record.setCoord(self.compute_coord(row, self.config.ra_name, self.config.dec_name))
        if self.config.id_name:
            record.setId(row[self.config.id_name])
        else:
            rec_num += 1
            record.setId(rec_num)
        # No parents
        record.setParent(-1)

        self._set_flags(record, row, key_map)
        self._set_mags(record, row, key_map)
        self._set_extra(record, row, key_map)
        return rec_num

    def get_catalog(self, dataId, schema):
        """!Get a catalog from the butler, or create it if it doesn't exist.

        @param[in] dataId  identifier for the catalog to retrieve
        @param[in] schema  schema to use in catalog creation if the butler can't get it
        @return afwTable.SourceCatalog for the specified identifier
        """
        if self.butler.datasetExists('ref_cat', dataId=dataId):
            return self.butler.get('ref_cat', dataId=dataId)
        return afwTable.SourceCatalog(schema)

    def make_schema(self, dtype):
        """!Make the schema to use in constructing the persisted catalogs.

        @param[in] dtype  a np.dtype describing the columns of the input catalog
        @return a tuple of (the schema for the output source catalog,
                a map of catalog keys to use in filling the record)
        """
        key_map = {}
        mag_column_list = self.config.mag_column_list
        mag_err_column_map = self.config.mag_err_column_map
        if len(mag_err_column_map) > 0 and (
                not len(mag_column_list) == len(mag_err_column_map) or
                not sorted(mag_column_list) == sorted(mag_err_column_map.keys())):
            raise ValueError("Every magnitude column must have a corresponding error column")
        # make a schema with coord, id, and parent_id fields
        schema = afwTable.SourceTable.makeMinimalSchema()

        def add_field(name):
            if dtype[name].kind == 'U':
                # Dealing with a string-like column: need both its type and size.
                at_type = str
                at_size = dtype[name].itemsize
                return schema.addField(name, type=at_type, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)
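
        # For illustration (hypothetical column names): a text catalog with
        # columns (id, RA, DEC, g, g_err, morph) might produce a structured
        # array whose dtype is
        #   [('id', '<i8'), ('RA', '<f8'), ('DEC', '<f8'),
        #    ('g', '<f8'), ('g_err', '<f8'), ('morph', '<U8')];
        # the unicode 'morph' column has kind 'U' and takes the string branch
        # of add_field above.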

        for item in mag_column_list:
            key_map[item+'_flux'] = schema.addField(item+'_flux', float)
        if len(mag_err_column_map) > 0:
            for err_item in mag_err_column_map.keys():
                key_map[err_item+'_fluxSigma'] = schema.addField(err_item+'_fluxSigma', float)
        for flag in self._flags:
            attr_name = 'is_{}_name'.format(flag)
            if getattr(self.config, attr_name):
                key_map[flag] = schema.addField(flag, 'Flag')
        for col in self.config.extra_col_names:
            key_map[col] = add_field(col)
        return schema, key_map