lsst.meas.algorithms  18.0.0-5-ga38416e7
ingestIndexReferenceTask.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 #
4 # Copyright 2008-2017 AURA/LSST.
5 #
6 # This product includes software developed by the
7 # LSST Project (http://www.lsst.org/).
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the LSST License Statement and
20 # the GNU General Public License along with this program. If not,
21 # see <https://www.lsstcorp.org/LegalNotices/>.
22 #
23 
24 __all__ = ["IngestIndexedReferenceConfig", "IngestIndexedReferenceTask", "DatasetConfig"]
25 
26 import os.path
27 
28 import lsst.pex.config as pexConfig
29 import lsst.pipe.base as pipeBase
30 import lsst.geom
31 import lsst.sphgeom
32 import lsst.afw.table as afwTable
33 from lsst.daf.base import PropertyList
34 from .indexerRegistry import IndexerRegistry
35 from .readTextCatalogTask import ReadTextCatalogTask
36 from .loadReferenceObjects import LoadReferenceObjectsTask
37 from .ingestIndexManager import IngestIndexManager
38 
# The most recent Indexed Reference Catalog on-disk format version.
# Version 0 stored fluxes in Jy, version 1 in nJy (see DatasetConfig.format_version);
# newly ingested catalogs are always written at this version.
LATEST_FORMAT_VERSION = 1
41 
42 
def addRefCatMetadata(catalog):
    """Attach format-version metadata to a new (not yet populated)
    reference catalog.

    Parameters
    ----------
    catalog : `lsst.afw.table.SimpleCatalog`
        Catalog to which metadata should be attached. Will be modified
        in-place.
    """
    metadata = catalog.getMetadata()
    if metadata is None:
        metadata = PropertyList()
    metadata.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
    catalog.setMetadata(metadata)
57 
58 
class IngestReferenceRunner(pipeBase.TaskRunner):
    """Task runner for the reference catalog ingester.

    Data IDs are ignored so the runner should just run the task on the
    parsed command.
    """

    def run(self, parsedCmd):
        """Run the task.

        Several arguments need to be collected to send on to the task methods.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command.

        Returns
        -------
        results : `lsst.pipe.base.Struct` or `None`
            An empty struct if self.doReturnResults, else None.
        """
        task = self.TaskClass(config=self.config, log=self.log, butler=parsedCmd.butler)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

        task.createIndexedCatalog(parsedCmd.files)
        if self.doReturnResults:
            return pipeBase.Struct()
88 
89 
class DatasetConfig(pexConfig.Config):
    """The description of the on-disk storage format for the persisted
    reference catalog.
    """
    format_version = pexConfig.Field(
        dtype=int,
        doc="Version number of the persisted on-disk storage format."
        "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
        "\nVersion 1 had nJy as flux units.",
        # This needs to always be 0, so that unversioned catalogs are
        # interpreted as version 0.
        default=0
    )
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        default='cal_ref_cat',
        doc='String to pass to the butler to retrieve persisted files.',
    )
    indexer = IndexerRegistry.makeField(
        # Fixed typo in user-visible doc string: "algoritm" -> "algorithm".
        default='HTM',
        doc='Name of indexer algorithm to use. Default is HTM',
    )
110 
111 
class IngestIndexedReferenceConfig(pexConfig.Config):
    """Configuration for IngestIndexedReferenceTask.

    Describes how to interpret the input catalog files (which columns hold
    positions, magnitudes, proper motions, etc., and scale factors to apply)
    and, via ``dataset_config``, how the ingested catalog is persisted.
    """
    # --- Output/storage description and ingestion machinery ---
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    n_processes = pexConfig.Field(
        dtype=int,
        doc=("Number of python processes to use when ingesting."),
        default=1
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files. Default is to expect text files.'
    )
    # --- Required position columns (decimal degrees; see task docstring) ---
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column",
    )
    # Position errors: validate() requires both or neither to be set.
    ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA error column",
        optional=True,
    )
    dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec error column",
        optional=True,
    )
    # --- Photometry columns ---
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information.  At least one entry is required."
    )
    # If non-empty, validate() requires its keys to exactly match mag_column_list.
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    # --- Optional per-object flag and identifier columns ---
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    # --- Proper motion columns (validate() requires RA/Dec pairs together,
    #     and an epoch whenever proper motion or parallax is configured) ---
    pm_ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA column",
        optional=True,
    )
    pm_dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec column",
        optional=True,
    )
    pm_ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA error column",
        optional=True,
    )
    pm_dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec error column",
        optional=True,
    )
    pm_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
        default=1.0,
    )
    # --- Parallax columns ---
    parallax_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax column",
        optional=True,
    )
    parallax_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax error column",
        optional=True,
    )
    parallax_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
        default=1.0,
    )
    # --- Epoch columns (all three must be set together; see validate()) ---
    epoch_name = pexConfig.Field(
        dtype=str,
        doc="Name of epoch column",
        optional=True,
    )
    epoch_format = pexConfig.Field(
        dtype=str,
        doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
        optional=True,
    )
    epoch_scale = pexConfig.Field(
        dtype=str,
        doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
        optional=True,
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def setDefaults(self):
        # Newly ingested reference catalogs always have the latest format_version.
        self.dataset_config.format_version = LATEST_FORMAT_VERSION

    def validate(self):
        """Check cross-field consistency of the configured column names.

        Raises
        ------
        ValueError
            If required columns are missing, magnitude error keys do not
            match ``mag_column_list``, or dependent column groups (errors,
            proper motion, parallax, epoch) are only partially specified.
        """
        pexConfig.Config.validate(self)

        def assertAllOrNone(*names):
            """Raise ValueError unless all the named fields are set or are
            all none (or blank)
            """
            # A field counts as "set" if it is truthy (non-None and non-blank).
            setNames = [name for name in names if bool(getattr(self, name))]
            if len(setNames) in (len(names), 0):
                return
            prefix = "Both or neither" if len(names) == 2 else "All or none"
            raise ValueError("{} of {} must be set, but only {} are set".format(
                prefix, ", ".join(names), ", ".join(setNames)))

        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
        if self.mag_err_column_map and set(self.mag_column_list) != set(self.mag_err_column_map.keys()):
            raise ValueError(
                "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
                    sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))
        assertAllOrNone("ra_err_name", "dec_err_name")
        assertAllOrNone("epoch_name", "epoch_format", "epoch_scale")
        assertAllOrNone("pm_ra_name", "pm_dec_name")
        assertAllOrNone("pm_ra_err_name", "pm_dec_err_name")
        # Proper motion errors are meaningless without the proper motions themselves.
        if self.pm_ra_err_name and not self.pm_ra_name:
            raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
        # Proper motion and parallax are epoch-dependent quantities.
        if (self.pm_ra_name or self.parallax_name) and not self.epoch_name:
            raise ValueError(
                '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')
270 
271 
class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
    """Class for producing and loading indexed reference catalogs.

    This implements an indexing scheme based on hierarchical triangular
    mesh (HTM). The term index really means breaking the catalog into
    localized chunks called shards.  In this case each shard contains
    the entries from the catalog in a single HTM trixel

    For producing catalogs this task makes the following assumptions
    about the input catalogs:
    - RA, Dec, RA error and Dec error are all in decimal degrees.
    - Epoch is available in a column, in a format supported by astropy.time.Time.
    - There are no off-diagonal covariance terms, such as covariance
      between RA and Dec, or between PM RA and PM Dec. Gaia is a well
      known example of a catalog that has such terms, and thus should not
      be ingested with this task.

    Parameters
    ----------
    butler : `lsst.daf.persistence.Butler`
        Data butler for reading and writing catalogs
    """
    # Parallelism is handled internally via IngestIndexManager, not the task runner.
    canMultiprocess = False
    ConfigClass = IngestIndexedReferenceConfig
    RunnerClass = IngestReferenceRunner
    _DefaultName = 'IngestIndexedReferenceTask'

    @classmethod
    def _makeArgumentParser(cls):
        """Create an argument parser.

        This returns a standard parser with an extra "files" argument.
        """
        parser = pipeBase.InputOnlyArgumentParser(name=cls._DefaultName)
        parser.add_argument("files", nargs="+", help="Names of files to index")
        return parser

    def __init__(self, *args, butler=None, **kwargs):
        # The butler must be stored before super().__init__ so it is
        # available during task construction.
        self.butler = butler
        super().__init__(*args, **kwargs)
        # Instantiate the configured indexer (e.g. HTM) from the registry.
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def createIndexedCatalog(self, inputFiles):
        """Index a set of files comprising a reference catalog.

        Outputs are persisted in the butler repository.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read.
        """
        # The first input file defines the dtype used to build the schema.
        schema, key_map = self._saveMasterSchema(inputFiles[0])
        # create an HTM we can interrogate about pixel ids
        htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth())
        filenames = self._getButlerFilenames(htm)
        # IngestIndexManager does the actual sharded ingest (possibly in
        # parallel, per config.n_processes).
        worker = IngestIndexManager(filenames,
                                    self.config,
                                    self.file_reader,
                                    self.indexer,
                                    schema,
                                    key_map,
                                    htm.universe()[0],
                                    addRefCatMetadata,
                                    self.log)
        worker.run(inputFiles)

        # write the config that was used to generate the refcat
        dataId = self.indexer.makeDataId(None, self.config.dataset_config.ref_dataset_name)
        self.butler.put(self.config.dataset_config, 'ref_cat_config', dataId=dataId)

    def _saveMasterSchema(self, filename):
        """Generate and save the master catalog schema.

        Parameters
        ----------
        filename : `str`
            An input file to read to get the input dtype.

        Returns
        -------
        schema : `lsst.afw.table.Schema`
            The schema for the output source catalog.
        key_map : `dict`
            A map of catalog keys to use in filling the record.
        """
        arr = self.file_reader.run(filename)
        schema, key_map = self.makeSchema(arr.dtype)
        dataId = self.indexer.makeDataId('master_schema',
                                         self.config.dataset_config.ref_dataset_name)

        # Persist an empty catalog carrying the schema and format-version
        # metadata under the special 'master_schema' data ID.
        catalog = afwTable.SimpleCatalog(schema)
        addRefCatMetadata(catalog)
        self.butler.put(catalog, 'ref_cat', dataId=dataId)
        return schema, key_map

    def _getButlerFilenames(self, htm):
        """Get filenames from the butler for each output pixel.

        Parameters
        ----------
        htm : `lsst.sphgeom.HtmPixelization`
            Pixelization whose full index range defines the output shards.

        Returns
        -------
        filenames : `dict`
            Map of HTM pixel id to output file path.
        """
        filenames = {}
        start, end = htm.universe()[0]
        # path manipulation because butler.get() per pixel will take forever
        dataId = self.indexer.makeDataId(start, self.config.dataset_config.ref_dataset_name)
        path = self.butler.get('ref_cat_filename', dataId=dataId)[0]
        # Template: same directory/extension as the sample path, with the
        # pixel id substituted for the filename stem.
        base = os.path.join(os.path.dirname(path), "%d"+os.path.splitext(path)[1])
        for pixelId in range(start, end):
            filenames[pixelId] = base % pixelId

        return filenames

    def makeSchema(self, dtype):
        """Make the schema to use in constructing the persisted catalogs.

        Parameters
        ----------
        dtype : `numpy.dtype`
            Data type describing each entry in ``config.extra_col_names``
            for the catalogs being ingested.

        Returns
        -------
        schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
            A tuple containing two items:
            - The schema for the output source catalog.
            - A map of catalog keys to use in filling the record
        """
        # make a schema with the standard fields
        schema = LoadReferenceObjectsTask.makeMinimalSchema(
            filterNameList=self.config.mag_column_list,
            addCentroid=False,
            addIsPhotometric=bool(self.config.is_photometric_name),
            addIsResolved=bool(self.config.is_resolved_name),
            addIsVariable=bool(self.config.is_variable_name),
            # Error dimensions are 2 (RA, Dec) when errors are configured, else absent.
            coordErrDim=2 if bool(self.config.ra_err_name) else 0,
            addProperMotion=2 if bool(self.config.pm_ra_name) else 0,
            properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0,
            addParallax=bool(self.config.parallax_name),
            addParallaxErr=bool(self.config.parallax_err_name),
        )
        # These standard fields are filled by the table machinery or are
        # centroid-related fields we deliberately excluded above.
        keysToSkip = set(("id", "centroid_x", "centroid_y", "hasCentroid"))
        key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames()
                   if fieldName not in keysToSkip}

        def addField(name):
            # Map each extra numpy column onto an afw schema field.
            if dtype[name].kind == 'U':
                # dealing with a string like thing.  Need to get type and size.
                at_size = dtype[name].itemsize
                return schema.addField(name, type=str, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for col in self.config.extra_col_names:
            key_map[col] = addField(col)
        return schema, key_map