lsst.meas.algorithms  18.1.0-7-g89824ecc+3
ingestIndexReferenceTask.py
#
# LSST Data Management System
#
# Copyright 2008-2017 AURA/LSST.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program.  If not,
# see <https://www.lsstcorp.org/LegalNotices/>.
#

__all__ = ["IngestIndexedReferenceConfig", "IngestIndexedReferenceTask", "DatasetConfig"]

import os.path

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.geom
import lsst.sphgeom
import lsst.afw.table as afwTable
from lsst.daf.base import PropertyList
from .indexerRegistry import IndexerRegistry
from .readTextCatalogTask import ReadTextCatalogTask
from .loadReferenceObjects import LoadReferenceObjectsTask
from .ingestIndexManager import IngestIndexManager

# The most recent Indexed Reference Catalog on-disk format version.
LATEST_FORMAT_VERSION = 1


def addRefCatMetadata(catalog):
    """Add metadata to a new (not yet populated) reference catalog.

    Parameters
    ----------
    catalog : `lsst.afw.table.SimpleCatalog`
        Catalog to which metadata should be attached.  Will be modified
        in-place.
    """
    md = catalog.getMetadata()
    if md is None:
        md = PropertyList()
    md.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
    catalog.setMetadata(md)

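# A minimal usage sketch; the schema here is only illustrative, as any
# SimpleCatalog can be tagged this way:
#
#     schema = afwTable.SimpleTable.makeMinimalSchema()
#     catalog = afwTable.SimpleCatalog(schema)
#     addRefCatMetadata(catalog)
#     assert catalog.getMetadata().getScalar("REFCAT_FORMAT_VERSION") == 1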

class IngestReferenceRunner(pipeBase.TaskRunner):
    """Task runner for the reference catalog ingester.

    Data IDs are ignored, so the runner simply runs the task on the parsed
    command.
    """

    def run(self, parsedCmd):
        """Run the task.

        Several arguments need to be collected to send on to the task methods.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command.

        Returns
        -------
        results : `lsst.pipe.base.Struct` or `None`
            An empty struct if ``self.doReturnResults`` is `True`, else `None`.
        """
        files = parsedCmd.files
        butler = parsedCmd.butler
        task = self.TaskClass(config=self.config, log=self.log, butler=butler)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

        task.createIndexedCatalog(files)
        if self.doReturnResults:
            return pipeBase.Struct()

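# A sketch of how this runner is typically driven from the command line; the
# driver script name and paths below are illustrative, not prescriptive:
#
#     ingestReferenceCatalog.py /path/to/repo cat_shard1.txt cat_shard2.txt \
#         --configfile my_ingest_config.py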

class DatasetConfig(pexConfig.Config):
    """The description of the on-disk storage format for the persisted
    reference catalog.
    """
    format_version = pexConfig.Field(
        dtype=int,
        doc="Version number of the persisted on-disk storage format."
            "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
            "\nVersion 1 had nJy as flux units.",
        default=0  # This needs to always be 0, so that unversioned catalogs are interpreted as version 0.
    )
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        default='cal_ref_cat',
        doc='String to pass to the butler to retrieve persisted files.',
    )
    indexer = IndexerRegistry.makeField(
        default='HTM',
        doc='Name of indexer algorithm to use.  Default is HTM',
    )

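# A sketch of overriding DatasetConfig fields in a config file; the values,
# and the HTM indexer's ``depth`` field, are illustrative assumptions:
#
#     config.dataset_config.ref_dataset_name = "my_ref_cat"
#     config.dataset_config.indexer.name = "HTM"
#     config.dataset_config.indexer["HTM"].depth = 7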

class IngestIndexedReferenceConfig(pexConfig.Config):
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    n_processes = pexConfig.Field(
        dtype=int,
        doc=("Number of Python processes to use when ingesting."),
        default=1
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files.  Default is to expect text files.'
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column",
    )
    ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA error column",
        optional=True,
    )
    dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec error column",
        optional=True,
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information.  At least one entry is required."
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    pm_ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA column",
        optional=True,
    )
    pm_dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec column",
        optional=True,
    )
    pm_ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA error column",
        optional=True,
    )
    pm_dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec error column",
        optional=True,
    )
    pm_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
        default=1.0,
    )
    parallax_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax column",
        optional=True,
    )
    parallax_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax error column",
        optional=True,
    )
    parallax_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
        default=1.0,
    )
    epoch_name = pexConfig.Field(
        dtype=str,
        doc="Name of epoch column",
        optional=True,
    )
    epoch_format = pexConfig.Field(
        dtype=str,
        doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
        optional=True,
    )
    epoch_scale = pexConfig.Field(
        dtype=str,
        doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
        optional=True,
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def setDefaults(self):
        # Newly ingested reference catalogs always have the latest format_version.
        self.dataset_config.format_version = LATEST_FORMAT_VERSION

    def validate(self):
        pexConfig.Config.validate(self)

        def assertAllOrNone(*names):
            """Raise ValueError unless all the named fields are set or all
            are unset (None or blank).
            """
            setNames = [name for name in names if bool(getattr(self, name))]
            if len(setNames) in (len(names), 0):
                return
            prefix = "Both or neither" if len(names) == 2 else "All or none"
            raise ValueError("{} of {} must be set, but only {} are set".format(
                prefix, ", ".join(names), ", ".join(setNames)))

        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
        if self.mag_err_column_map and set(self.mag_column_list) != set(self.mag_err_column_map.keys()):
            raise ValueError(
                "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
                    sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))
        assertAllOrNone("ra_err_name", "dec_err_name")
        assertAllOrNone("epoch_name", "epoch_format", "epoch_scale")
        assertAllOrNone("pm_ra_name", "pm_dec_name")
        assertAllOrNone("pm_ra_err_name", "pm_dec_err_name")
        assertAllOrNone("parallax_name", "parallax_err_name")
        if self.pm_ra_err_name and not self.pm_ra_name:
            raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
        if (self.pm_ra_name or self.parallax_name) and not self.epoch_name:
            raise ValueError(
                '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')

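# A sketch of a complete override file for this task; every column name is
# hypothetical and must match the catalog actually being ingested:
#
#     config.ra_name = "ra"
#     config.dec_name = "dec"
#     config.mag_column_list = ["g", "r"]
#     config.mag_err_column_map = {"g": "g_err", "r": "r_err"}
#     config.id_name = "obj_id"
#     config.dataset_config.ref_dataset_name = "my_ref_cat"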

class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
    """Class for producing and loading indexed reference catalogs.

    This implements an indexing scheme based on the hierarchical triangular
    mesh (HTM).  The term "index" really means breaking the catalog into
    localized chunks called shards.  In this case each shard contains the
    entries from the catalog in a single HTM trixel.

    For producing catalogs this task makes the following assumptions about
    the input catalogs:

    - RA, Dec, RA error and Dec error are all in decimal degrees.
    - Epoch is available in a column, in a format supported by astropy.time.Time.
    - There are no off-diagonal covariance terms, such as covariance
      between RA and Dec, or between PM RA and PM Dec.  Gaia is a well-known
      example of a catalog that has such terms, and thus should not be
      ingested with this task.

    Parameters
    ----------
    butler : `lsst.daf.persistence.Butler`
        Data butler for reading and writing catalogs.
    """
    canMultiprocess = False
    ConfigClass = IngestIndexedReferenceConfig
    RunnerClass = IngestReferenceRunner
    _DefaultName = 'IngestIndexedReferenceTask'

    @classmethod
    def _makeArgumentParser(cls):
        """Create an argument parser.

        This returns a standard parser with an extra "files" argument.
        """
        parser = pipeBase.InputOnlyArgumentParser(name=cls._DefaultName)
        parser.add_argument("files", nargs="+", help="Names of files to index")
        return parser

    def __init__(self, *args, butler=None, **kwargs):
        self.butler = butler
        super().__init__(*args, **kwargs)
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')
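        # ``IndexerRegistry[name](active)`` is the pex_config registry
        # pattern: the registry maps the selected name to the indexer class,
        # and ``indexer.active`` is the config instance for that selection.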

    def createIndexedCatalog(self, inputFiles):
        """Index a set of files comprising a reference catalog.

        Outputs are persisted in the butler repository.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read.
        """
        schema, key_map = self._saveMasterSchema(inputFiles[0])
        # create an HTM we can interrogate about pixel ids
        htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth())
        filenames = self._getButlerFilenames(htm)
        worker = IngestIndexManager(filenames,
                                    self.config,
                                    self.file_reader,
                                    self.indexer,
                                    schema,
                                    key_map,
                                    htm.universe()[0],
                                    addRefCatMetadata,
                                    self.log)
        worker.run(inputFiles)

        # write the config that was used to generate the refcat
        dataId = self.indexer.makeDataId(None, self.config.dataset_config.ref_dataset_name)
        self.butler.put(self.config.dataset_config, 'ref_cat_config', dataId=dataId)

    def _saveMasterSchema(self, filename):
        """Generate and save the master catalog schema.

        Parameters
        ----------
        filename : `str`
            An input file to read to get the input dtype.

        Returns
        -------
        schema : `lsst.afw.table.Schema`
            The schema of the output catalogs.
        key_map : `dict`
            A map of catalog keys to use in filling the record.
        """
        arr = self.file_reader.run(filename)
        schema, key_map = self.makeSchema(arr.dtype)
        dataId = self.indexer.makeDataId('master_schema',
                                         self.config.dataset_config.ref_dataset_name)

        catalog = afwTable.SimpleCatalog(schema)
        addRefCatMetadata(catalog)
        self.butler.put(catalog, 'ref_cat', dataId=dataId)
        return schema, key_map

    def _getButlerFilenames(self, htm):
        """Get filenames from the butler for each output pixel."""
        filenames = {}
        start, end = htm.universe()[0]
        # path manipulation because butler.get() per pixel will take forever
        dataId = self.indexer.makeDataId(start, self.config.dataset_config.ref_dataset_name)
        path = self.butler.get('ref_cat_filename', dataId=dataId)[0]
        base = os.path.join(os.path.dirname(path), "%d"+os.path.splitext(path)[1])
        for pixelId in range(start, end):
            filenames[pixelId] = base % pixelId

        return filenames

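    # A worked sketch of the path templating above (paths illustrative): if
    # the butler reports ".../ref_cats/cal_ref_cat/131072.fits" for the first
    # pixel, ``base`` becomes ".../ref_cats/cal_ref_cat/%d.fits", so pixel
    # 131073 maps to ".../ref_cats/cal_ref_cat/131073.fits".  At HTM depth 7,
    # ``htm.universe()[0]`` is the id range [8*4**7, 16*4**7) = [131072, 262144).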
    def makeSchema(self, dtype):
        """Make the schema to use in constructing the persisted catalogs.

        Parameters
        ----------
        dtype : `numpy.dtype`
            Data type describing each entry in ``config.extra_col_names``
            for the catalogs being ingested.

        Returns
        -------
        schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
            A tuple containing two items:
            - The schema for the output source catalog.
            - A map of catalog keys to use in filling the record.
        """
        # make a schema with the standard fields
        schema = LoadReferenceObjectsTask.makeMinimalSchema(
            filterNameList=self.config.mag_column_list,
            addCentroid=False,
            addIsPhotometric=bool(self.config.is_photometric_name),
            addIsResolved=bool(self.config.is_resolved_name),
            addIsVariable=bool(self.config.is_variable_name),
            coordErrDim=2 if bool(self.config.ra_err_name) else 0,
            addProperMotion=2 if bool(self.config.pm_ra_name) else 0,
            properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0,
            addParallax=bool(self.config.parallax_name),
        )
        keysToSkip = set(("id", "centroid_x", "centroid_y", "hasCentroid"))
        key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames()
                   if fieldName not in keysToSkip}

        def addField(name):
            if dtype[name].kind == 'U':
                # dealing with a string-like field; need both its type and size
                at_size = dtype[name].itemsize
                return schema.addField(name, type=str, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for col in self.config.extra_col_names:
            key_map[col] = addField(col)
        return schema, key_map
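    # A minimal sketch of how extra columns flow through makeSchema; the
    # dtype, column names, and ``task`` construction are illustrative:
    #
    #     import numpy as np
    #     dtype = np.dtype([("ra", "f8"), ("dec", "f8"), ("g", "f8"),
    #                       ("survey", "U10")])
    #     # with config.extra_col_names = ["survey"], the "survey" column
    #     # becomes a string field of size 10 alongside the standard fields
    #     schema, key_map = task.makeSchema(dtype)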