lsst.meas.algorithms ga26ab52c63+19811520c9
Loading...
Searching...
No Matches
convertRefcatManager.py
Go to the documentation of this file.
1# This file is part of meas_algorithms.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
21
22__all__ = ["ConvertRefcatManager", "ConvertGaiaManager", "ConvertGaiaXpManager"]
23
24from ctypes import c_int
25import os.path
26import itertools
27import multiprocessing
28import time
29
30import astropy.time
31import astropy.units as u
32import numpy as np
33
34import lsst.sphgeom
35import lsst.afw.table as afwTable
36from lsst.afw.image import fluxErrFromABMagErr
37import lsst.pex.config as pexConfig
38
39
# Running source-id counter, shared between the worker processes.
# (A module-level global is the simplest way to share it with a Pool.)
COUNTER = multiprocessing.Value(c_int, 0)
# Number of input files processed so far, shared between the worker processes.
FILE_PROGRESS = multiprocessing.Value(c_int, 0)
45
46
class ConvertRefcatManagerConfig(pexConfig.Config):
    """Empty config, present only to satisfy ConfigurableField validation.

    The refcat conversion itself is configured by the parent convert Task.
    """
52
53
55 """
56 Convert a reference catalog from external files into the LSST HTM sharded
57 format, using a multiprocessing Pool to speed up the work.
58
59 Parameters
60 ----------
61 filenames : `dict` [`int`, `str`]
62 The HTM pixel id and filenames to convert the catalog into.
63 config : `ConvertReferenceCatalogConfig`
64 The Task configuration holding the field names.
65 file_reader : `lsst.pipe.base.Task`
66 The file reader to use to load the files.
67 indexer : object with an ``indexPoints(ra, dec)`` method
68 The class used to compute the HTM pixel per coordinate.
69 schema : `lsst.afw.table.Schema`
70 The schema of the output catalog.
71 key_map : `dict` [`str`, `lsst.afw.table.Key`]
72 The mapping from output field names to keys in the Schema.
73 htmRange : `tuple` [`int`]
74 The start and end HTM pixel ids.
75 addRefCatMetadata : callable
76 A function called to add extra metadata to each output Catalog.
77 log : `lsst.log.Log` or `logging.Logger`
78 The log to send messages to.
79 """
80 _flags = ['photometric', 'resolved', 'variable']
81 _DefaultName = 'convertRefcatManager'
82 ConfigClass = ConvertRefcatManagerConfig
83
def __init__(self, filenames, config, file_reader, indexer,
             schema, key_map, htmRange, addRefCatMetadata, log):
    """Store the conversion inputs and cache the coordinate error unit.

    Parameters
    ----------
    filenames : `dict` [`int`, `str`]
        Output filename for each HTM pixel id.
    config
        The Task configuration holding the field names.
    file_reader
        Object whose ``run(filename)`` loads one input file.
    indexer
        Object whose ``indexPoints(ra, dec)`` returns the pixel per row.
    schema : `lsst.afw.table.Schema`
        The schema of the output catalog.
    key_map : `dict`
        Mapping from output field names to keys in the schema.
    htmRange : `tuple` [`int`]
        The start and end HTM pixel ids.
    addRefCatMetadata : callable
        Called on each newly created output catalog.
    log
        The log to send messages to.
    """
    self.log = log
    self.addRefCatMetadata = addRefCatMetadata
    self.htmRange = htmRange
    self.key_map = key_map
    self.schema = schema
    self.indexer = indexer
    self.file_reader = file_reader
    self.config = config
    self.filenames = filenames

    # Cache the parsed unit to speed up coordinate conversions. The
    # attribute is deliberately left unset when no unit is configured,
    # because `_getCoordErr` tests for its presence with `hasattr`.
    unitName = config.coord_err_unit
    if unitName is not None:
        self.coord_err_unit = u.Unit(unitName)
99
def run(self, inputFiles):
    """Convert a set of input files in parallel and write the output
    catalogs to their per-pixel filenames.

    Parameters
    ----------
    inputFiles : `list`
        A list of file paths to read data from.

    Returns
    -------
    output : `dict` [`int`, `str`]
        The htm ids and the filenames that were written to.
    """
    self.nInputFiles = len(inputFiles)

    with multiprocessing.Manager() as manager:
        # Start both shared counters from zero for this run.
        COUNTER.value = 0
        FILE_PROGRESS.value = 0

        # One managed lock per HTM pixel, so that only one worker at a
        # time touches a given output file.
        fileLocks = manager.dict()
        firstPixel, lastPixel = self.htmRange[0], self.htmRange[1]
        self.log.info("Creating %s file locks.", lastPixel - firstPixel)
        for pixelId in range(firstPixel, lastPixel):
            fileLocks[pixelId] = manager.Lock()
        self.log.info("File locks created.")

        tic = time.perf_counter()
        jobs = zip(inputFiles, itertools.repeat(fileLocks))
        with multiprocessing.Pool(self.config.n_processes) as pool:
            written = pool.starmap(self._convertOneFile, jobs)
        toc = time.perf_counter()
        self.log.info("Finished writing files. Elapsed time: %.2f seconds", toc - tic)

        # Merge the per-file pixel ids into one {pixel id: filename} dict.
        return {pixelId: self.filenames[pixelId] for pixels in written for pixelId in pixels}
133
def _convertOneFile(self, filename, fileLocks):
    """Read one input file and write its rows to the per-pixel output
    files, then update the shared progress counter.

    Exceptions are not caught here; they propagate to the caller.

    Parameters
    ----------
    filename : `str`
        The file to process.
    fileLocks : `dict` [`int`, `multiprocessing.Lock`]
        A Lock for each HTM pixel; each pixel gets one file written, and
        the lock is held while this process works on that file.

    Returns
    -------
    pixel_ids : `set`
        The distinct pixel ids that were written to.
    """
    inputData = self.file_reader.run(filename)
    fluxes = self._getFluxes(inputData)
    coordErr = self._getCoordErr(inputData)
    ra = inputData[self.config.ra_name]
    dec = inputData[self.config.dec_name]
    matchedPixels = self.indexer.indexPoints(ra, dec)

    pixel_ids = set(matchedPixels)
    for pixelId in pixel_ids:
        with fileLocks[pixelId]:
            self._doOnePixel(inputData, matchedPixels, pixelId, fluxes, coordErr)

    nFiles = self.nInputFiles
    with FILE_PROGRESS.get_lock():
        before = 100 * FILE_PROGRESS.value / nFiles
        FILE_PROGRESS.value += 1
        after = 100 * FILE_PROGRESS.value / nFiles
        # Log only when the integer percentage has advanced.
        if np.floor(after) - np.floor(before) >= 1:
            self.log.info("Completed %d / %d files: %d %% complete ",
                          FILE_PROGRESS.value,
                          nFiles,
                          after)
    return pixel_ids
173
def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes, coordErr):
    """Append the rows of one input file that fall in one HTM pixel to
    that pixel's catalog (created if needed) and write it to disk.

    Parameters
    ----------
    inputData : `numpy.ndarray`
        The data from one input file.
    matchedPixels : `numpy.ndarray`
        The row-matched pixel indexes corresponding to ``inputData``.
    pixelId : `int`
        The pixel index we are currently processing.
    fluxes : `dict` [`str`, `numpy.ndarray`]
        Pre-computed flux and fluxErr values, keyed by output field name.
    coordErr : `dict` [`str`, `numpy.ndarray`]
        Pre-computed coordinate error values, keyed by output column name.
    """
    rows = np.where(matchedPixels == pixelId)[0]
    nNew = len(rows)
    pixelData = inputData[rows]

    # The new records are the last ``nNew`` entries of the catalog.
    catalog = self.getCatalog(pixelId, self.schema, nNew)
    for record, row in zip(catalog[-nNew:], pixelData):
        self._fillRecord(record, row)

    # Ids may come from the shared running counter, so hold its lock.
    with COUNTER.get_lock():
        self._setIds(pixelData, catalog)

    # Column-wise assignment from the pre-computed arrays.
    for name, values in fluxes.items():
        catalog[self.key_map[name]][-nNew:] = values[rows]
    for name, values in coordErr.items():
        catalog[name][-nNew:] = values[rows]

    catalog.writeFits(self.filenames[pixelId])
211
212 def _setIds(self, inputData, catalog):
213 """Fill the `id` field of catalog with a running index, filling the
214 last values up to the length of ``inputData``.
215
216 Fill with `self.config.id_name` if specified, otherwise use the
217 global running counter value.
218
219 Parameters
220 ----------
221 inputData : `numpy.ndarray`
222 The input data that is being processed.
224 The output catalog to fill the ids.
225 """
226 global COUNTER
227 size = len(inputData)
228 if self.config.id_name:
229 catalog['id'][-size:] = inputData[self.config.id_name]
230 else:
231 idEnd = COUNTER.value + size
232 catalog['id'][-size:] = np.arange(COUNTER.value, idEnd)
233 COUNTER.value = idEnd
234
def getCatalog(self, pixelId, schema, nNewElements):
    """Return the catalog for one pixel, with room for new elements.

    An existing file is read and enlarged; otherwise a new catalog is
    created, sized, and passed to ``addRefCatMetadata``.

    Parameters
    ----------
    pixelId : `int`
        Identifier of the pixel whose catalog to retrieve.
    schema : `lsst.afw.table.Schema`
        Schema to use in catalog creation if it does not exist.
    nNewElements : `int`
        The number of new elements that will be added to the catalog,
        so space can be preallocated.

    Returns
    -------
    catalog : `lsst.afw.table.SimpleCatalog`
        The new or read-and-resized catalog for ``pixelId``.
    """
    path = self.filenames[pixelId]
    # No extra locking needed: `_convertOneFile` holds this pixel's file
    # lock around the call chain that reaches this method.
    if not os.path.isfile(path):
        created = afwTable.SimpleCatalog(schema)
        created.resize(nNewElements)
        self.addRefCatMetadata(created)
        return created

    existing = afwTable.SimpleCatalog.readFits(path)
    existing.resize(len(existing) + nNewElements)
    # Deep copy to ensure contiguity, so that column-assignment works.
    return existing.copy(deep=True)
262
@staticmethod
def computeCoord(row, ra_name, dec_name):
    """Build an ICRS coordinate from one row of the input catalog.

    Parameters
    ----------
    row : `numpy.ndarray`
        Row from catalog being converted.
    ra_name : `str`
        Name of RA key in catalog being converted.
    dec_name : `str`
        Name of Dec key in catalog being converted.

    Returns
    -------
    coord : `lsst.geom.SpherePoint`
        ICRS coordinate; the input values are interpreted as degrees.
    """
    ra = row[ra_name]
    dec = row[dec_name]
    return lsst.geom.SpherePoint(ra, dec, lsst.geom.degrees)
282
283 def _getCoordErr(self, inputData, ):
284 """Compute the ra/dec error fields that will go into the output catalog.
285
286 Parameters
287 ----------
288 inputData : `numpy.ndarray`
289 The input data to compute fluxes for.
290
291 Returns
292 -------
293 coordErr : `dict` [`str`, `numpy.ndarray`]
294 The values that will go into the coord_raErr, coord_decErr, fields
295 in the output catalog (in radians).
296
297 Notes
298 -----
299 This does not handle the ra/dec covariance field,
300 ``coord_ra_coord_dec_Cov``. That field is handled in
301 `_setCoordinateCovariance`.
302 """
303 result = {}
304 if hasattr(self, "coord_err_unit"):
305 result['coord_raErr'] = u.Quantity(inputData[self.config.ra_err_name],
306 self.coord_err_unit).to_value(u.radian)
307 result['coord_decErr'] = u.Quantity(inputData[self.config.dec_err_name],
308 self.coord_err_unit).to_value(u.radian)
309 return result
310
311 def _setFlags(self, record, row):
312 """Set flags in an output record.
313
314 Parameters
315 ----------
317 Row from indexed catalog to modify.
318 row : `numpy.ndarray`
319 Row from catalog being converted.
320 """
321 names = record.schema.getNames()
322 for flag in self._flags:
323 if flag in names:
324 attr_name = 'is_{}_name'.format(flag)
325 record.set(self.key_map[flag], bool(row[getattr(self.config, attr_name)]))
326
def _getFluxes(self, inputData):
    """Compute the flux fields that will go into the output catalog.

    Each column in ``config.mag_column_list`` is converted from AB
    magnitude to nJy; each entry of ``config.mag_err_column_map`` yields
    a matching flux error.

    Parameters
    ----------
    inputData : `numpy.ndarray`
        The input data to compute fluxes for.

    Returns
    -------
    fluxes : `dict` [`str`, `numpy.ndarray`]
        The values that will go into the flux and fluxErr fields in the
        output catalog.
    """
    fluxes = {
        band + '_flux': (inputData[band]*u.ABmag).to_value(u.nJy)
        for band in self.config.mag_column_list
    }
    for magName, magErrName in self.config.mag_err_column_map.items():
        # NOTE: copy the arrays because the numpy strides may not be useable by C++.
        magErr = inputData[magErrName].copy()
        mag = inputData[magName].copy()
        # TODO: multiply by 1e9 here until we have a replacement (see DM-16903)
        fluxes[magName + '_fluxErr'] = fluxErrFromABMagErr(magErr, mag)*1e9
    return fluxes
353
def _setProperMotion(self, record, row):
    """Set proper motion fields in a record of an indexed catalog.

    The proper motions (and, when configured, their errors) are read
    from the configured input columns, scaled to radians, and stored in
    the output record together with the epoch as TAI MJD. Nothing is
    done when no proper motion column is configured.

    Parameters
    ----------
    record
        Row from indexed catalog to modify.
    row : structured `numpy.array`
        Row from catalog being converted.
    """
    cfg = self.config
    if cfg.pm_ra_name is None:  # ConvertReferenceCatalogConfig.validate ensures all or none
        return

    # Radians per input unit: pm_scale is treated as milliarcseconds
    # (1 degree / 3.6e6) per input unit.
    radPerOriginal = np.radians(cfg.pm_scale)/(3600*1000)

    for field, column in (("pm_ra", cfg.pm_ra_name), ("pm_dec", cfg.pm_dec_name)):
        record.set(self.key_map[field], row[column]*radPerOriginal*lsst.geom.radians)
    record.set(self.key_map["epoch"], self._epochToMjdTai(row[cfg.epoch_name]))

    if cfg.pm_ra_err_name is None:  # pm_dec_err_name also, by validation
        return
    for field, column in (("pm_raErr", cfg.pm_ra_err_name), ("pm_decErr", cfg.pm_dec_err_name)):
        record.set(self.key_map[field], row[column]*radPerOriginal)
377
def _setParallax(self, record, row):
    """Set the parallax and parallax error fields in a record of a refcat.

    Nothing is done when no parallax column is configured.

    Parameters
    ----------
    record
        Row from indexed catalog to modify.
    row : structured `numpy.array`
        Row from catalog being converted.
    """
    cfg = self.config
    if cfg.parallax_name is None:
        return
    # Input values times ``parallax_scale`` are in milliarcseconds.
    scale = cfg.parallax_scale*lsst.geom.milliarcseconds
    for field, column in (('parallax', cfg.parallax_name),
                          ('parallaxErr', cfg.parallax_err_name)):
        record.set(self.key_map[field], row[column]*scale)
386
def _epochToMjdTai(self, nativeEpoch):
    """Convert an epoch in native format to TAI MJD (a float).

    The native format and time scale come from ``config.epoch_format``
    and ``config.epoch_scale``.
    """
    epoch = astropy.time.Time(
        nativeEpoch,
        format=self.config.epoch_format,
        scale=self.config.epoch_scale,
    )
    return epoch.tai.mjd
392
393 def _setCoordinateCovariance(self, record, row):
394 """Set the off-diagonal position covariance in a record of an indexed
395 catalog.
396
397 There is no generic way to determine covariance. Override this method
398 in a subclass specialized for your dataset.
399
400 Parameters
401 ----------
403 Row from indexed catalog to modify.
404 row : structured `numpy.array`
405 Row from catalog being converted.
406 """
407 raise NotImplementedError("There is no default method for setting the covariance. Override this "
408 "method in a subclass specialized for your dataset.")
409
410 def _setExtra(self, record, row):
411 """Set extra data fields in a record of an indexed catalog.
412
413 Parameters
414 ----------
416 Row from indexed catalog to modify.
417 row : structured `numpy.array`
418 Row from catalog being converted.
419 """
420 for extra_col in self.config.extra_col_names:
421 value = row[extra_col]
422 # If data read from a text file contains string like entires,
423 # numpy stores this as its own internal type, a numpy.str_
424 # object. This seems to be a consequence of how numpy stores
425 # string like objects in fixed column arrays. This checks
426 # if any of the values to be added to the catalog are numpy
427 # string types, and if they are, casts them to a python string
428 # which is what the python c++ records expect
429 if isinstance(value, np.str_):
430 value = str(value)
431 record.set(self.key_map[extra_col], value)
432
433 def _fillRecord(self, record, row):
434 """Fill a record in an indexed catalog to be persisted.
435
436 Parameters
437 ----------
439 Row from indexed catalog to modify.
440 row : structured `numpy.array`
441 Row from catalog being converted.
442 """
443 record.setCoord(self.computeCoord(row, self.config.ra_name, self.config.dec_name))
444
445 self._setFlags(record, row)
446 if self.config.full_position_information:
447 self._setProperMotion(record, row)
448 self._setParallax(record, row)
449 self._setCoordinateCovariance(record, row)
450 self._setExtra(record, row)
451
452
454 """Special-case convert manager to deal with Gaia fluxes.
455 """
def __init__(self, *args, **kwargs):
    """Construct the base manager, then cache the units used when
    converting the Gaia correlations into covariances.
    """
    super().__init__(*args, **kwargs)
    mas = u.milliarcsecond
    # Units of the input proper motion and parallax error columns.
    self.properMotionUnit = self.config.pm_scale * mas
    self.parallaxUnit = self.config.parallax_scale * mas
    # Unit the covariances are converted to before being stored.
    self.outputUnit = u.radian * u.radian
461
def _getFluxes(self, input):
    """Compute the Gaia G, BP and RP fluxes and flux errors in nJy.

    Parameters
    ----------
    input : `numpy.ndarray`
        The input data to compute fluxes for.

    Returns
    -------
    fluxes : `dict` [`str`, `numpy.ndarray`]
        The values that will go into the flux and fluxErr fields in the
        output catalog.
    """
    # Zero points from table 5.3 in this document;
    # https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
    zeroPoints = {
        'phot_g_mean_flux': 25.7934,
        'phot_bp_mean_flux': 25.3806,
        'phot_rp_mean_flux': 25.1161,
    }

    def gaiaFluxToFlux(flux, zeroPoint):
        """Equations 5.19 and 5.30 from the Gaia calibration document define the
        conversion from Gaia electron/second fluxes to AB magnitudes.
        https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
        """
        converted = ((zeroPoint + -2.5 * np.log10(flux))*u.ABmag).to_value(u.nJy)
        # set 0 instrumental fluxes to 0 (instead of NaN/inf from the math)
        converted[flux == 0] = 0
        return converted

    result = {}
    # Some fluxes are 0, so log10(flux) can give warnings. The zeros are
    # handled explicitly, so those warnings are irrelevant.
    with np.errstate(invalid='ignore', divide='ignore'):
        for name, zeroPoint in zeroPoints.items():
            result[name] = gaiaFluxToFlux(input[name], zeroPoint)

    # Error = flux / (flux over error), per band.
    for name in zeroPoints:
        result[name + 'Err'] = result[name] / input[name + '_over_error']

    return result
489
490 def _setCoordinateCovariance(self, record, row):
491 """Set the off-diagonal position covariance in a record of an indexed
492 catalog.
493
494 Convert the Gaia coordinate correlations into covariances.
495
496 Parameters
497 ----------
499 Row from indexed catalog to modify.
500 row : structured `numpy.array`
501 Row from catalog being converted.
502 """
503 inputParams = ['ra', 'dec', 'parallax', 'pmra', 'pmdec']
504 outputParams = ['coord_ra', 'coord_dec', 'parallax', 'pm_ra', 'pm_dec']
505 # The Gaia standard for naming is to order the parameters as
506 # (coordinates, parallax, proper motion), so they need to be reordered
507 # as (coordinates, proper motion, parallax) to match the order used
508 # in LSST code (i.g. 'coord_parallax_pm_ra_Cov' becomes
509 # 'coord_pm_ra_parallax_Cov').
510 reorder = [0, 1, 4, 2, 3]
511
512 inputUnits = [self.coord_err_unit, self.coord_err_unit, self.parallaxUnit, self.properMotionUnit,
513 self.properMotionUnit]
514
515 for i in range(5):
516 for j in range(i):
517 j_error = row[f'{inputParams[j]}_error'] * inputUnits[j]
518 i_error = row[f'{inputParams[i]}_error'] * inputUnits[i]
519 ij_corr = row[f'{inputParams[j]}_{inputParams[i]}_corr']
520 cov = (i_error * j_error * ij_corr).to_value(self.outputUnit)
521
522 # Switch from order of Gaia parallax and proper motion
523 # parameters to the desired schema:
524 a = (i if (reorder[i] < reorder[j]) else j)
525 b = (j if (reorder[i] < reorder[j]) else i)
526
527 record.set(self.key_map[f'{outputParams[a]}_{outputParams[b]}_Cov'], cov)
528
529
531 """Special-case convert manager for Gaia XP spectrophotometry catalogs,
532 that have fluxes/flux errors, instead of magnitudes/mag errors. The input
533 flux and error values are in units of W/Hz/(m^2) (Gaia Collaboration, Montegriffo et al. 2022).
534 The flux and fluxErr fields in the output catalog have units of nJy.
535 """
536
def _getFluxes(self, inputData):
    """Convert the Gaia XP flux and flux error columns to nJy.

    Each column in ``config.mag_column_list`` is treated as a flux in
    W/Hz/m^2; its error column name is obtained by replacing ``_flux_``
    with ``_flux_error_``.

    Parameters
    ----------
    inputData : `numpy.ndarray`
        The input data to compute fluxes for.

    Returns
    -------
    fluxes : `dict` [`str`, `numpy.ndarray`]
        The values that will go into the flux and fluxErr fields in the
        output catalog.
    """
    def toNanoJansky(values):
        # Input values are in W/Hz/m^2.
        return (values * u.Watt / u.Hz / u.meter / u.meter).to_value(u.nJy)

    fluxes = {}
    for fluxName in self.config.mag_column_list:
        errName = fluxName.replace("_flux_", "_flux_error_")
        fluxes[fluxName + "_flux"] = toNanoJansky(inputData[fluxName])
        fluxes[fluxName + "_fluxErr"] = toNanoJansky(inputData[errName])
    return fluxes
def __init__(self, filenames, config, file_reader, indexer, schema, key_map, htmRange, addRefCatMetadata, log)
def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes, coordErr)