Coverage for python/lsst/meas/algorithms/convertRefcatManager.py: 17%

194 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-15 10:20 +0000

1# This file is part of meas_algorithms. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22__all__ = ["ConvertRefcatManager", "ConvertGaiaManager", "ConvertGaiaXpManager"] 

23 

24from ctypes import c_int 

25import os.path 

26import itertools 

27import multiprocessing 

28import time 

29 

30import astropy.time 

31import astropy.units as u 

32import numpy as np 

33 

34import lsst.sphgeom 

35import lsst.afw.table as afwTable 

36from lsst.afw.image import fluxErrFromABMagErr 

37import lsst.pex.config as pexConfig 

38 

39 

# Module-level shared state. Multiprocess sharing is most easily done with
# globals. Each value is created with its lock (``lock=True`` is the default,
# spelled out here because the code in this module relies on ``.get_lock()``).

# Running counter used to hand out source ids.
COUNTER = multiprocessing.Value(c_int, 0, lock=True)
# Number of input files processed so far, used for progress logging.
FILE_PROGRESS = multiprocessing.Value(c_int, 0, lock=True)

45 

46 

class ConvertRefcatManagerConfig(pexConfig.Config):
    """Empty config that exists only so ConfigurableField validation has a
    config class to work with.

    It declares no fields; the refcat conversion itself is configured by the
    parent convert Task.
    """

52 

53 

class ConvertRefcatManager:
    """Convert external reference catalog files into the LSST HTM sharded
    format, distributing the per-file work over a multiprocessing Pool.

    Parameters
    ----------
    filenames : `dict` [`int`, `str`]
        The HTM pixel id and filenames to convert the catalog into.
    config : `lsst.meas.algorithms.ConvertReferenceCatalogConfig`
        The Task configuration holding the field names.
    file_reader : `lsst.pipe.base.Task`
        The file reader to use to load the files.
    indexer : `lsst.meas.algorithms.HtmIndexer`
        The class used to compute the HTM pixel per coordinate.
    schema : `lsst.afw.table.Schema`
        The schema of the output catalog.
    key_map : `dict` [`str`, `lsst.afw.table.Key`]
        The mapping from output field names to keys in the Schema.
    htmRange : `tuple` [`int`]
        The start and end HTM pixel ids.
    addRefCatMetadata : callable
        A function called to add extra metadata to each output Catalog.
    log : `lsst.log.Log` or `logging.Logger`
        The log to send messages to.
    """
    _flags = ['photometric', 'resolved', 'variable']
    _DefaultName = 'convertRefcatManager'
    ConfigClass = ConvertRefcatManagerConfig

    def __init__(self, filenames, config, file_reader, indexer,
                 schema, key_map, htmRange, addRefCatMetadata, log):
        self.log = log
        self.config = config
        self.filenames = filenames
        self.file_reader = file_reader
        self.indexer = indexer
        self.schema = schema
        self.key_map = key_map
        self.htmRange = htmRange
        self.addRefCatMetadata = addRefCatMetadata

        # Cache the parsed unit to speed up coordinate conversions. The
        # attribute is deliberately left undefined when no unit is
        # configured; `_getCoordErr` tests for its presence.
        if config.coord_err_unit is not None:
            self.coord_err_unit = u.Unit(config.coord_err_unit)

    def run(self, inputFiles):
        """Convert a set of input files in parallel, writing each record to
        the output file of the HTM pixel it falls in.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read data from.

        Returns
        -------
        output : `dict` [`int`, `str`]
            The htm ids and the filenames that were written to.
        """
        self.nInputFiles = len(inputFiles)
        firstPixel = self.htmRange[0]
        endPixel = self.htmRange[1]

        with multiprocessing.Manager() as manager:
            # Reset the module-level shared counters before any worker starts.
            COUNTER.value = 0
            FILE_PROGRESS.value = 0

            # One lock per HTM pixel in [firstPixel, endPixel).
            fileLocks = manager.dict()
            self.log.info("Creating %s file locks.", endPixel - firstPixel)
            for pixel in range(firstPixel, endPixel):
                fileLocks[pixel] = manager.Lock()
            self.log.info("File locks created.")

            jobs = zip(inputFiles, itertools.repeat(fileLocks))
            start_time = time.perf_counter()
            with multiprocessing.Pool(self.config.n_processes) as pool:
                result = pool.starmap(self._convertOneFile, jobs)
            elapsed = time.perf_counter() - start_time
            self.log.info("Finished writing files. Elapsed time: %.2f seconds", elapsed)

        # Merge the per-file pixel id collections into one id -> filename map.
        written = {}
        for pixels in result:
            for pixel in pixels:
                written[pixel] = self.filenames[pixel]
        return written

    def _convertOneFile(self, filename, fileLocks):
        """Read one input file, write its records to the matching per-pixel
        output files, and update the shared progress counter.

        Parameters
        ----------
        filename : `str`
            The file to process.
        fileLocks : `dict` [`int`, `multiprocessing.Lock`]
            A Lock for each HTM pixel; each pixel gets one file written, and
            we need to block when one process is accessing that file.

        Returns
        -------
        pixel_ids : `set`
            The distinct pixel ids that were written to.
        """
        inputData = self.file_reader.run(filename)
        fluxes = self._getFluxes(inputData)
        coordErr = self._getCoordErr(inputData)
        ra = inputData[self.config.ra_name]
        dec = inputData[self.config.dec_name]
        matchedPixels = self.indexer.indexPoints(ra, dec)

        pixel_ids = set(matchedPixels)
        for pixelId in pixel_ids:
            # Hold the pixel's lock for the whole read-modify-write cycle.
            with fileLocks[pixelId]:
                self._doOnePixel(inputData, matchedPixels, pixelId, fluxes, coordErr)

        with FILE_PROGRESS.get_lock():
            nTotal = self.nInputFiles
            oldPercent = 100 * FILE_PROGRESS.value / nTotal
            FILE_PROGRESS.value += 1
            nDone = FILE_PROGRESS.value
            percent = 100 * nDone / nTotal
            # Only log when a new whole percent has been reached.
            if np.floor(percent) - np.floor(oldPercent) >= 1:
                self.log.info("Completed %d / %d files: %d %% complete ",
                              nDone, nTotal, percent)
        return pixel_ids

    def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes, coordErr):
        """Append the rows of ``inputData`` that fall in one HTM pixel to
        that pixel's catalog (created if necessary) and write it to disk.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The data from one input file.
        matchedPixels : `numpy.ndarray`
            The row-matched pixel indexes corresponding to ``inputData``.
        pixelId : `int`
            The pixel index we are currently processing.
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coordinate error fields of the
            output catalog, keyed by output column name.
        """
        idx = np.nonzero(matchedPixels == pixelId)[0]
        nNew = len(idx)
        catalog = self.getCatalog(pixelId, self.schema, nNew)
        # The new rows are the last ``nNew`` rows of the catalog.
        newRows = slice(-nNew, None)

        for outputRow, inputRow in zip(catalog[newRows], inputData[idx]):
            self._fillRecord(outputRow, inputRow)

        # `_setIds` may read and advance the shared counter, so lock it.
        with COUNTER.get_lock():
            self._setIds(inputData[idx], catalog)

        # Fluxes are looked up through the key map ...
        for name, array in fluxes.items():
            catalog[self.key_map[name]][newRows] = array[idx]

        # ... while coordinate errors are addressed by column name directly.
        for name, array in coordErr.items():
            catalog[name][newRows] = array[idx]

        catalog.writeFits(self.filenames[pixelId])

    def _setIds(self, inputData, catalog):
        """Fill the trailing ``len(inputData)`` entries of the catalog's
        ``id`` column.

        The ids are copied from the ``self.config.id_name`` input column when
        that is set; otherwise consecutive values are taken from the global
        running counter, which is advanced accordingly.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data that is being processed.
        catalog : `lsst.afw.table.SimpleCatalog`
            The output catalog to fill the ids.
        """
        nRows = len(inputData)
        tail = slice(-nRows, None)
        if self.config.id_name:
            catalog['id'][tail] = inputData[self.config.id_name]
            return
        firstId = COUNTER.value
        nextId = firstId + nRows
        catalog['id'][tail] = np.arange(firstId, nextId)
        COUNTER.value = nextId

    def getCatalog(self, pixelId, schema, nNewElements):
        """Return the catalog for a pixel, enlarged by ``nNewElements`` rows.

        The catalog is read from ``self.filenames[pixelId]`` if that file
        exists; otherwise a new one is created from ``schema`` and passed to
        ``addRefCatMetadata``.

        Parameters
        ----------
        pixelId : `int`
            Identifier (key into ``self.filenames``) of the catalog to
            retrieve.
        schema : `lsst.afw.table.Schema`
            Schema to use in catalog creation if it does not exist.
        nNewElements : `int`
            The number of new elements that will be added to the catalog,
            so space can be preallocated.

        Returns
        -------
        catalog : `lsst.afw.table.SimpleCatalog`
            The new or read-and-resized catalog for ``pixelId``.
        """
        path = self.filenames[pixelId]
        # The existence check is safe because callers lock on this file
        # before calling getCatalog.
        if not os.path.isfile(path):
            catalog = afwTable.SimpleCatalog(schema)
            catalog.resize(nNewElements)
            self.addRefCatMetadata(catalog)
            return catalog

        existing = afwTable.SimpleCatalog.readFits(path)
        existing.resize(len(existing) + nNewElements)
        # Deep copy to ensure contiguity, so that column-assignment works.
        return existing.copy(deep=True)

    @staticmethod
    def computeCoord(row, ra_name, dec_name):
        """Build an ICRS coordinate from one row of the input catalog.

        Parameters
        ----------
        row : `numpy.ndarray`
            Row from catalog being converted.
        ra_name : `str`
            Name of RA key in catalog being converted.
        dec_name : `str`
            Name of Dec key in catalog being converted.

        Returns
        -------
        coord : `lsst.geom.SpherePoint`
            ICRS coordinate; the input values are interpreted as degrees.
        """
        # NOTE(review): `lsst.geom` is not imported directly in the visible
        # part of this file; presumably it is loaded as a side effect of the
        # other lsst imports -- confirm.
        ra = row[ra_name]
        dec = row[dec_name]
        return lsst.geom.SpherePoint(ra, dec, lsst.geom.degrees)

    def _getCoordErr(self, inputData):
        """Compute the ra/dec error columns for the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute coordinate errors for.

        Returns
        -------
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr and coord_decErr
            fields in the output catalog (in radians). Empty when no
            coordinate error unit was configured.

        Notes
        -----
        This does not handle the ra/dec covariance field; that is the job of
        `_setCoordinateCovariance`.
        """
        # `coord_err_unit` only exists when the config specified a unit.
        if not hasattr(self, "coord_err_unit"):
            return {}
        unit = self.coord_err_unit
        return {
            'coord_raErr': u.Quantity(inputData[self.config.ra_err_name],
                                      unit).to_value(u.radian),
            'coord_decErr': u.Quantity(inputData[self.config.dec_err_name],
                                       unit).to_value(u.radian),
        }

    def _setFlags(self, record, row):
        """Set the boolean flag fields of an output record.

        Only flags from ``_flags`` that are present in the record's schema
        are set; the input column for each is named by the config attribute
        ``is_<flag>_name``.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : `numpy.ndarray`
            Row from catalog being converted.
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag not in names:
                continue
            columnName = getattr(self.config, f'is_{flag}_name')
            record.set(self.key_map[flag], bool(row[columnName]))

    def _getFluxes(self, inputData):
        """Compute the flux and flux error columns for the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        # Magnitude columns are treated as AB magnitudes and converted to nJy.
        result = {
            item+'_flux': (inputData[item]*u.ABmag).to_value(u.nJy)
            for item in self.config.mag_column_list
        }

        errorColumns = self.config.mag_err_column_map
        for magName in errorColumns:
            errName = errorColumns[magName]
            # TODO: multiply by 1e9 here until we have a replacement (see DM-16903)
            # NOTE: copy the arrays because the numpy strides may not be useable by C++.
            magErr = inputData[errName].copy()
            mag = inputData[magName].copy()
            result[magName+'_fluxErr'] = fluxErrFromABMagErr(magErr, mag)*1e9
        return result

    def _setProperMotion(self, record, row):
        """Set the proper motion and epoch fields of an output record.

        The proper motions are read from the configured columns, scaled by
        ``config.pm_scale``, and stored in the output as radians. Nothing is
        set when ``config.pm_ra_name`` is `None`.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being converted.
        """
        cfg = self.config
        if cfg.pm_ra_name is None:  # ConvertReferenceCatalogConfig.validate ensures all or none
            return

        # Factor taking an input value to radians: pm_scale is applied as a
        # number of milliarcseconds (degrees / (3600 * 1000)).
        radPerOriginal = np.radians(cfg.pm_scale)/(3600*1000)
        record.set(self.key_map["pm_ra"], row[cfg.pm_ra_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["pm_dec"], row[cfg.pm_dec_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["epoch"], self._epochToMjdTai(row[cfg.epoch_name]))

        if cfg.pm_ra_err_name is None:  # pm_dec_err_name also, by validation
            return
        record.set(self.key_map["pm_raErr"], row[cfg.pm_ra_err_name]*radPerOriginal)
        record.set(self.key_map["pm_decErr"], row[cfg.pm_dec_err_name]*radPerOriginal)

    def _setParallax(self, record, row):
        """Set the parallax and parallax error fields of an output record.

        Nothing is set when ``config.parallax_name`` is `None`.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being converted.
        """
        cfg = self.config
        if cfg.parallax_name is None:
            return
        scale = cfg.parallax_scale*lsst.geom.milliarcseconds
        record.set(self.key_map['parallax'], row[cfg.parallax_name]*scale)
        record.set(self.key_map['parallaxErr'], row[cfg.parallax_err_name]*scale)

    def _epochToMjdTai(self, nativeEpoch):
        """Convert an epoch in native format to TAI MJD (a float).

        The native format and time scale are taken from
        ``config.epoch_format`` and ``config.epoch_scale``.
        """
        epoch = astropy.time.Time(nativeEpoch,
                                  format=self.config.epoch_format,
                                  scale=self.config.epoch_scale)
        return epoch.tai.mjd

    def _setCoordinateCovariance(self, record, row):
        """Set the off-diagonal position covariance in a record of an indexed
        catalog.

        There is no generic way to determine covariance, so this base
        implementation always raises. Override this method in a subclass
        specialized for your dataset.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being converted.

        Raises
        ------
        NotImplementedError
            Always raised by this base-class implementation.
        """
        message = ("There is no default method for setting the covariance. Override this "
                   "method in a subclass specialized for your dataset.")
        raise NotImplementedError(message)

    def _setExtra(self, record, row):
        """Copy the configured extra columns into an output record.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being converted.
        """
        for extra_col in self.config.extra_col_names:
            raw = row[extra_col]
            # String-like entries read from text files arrive as numpy.str_
            # objects (a consequence of how numpy stores strings in fixed
            # column arrays); cast those to a python string, which is what
            # the python/C++ records expect.
            value = str(raw) if isinstance(raw, np.str_) else raw
            record.set(self.key_map[extra_col], value)

    def _fillRecord(self, record, row):
        """Fill one output record from one row of the input catalog.

        Sets the coordinate, flags, and extra columns, and, when
        ``config.full_position_information`` is true, also proper motion,
        parallax and the coordinate covariance.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being converted.
        """
        coord = self.computeCoord(row, self.config.ra_name, self.config.dec_name)
        record.setCoord(coord)
        self._setFlags(record, row)

        if self.config.full_position_information:
            positionSetters = (self._setProperMotion,
                               self._setParallax,
                               self._setCoordinateCovariance)
            for setter in positionSetters:
                setter(record, row)

        self._setExtra(record, row)

451 

452 

class ConvertGaiaManager(ConvertRefcatManager):
    """Special-case convert manager to deal with Gaia fluxes and the Gaia
    coordinate/parallax/proper-motion correlations.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Units of the input error columns, and of the output covariances.
        milliarcsec = u.milliarcsecond
        self.properMotionUnit = self.config.pm_scale * milliarcsec
        self.parallaxUnit = self.config.parallax_scale * milliarcsec
        self.outputUnit = u.radian * u.radian

    def _getFluxes(self, input):
        """Compute the G/BP/RP flux and flux error columns from Gaia
        instrumental fluxes.

        Parameters
        ----------
        input : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        def gaiaFluxToFlux(flux, zeroPoint):
            """Equations 5.19 and 5.30 from the Gaia calibration document define the
            conversion from Gaia electron/second fluxes to AB magnitudes.
            https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            """
            nanojansky = ((zeroPoint + -2.5 * np.log10(flux))*u.ABmag).to_value(u.nJy)
            # set 0 instrumental fluxes to 0 (instead of NaN/inf from the math)
            nanojansky[flux == 0] = 0
            return nanojansky

        # (flux column, output error column, flux-over-error column, zero point)
        # The zero points come from table 5.3 in this document;
        # https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
        bands = (
            ('phot_g_mean_flux', 'phot_g_mean_fluxErr', 'phot_g_mean_flux_over_error', 25.7934),
            ('phot_bp_mean_flux', 'phot_bp_mean_fluxErr', 'phot_bp_mean_flux_over_error', 25.3806),
            ('phot_rp_mean_flux', 'phot_rp_mean_fluxErr', 'phot_rp_mean_flux_over_error', 25.1161),
        )

        result = {}
        # Some fluxes are 0, so log10(flux) can give warnings. We handle the
        # zeros explicitly, so the warnings are irrelevant.
        with np.errstate(invalid='ignore', divide='ignore'):
            for fluxName, _, _, zeroPoint in bands:
                result[fluxName] = gaiaFluxToFlux(input[fluxName], zeroPoint)

        # The error is the converted flux divided by the signal-to-noise column.
        for fluxName, errName, snName, _ in bands:
            result[errName] = result[fluxName] / input[snName]

        return result

    def _setCoordinateCovariance(self, record, row):
        """Set the off-diagonal position covariance in a record of an indexed
        catalog.

        Convert the Gaia coordinate correlations into covariances: for each
        pair of parameters the covariance is the product of the two errors
        (with their units) and the correlation coefficient, expressed in
        ``self.outputUnit``.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being converted.
        """
        inputParams = ['ra', 'dec', 'parallax', 'pmra', 'pmdec']
        outputParams = ['coord_ra', 'coord_dec', 'parallax', 'pm_ra', 'pm_dec']
        # The Gaia standard for naming is to order the parameters as
        # (coordinates, parallax, proper motion), so they need to be reordered
        # as (coordinates, proper motion, parallax) to match the order used
        # in LSST code (e.g. 'coord_parallax_pm_ra_Cov' becomes
        # 'coord_pm_ra_parallax_Cov').
        reorder = [0, 1, 4, 2, 3]

        # NOTE(review): `coord_err_unit` is only set by the base class when
        # the config provides one; this method assumes it is present.
        inputUnits = [self.coord_err_unit, self.coord_err_unit, self.parallaxUnit,
                      self.properMotionUnit, self.properMotionUnit]

        # Visit every unordered pair once, with j < i.
        for i in range(5):
            for j in range(i):
                j_error = row[f'{inputParams[j]}_error'] * inputUnits[j]
                i_error = row[f'{inputParams[i]}_error'] * inputUnits[i]
                ij_corr = row[f'{inputParams[j]}_{inputParams[i]}_corr']
                cov = (i_error * j_error * ij_corr).to_value(self.outputUnit)

                # Switch from order of Gaia parallax and proper motion
                # parameters to the desired schema: the parameter with the
                # smaller `reorder` rank comes first in the field name.
                if reorder[i] < reorder[j]:
                    first, second = i, j
                else:
                    first, second = j, i

                fieldName = f'{outputParams[first]}_{outputParams[second]}_Cov'
                record.set(self.key_map[fieldName], cov)

528 

529 

530class ConvertGaiaXpManager(ConvertRefcatManager): 

531 """Special-case convert manager for Gaia XP spectrophotometry catalogs, 

532 that have fluxes/flux errors, instead of magnitudes/mag errors. The input 

533 flux and error values are in units of W/Hz/(m^2) (Gaia Collaboration, Montegriffo et al. 2022). 

534 The the flux and fluxErr fields in the output catalog have units of nJy. 

535 """ 

536 

537 def _getFluxes(self, inputData): 

538 result = {} 

539 for item in self.config.mag_column_list: 

540 

541 error_col_name = item.replace("_flux_", "_flux_error_") 

542 

543 result[item + "_flux"] = ( 

544 inputData[item] * u.Watt / u.Hz / u.meter / u.meter 

545 ).to_value(u.nJy) 

546 result[item + "_fluxErr"] = ( 

547 inputData[error_col_name] * u.Watt / u.Hz / u.meter / u.meter 

548 ).to_value(u.nJy) 

549 

550 return result