# This file is part of meas_algorithms.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

__all__ = ["IngestIndexManager", "IngestGaiaManager"]

import os.path
import itertools
import multiprocessing

import astropy.time
import astropy.units as u
import numpy as np

import lsst.geom
import lsst.sphgeom
import lsst.afw.table as afwTable
from lsst.afw.image import fluxErrFromABMagErr


# Global shared counter to keep track of source ids
# (multiprocess sharing is most easily done with a global).
# These module-level ints are rebound to multiprocessing.Value
# instances in IngestIndexManager.run().
COUNTER = 0
# Global shared counter to keep track of the number of files processed.
FILE_PROGRESS = 0
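# A minimal sketch of the sharing pattern used in run() below (standard
# library only; the name `counter` is illustrative):
#
#   counter = multiprocessing.Value('i', 0)  # 'i' -> C int, initially 0
#   with counter.get_lock():                 # serialize read-modify-write
#       counter.value += 1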


class IngestIndexManager:
    """
    Ingest a reference catalog from external files into a butler repository,
    using a multiprocessing Pool to speed up the work.

    Parameters
    ----------
    filenames : `dict` [`int`, `str`]
        A map from HTM pixel id to the output filename for that pixel.
    config : `lsst.meas.algorithms.IngestIndexedReferenceConfig`
        The Task configuration holding the field names.
    file_reader : `lsst.pipe.base.Task`
        The file reader to use to load the files.
    indexer : `lsst.meas.algorithms.HtmIndexer`
        The class used to compute the HTM pixel per coordinate.
    schema : `lsst.afw.table.Schema`
        The schema of the output catalog.
    key_map : `dict` [`str`, `lsst.afw.table.Key`]
        The mapping from output field names to keys in the Schema.
    htmRange : `tuple` [`int`]
        The start and end HTM pixel ids.
    addRefCatMetadata : callable
        A function called to add extra metadata to each output Catalog.
    log : `lsst.log.Log`
        The log to send messages to.
    """

    _flags = ['photometric', 'resolved', 'variable']

    def __init__(self, filenames, config, file_reader, indexer,
                 schema, key_map, htmRange, addRefCatMetadata, log):
        self.filenames = filenames
        self.config = config
        self.file_reader = file_reader
        self.indexer = indexer
        self.schema = schema
        self.key_map = key_map
        self.htmRange = htmRange
        self.addRefCatMetadata = addRefCatMetadata
        self.log = log
        if self.config.coord_err_unit is not None:
            # cache this to speed up coordinate conversions
            self.coord_err_unit = u.Unit(self.config.coord_err_unit)

    def run(self, inputFiles):
        """Index a set of input files from a reference catalog, and write the
        output to the appropriate filenames, in parallel.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read data from.
        """
        global COUNTER, FILE_PROGRESS
        self.nInputFiles = len(inputFiles)

        with multiprocessing.Manager() as manager:
            COUNTER = multiprocessing.Value('i', 0)
            FILE_PROGRESS = multiprocessing.Value('i', 0)
            fileLocks = manager.dict()
            self.log.info("Creating %s file locks.", self.htmRange[1] - self.htmRange[0])
            for i in range(self.htmRange[0], self.htmRange[1]):
                fileLocks[i] = manager.Lock()
            self.log.info("File locks created.")
            with multiprocessing.Pool(self.config.n_processes) as pool:
                # Pair each input file with the same shared dict of locks.
                pool.starmap(self._ingestOneFile, zip(inputFiles, itertools.repeat(fileLocks)))
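    # A hypothetical invocation sketch (the constructor arguments mirror the
    # class docstring; the `glob` import and path pattern are purely
    # illustrative):
    #
    #   manager = IngestIndexManager(filenames, config, file_reader, indexer,
    #                                schema, key_map, htmRange,
    #                                addRefCatMetadata, log)
    #   manager.run(sorted(glob.glob("refcat_shards/*.csv")))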

    def _ingestOneFile(self, filename, fileLocks):
        """Read and process one file, and write its records to the correct
        indexed files, while handling exceptions in a useful way so that they
        don't get swallowed by the multiprocess pool.

        Parameters
        ----------
        filename : `str`
            The file to process.
        fileLocks : `dict` [`int`, `multiprocessing.Lock`]
            A Lock for each HTM pixel; each pixel gets one file written, and
            we need to block when one process is accessing that file.
        """
        global FILE_PROGRESS
        inputData = self.file_reader.run(filename)
        fluxes = self._getFluxes(inputData)
        coordErr = self._getCoordErr(inputData)
        matchedPixels = self.indexer.indexPoints(inputData[self.config.ra_name],
                                                 inputData[self.config.dec_name])
        pixel_ids = set(matchedPixels)
        for pixelId in pixel_ids:
            with fileLocks[pixelId]:
                self._doOnePixel(inputData, matchedPixels, pixelId, fluxes, coordErr)
        with FILE_PROGRESS.get_lock():
            oldPercent = 100 * FILE_PROGRESS.value / self.nInputFiles
            FILE_PROGRESS.value += 1
            percent = 100 * FILE_PROGRESS.value / self.nInputFiles
            # only log each "new percent"
            if np.floor(percent) - np.floor(oldPercent) >= 1:
                self.log.info("Completed %d / %d files: %d %% complete",
                              FILE_PROGRESS.value,
                              self.nInputFiles,
                              percent)

    def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes, coordErr):
        """Process one HTM pixel, appending to an existing catalog or creating
        a new catalog, as needed.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The data from one input file.
        matchedPixels : `numpy.ndarray`
            The row-matched pixel indexes corresponding to ``inputData``.
        pixelId : `int`
            The pixel index we are currently processing.
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr, coord_decErr, and
            coord_ra_dec_Cov fields in the output catalog (in radians).
        """
        idx = np.where(matchedPixels == pixelId)[0]
        catalog = self.getCatalog(pixelId, self.schema, len(idx))
        for outputRow, inputRow in zip(catalog[-len(idx):], inputData[idx]):
            self._fillRecord(outputRow, inputRow)

        global COUNTER
        with COUNTER.get_lock():
            self._setIds(inputData[idx], catalog)

        # set fluxes from the pre-computed array
        for name, array in fluxes.items():
            catalog[self.key_map[name]][-len(idx):] = array[idx]

        # set coordinate errors from the pre-computed array
        for name, array in coordErr.items():
            catalog[name][-len(idx):] = array[idx]

        catalog.writeFits(self.filenames[pixelId])

    def _setIds(self, inputData, catalog):
        """Fill the `id` field of catalog with a running index, filling the
        last values up to the length of ``inputData``.

        Fill with `self.config.id_name` if specified, otherwise use the
        global running counter value.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data that is being processed.
        catalog : `lsst.afw.table.SimpleCatalog`
            The output catalog in which to fill the ids.
        """
        global COUNTER
        size = len(inputData)
        if self.config.id_name:
            catalog['id'][-size:] = inputData[self.config.id_name]
        else:
            idEnd = COUNTER.value + size
            catalog['id'][-size:] = np.arange(COUNTER.value, idEnd)
            COUNTER.value = idEnd

    def getCatalog(self, pixelId, schema, nNewElements):
        """Get a catalog from disk or create it if it doesn't exist.

        Parameters
        ----------
        pixelId : `int`
            Identifier for the catalog to retrieve.
        schema : `lsst.afw.table.Schema`
            Schema to use in catalog creation if it does not exist.
        nNewElements : `int`
            The number of new elements that will be added to the catalog,
            so space can be preallocated.

        Returns
        -------
        catalog : `lsst.afw.table.SimpleCatalog`
            The new or read-and-resized catalog specified by ``pixelId``.
        """
        # This is safe, because we lock on this file before getCatalog is called.
        if os.path.isfile(self.filenames[pixelId]):
            catalog = afwTable.SimpleCatalog.readFits(self.filenames[pixelId])
            catalog.resize(len(catalog) + nNewElements)
            return catalog.copy(deep=True)  # ensure contiguity, so that column-assignment works
        catalog = afwTable.SimpleCatalog(schema)
        catalog.resize(nNewElements)
        self.addRefCatMetadata(catalog)
        return catalog

    @staticmethod
    def computeCoord(row, ra_name, dec_name):
        """Create an ICRS coord. from a row of a catalog being ingested.

        Parameters
        ----------
        row : `numpy.ndarray`
            Row from catalog being ingested.
        ra_name : `str`
            Name of RA key in catalog being ingested.
        dec_name : `str`
            Name of Dec key in catalog being ingested.

        Returns
        -------
        coord : `lsst.geom.SpherePoint`
            ICRS coordinate.
        """
        return lsst.geom.SpherePoint(row[ra_name], row[dec_name], lsst.geom.degrees)
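    # A minimal usage sketch (the column names are hypothetical), assuming
    # RA/Dec in degrees in a numpy structured array:
    #
    #   row = np.array([(10.5, -3.2)], dtype=[("ra", "f8"), ("dec", "f8")])[0]
    #   coord = IngestIndexManager.computeCoord(row, "ra", "dec")
    #   coord.getRa().asDegrees()  # -> 10.5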

    def _getCoordErr(self, inputData):
        """Compute the ra/dec error fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute coordinate errors for.

        Returns
        -------
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr and coord_decErr
            fields in the output catalog (in radians).

        Notes
        -----
        This does not currently handle the ra/dec covariance field,
        ``coord_ra_dec_Cov``. That field may require extra work, as its units
        may be more complicated in external catalogs.
        """
        result = {}
        if hasattr(self, "coord_err_unit"):
            result['coord_raErr'] = u.Quantity(inputData[self.config.ra_err_name],
                                               self.coord_err_unit).to_value(u.radian)
            result['coord_decErr'] = u.Quantity(inputData[self.config.dec_err_name],
                                                self.coord_err_unit).to_value(u.radian)
        return result
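    # For example, if config.coord_err_unit were "milliarcsecond" (an
    # illustrative value), a 1.0 mas error would convert as:
    #
    #   u.Quantity(1.0, u.mas).to_value(u.radian)  # -> ~4.848e-9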

    def _setFlags(self, record, row):
        """Set flags in an output record.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : `numpy.ndarray`
            Row from catalog being ingested.
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(self.key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _getFluxes(self, inputData):
        """Compute the flux fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        result = {}
        for item in self.config.mag_column_list:
            result[item+'_flux'] = (inputData[item]*u.ABmag).to_value(u.nJy)
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                # TODO: multiply by 1e9 here until we have a replacement (see DM-16903)
                # NOTE: copy the arrays because the numpy strides may not be useable by C++.
                fluxErr = fluxErrFromABMagErr(inputData[error_col_name].copy(),
                                              inputData[err_key].copy())*1e9
                result[err_key+'_fluxErr'] = fluxErr
        return result
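    # For example, an AB magnitude of 20.0 converts to
    #
    #   (20.0*u.ABmag).to_value(u.nJy)  # -> ~36307.8
    #
    # since the AB zero point is ~3631 Jy: 3631e9 * 10**(-20.0/2.5) nJy.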

    def _setProperMotion(self, record, row):
        """Set proper motion fields in a record of an indexed catalog.

        The proper motions are read from the specified columns,
        scaled appropriately, and installed in the appropriate
        columns of the output.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        if self.config.pm_ra_name is None:  # IngestIndexedReferenceConfig.validate ensures all or none
            return
        # pm_scale converts the native units to mas/yr; the rest of the
        # factor converts mas to radians.
        radPerOriginal = np.radians(self.config.pm_scale)/(3600*1000)
        record.set(self.key_map["pm_ra"], row[self.config.pm_ra_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["pm_dec"], row[self.config.pm_dec_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["epoch"], self._epochToMjdTai(row[self.config.epoch_name]))
        if self.config.pm_ra_err_name is not None:  # pm_dec_err_name also, by validation
            record.set(self.key_map["pm_raErr"], row[self.config.pm_ra_err_name]*radPerOriginal)
            record.set(self.key_map["pm_decErr"], row[self.config.pm_dec_err_name]*radPerOriginal)

    def _setParallax(self, record, row):
        """Set the parallax fields in a record of a refcat.
        """
        if self.config.parallax_name is None:
            return
        scale = self.config.parallax_scale*lsst.geom.milliarcseconds
        record.set(self.key_map['parallax'], row[self.config.parallax_name]*scale)
        record.set(self.key_map['parallaxErr'], row[self.config.parallax_err_name]*scale)

    def _epochToMjdTai(self, nativeEpoch):
        """Convert an epoch in native format to TAI MJD (a float).
        """
        return astropy.time.Time(nativeEpoch, format=self.config.epoch_format,
                                 scale=self.config.epoch_scale).tai.mjd
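    # For example, with epoch_format="iso" and epoch_scale="tai" (purely
    # illustrative config values):
    #
    #   astropy.time.Time("2015-06-30", format="iso", scale="tai").tai.mjd
    #   # -> 57203.0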

    def _setExtra(self, record, row):
        """Set extra data fields in a record of an indexed catalog.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string-like entries,
            # numpy stores them as its own internal type, a numpy.str_
            # object. This seems to be a consequence of how numpy stores
            # string-like objects in fixed-width column arrays. This checks
            # if any of the values to be added to the catalog are numpy
            # string types, and if they are, casts them to a python string,
            # which is what the C++ record objects expect.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(self.key_map[extra_col], value)

    def _fillRecord(self, record, row):
        """Fill a record in an indexed catalog to be persisted.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        record.setCoord(self.computeCoord(row, self.config.ra_name, self.config.dec_name))

        self._setFlags(record, row)
        self._setProperMotion(record, row)
        self._setParallax(record, row)
        self._setExtra(record, row)


class IngestGaiaManager(IngestIndexManager):
    """Special-case ingest manager to deal with Gaia fluxes.
    """
    def _getFluxes(self, input):
        result = {}

        def gaiaFluxToFlux(flux, zeroPoint):
            """Equations 5.19 and 5.30 from the Gaia calibration document
            define the conversion from Gaia electron/second fluxes to AB
            magnitudes.
            https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            """
            result = ((zeroPoint - 2.5 * np.log10(flux))*u.ABmag).to_value(u.nJy)
            # set 0 instrumental fluxes to 0 (instead of NaN/inf from the math)
            result[flux == 0] = 0
            return result
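        # A worked example (the 1000 e-/s flux is purely illustrative): with
        # the G-band zero point 25.7934 used below, flux = 1000 e-/s maps to
        # AB magnitude 25.7934 - 2.5*log10(1000) = 18.2934, or ~1.75e5 nJy.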

        # Some fluxes are 0, so log10(flux) can give warnings. We handle the
        # zeros explicitly, so the warnings are irrelevant.
        with np.errstate(invalid='ignore', divide='ignore'):
            # The constants below come from table 5.3 in this document:
            # https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            result['phot_g_mean_flux'] = gaiaFluxToFlux(input['phot_g_mean_flux'], 25.7934)
            result['phot_bp_mean_flux'] = gaiaFluxToFlux(input['phot_bp_mean_flux'], 25.3806)
            result['phot_rp_mean_flux'] = gaiaFluxToFlux(input['phot_rp_mean_flux'], 25.1161)

        result['phot_g_mean_fluxErr'] = result['phot_g_mean_flux'] / input['phot_g_mean_flux_over_error']
        result['phot_bp_mean_fluxErr'] = result['phot_bp_mean_flux'] / input['phot_bp_mean_flux_over_error']
        result['phot_rp_mean_fluxErr'] = result['phot_rp_mean_flux'] / input['phot_rp_mean_flux_over_error']

        return result