Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ap_association. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

"""Simple source-association task used by ap_verify.

The module exposes a config class and a task that match newly detected
DIASources to existing DIAObjects and create new DIAObjects for the
DIASources that have no match.
"""

# Public names exported by ``from <module> import *``.
__all__ = [
    "AssociationConfig",
    "AssociationTask",
]

26 

27import numpy as np 

28import pandas as pd 

29from scipy.spatial import cKDTree 

30 

31import lsst.geom as geom 

32import lsst.pex.config as pexConfig 

33import lsst.pipe.base as pipeBase 

34 

35from .diaCalculation import DiaObjectCalculationTask 

36 

# Make pandas raise an error, instead of only warning, when a value is set
# through chained indexing, so that unsafe column/array writes cannot go
# unnoticed.
pd.set_option("mode.chained_assignment", "raise")

39 

40 

class AssociationConfig(pexConfig.Config):
    """Config class for AssociationTask.

    Holds the maximum DIASource/DIAObject match distance and the
    configuration of the sub-task that computes DIAObject summary
    statistics.
    """
    maxDistArcSeconds = pexConfig.Field(
        dtype=float,
        doc='Maximum distance in arcseconds to test for a DIASource to be a '
        'match to a DIAObject.',
        default=1.0,
    )
    diaCalculation = pexConfig.ConfigurableField(
        target=DiaObjectCalculationTask,
        doc="Task to compute summary statistics for DiaObjects.",
    )

    def setDefaults(self):
        """Select the default set of DiaObject calculation plugins.
        """
        super().setDefaults()
        self.diaCalculation.plugins = ["ap_meanPosition",
                                       "ap_HTMIndex",
                                       "ap_nDiaSources",
                                       "ap_diaObjectFlag",
                                       "ap_meanFlux",
                                       "ap_percentileFlux",
                                       "ap_sigmaFlux",
                                       "ap_chi2Flux",
                                       "ap_madFlux",
                                       "ap_skewFlux",
                                       "ap_minMaxFlux",
                                       "ap_maxSlopeFlux",
                                       "ap_meanErrFlux",
                                       "ap_linearFit",
                                       "ap_stetsonJ",
                                       "ap_meanTotFlux",
                                       "ap_sigmaTotFlux"]

    def validate(self):
        """Validate the configuration.

        Raises
        ------
        ValueError
            Raised if the ``ap_HTMIndex`` plugin is not found in
            ``diaCalculation.plugins``.
        """
        # Run the base-class validation first so that the individual fields
        # (and the nested diaCalculation config) are still checked; overriding
        # ``validate`` without this call silently skips those checks.
        super().validate()
        # NOTE(review): confirm that membership on ``plugins`` tests the
        # *enabled* plugin names rather than every registered plugin.
        if "ap_HTMIndex" not in self.diaCalculation.plugins:
            raise ValueError("AssociationTask requires the ap_HTMIndex plugin "
                             "be enabled for proper insertion into the Apdb.")

78 

79 

class AssociationTask(pipeBase.Task):
    """Associate DIASources into existing DIAObjects.

    This task performs the association of detected DIASources in a visit
    with the previous DIAObjects detected over time. It also creates new
    DIAObjects out of DIASources that cannot be associated with previously
    detected DIAObjects.
    """

    ConfigClass = AssociationConfig
    _DefaultName = "association"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.makeSubtask("diaCalculation")

    @pipeBase.timeMethod
    def run(self,
            diaSources,
            diaObjects,
            diaSourceHistory):
        """Associate the new DiaSources with existing or new DiaObjects,
        updating the DiaObjects.

        Parameters
        ----------
        diaSources : `pandas.DataFrame`
            New DIASources to be associated with existing DIAObjects. Unless
            rows have to be dropped for NaN coordinates, this DataFrame is
            modified in place: its ``diaObjectId`` column is filled and it
            is re-indexed on
            ``["diaObjectId", "filterName", "diaSourceId"]``.
        diaObjects : `pandas.DataFrame`
            Existing diaObjects from the Apdb.
        diaSourceHistory : `pandas.DataFrame`
            12 month DiaSource history of the loaded ``diaObjects``.

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            Results struct with components.

            - ``diaObjects`` : Complete set of dia_objects covering the input
              exposure. Catalog contains newly created, updated, and untouched
              diaObjects. (`pandas.DataFrame`)
            - ``updatedDiaObjects`` : Subset of DiaObjects that were updated
              or created during processing. (`pandas.DataFrame`)
            - ``diaSources`` : DiaSources detected in this ccdVisit with
              associated diaObjectIds. (`pandas.DataFrame`)

        Raises
        ------
        RuntimeError
            Raised if duplicate index values are found in the merged
            DiaSource history, in the DiaObjects after association, or in
            either catalog returned by the DiaObject calculation.
        """
        diaSources = self.check_dia_source_radec(diaSources)

        matchResult = self.associate_sources(diaObjects, diaSources)

        # Now that we know the DiaObjects our new DiaSources are associated
        # with, we index the new DiaSources the same way as the full history
        # and merge the tables.
        diaSources.set_index(["diaObjectId", "filterName", "diaSourceId"],
                             drop=False,
                             inplace=True)
        # Test for DiaSource duplication first. If duplicates are found,
        # this likely means this is duplicate data being processed and sent
        # to the Apdb.
        # ``pd.concat`` is used rather than ``DataFrame.append``, which was
        # removed in pandas 2.0.
        mergedDiaSourceHistory = pd.concat([diaSourceHistory, diaSources],
                                           sort=True)
        if mergedDiaSourceHistory.index.has_duplicates:
            raise RuntimeError(
                "Duplicate DiaSources found after association and merging "
                "with history. This is likely due to re-running data with an "
                "already populated Apdb. If this was not the case then there "
                "was an unexpected failure in Association while matching "
                "sources to objects, and should be reported. Exiting.")

        diaObjects = pd.concat([diaObjects, matchResult.new_dia_objects],
                               sort=True)
        # Double check to make sure there are no duplicates in the DiaObject
        # table after association.
        if diaObjects.index.has_duplicates:
            raise RuntimeError(
                "Duplicate DiaObjects created after association. This is "
                "likely due to re-running data with an already populated "
                "Apdb. If this was not the case then there was an unexpected "
                "failure in Association while matching and creating new "
                "DiaObjects and should be reported. Exiting.")

        # Get the current filter being processed. Only the first row is
        # consulted. NOTE(review): this presumes all DiaSources share one
        # filter and that at least one DiaSource remains -- confirm.
        filterName = diaSources["filterName"].iat[0]

        # Update previously existing DIAObjects with the information from
        # their newly associated DIASources and create new DIAObjects from
        # unassociated sources.
        updatedResults = self.diaCalculation.run(
            diaObjects,
            mergedDiaSourceHistory,
            matchResult.associated_dia_object_ids,
            filterName)

        allDiaObjects = updatedResults.diaObjectCat
        updatedDiaObjects = updatedResults.updatedDiaObjects
        if allDiaObjects.index.has_duplicates:
            raise RuntimeError(
                "Duplicate DiaObjects (loaded + updated) created after "
                "DiaCalculation. This is unexpected behavior and should be "
                "reported. Exiting.")
        if updatedDiaObjects.index.has_duplicates:
            raise RuntimeError(
                "Duplicate DiaObjects (updated) created after "
                "DiaCalculation. This is unexpected behavior and should be "
                "reported. Exiting.")

        return pipeBase.Struct(
            diaObjects=allDiaObjects,
            updatedDiaObjects=updatedDiaObjects,
            diaSources=diaSources,
        )

    def check_dia_source_radec(self, dia_sources):
        """Check that all DiaSources have non-NaN values for RA/DEC.

        If one or more DiaSources are found to have NaN values, throw a
        warning to the log with the ids of the offending sources. Drop them
        from the table.

        Parameters
        ----------
        dia_sources : `pandas.DataFrame`
            Input DiaSources to check for NaN values.

        Returns
        -------
        trimmed_sources : `pandas.DataFrame`
            DataFrame of DiaSources trimmed of all entries with NaN values for
            RA/DEC. This is the input object itself when nothing has to be
            dropped, and an independent copy otherwise.
        """
        nan_mask = dia_sources["ra"].isnull() | dia_sources["decl"].isnull()
        if not nan_mask.any():
            return dia_sources

        # Select the offending ids with the boolean mask itself so that the
        # lookup does not depend on the DataFrame having a default integer
        # index.
        for src_id in dia_sources.loc[nan_mask, "diaSourceId"]:
            self.log.warning(
                "DiaSource %i has NaN value for RA/DEC, "
                "dropping from association.", src_id)
        # Return a real copy: later steps write to this table, and writing
        # to a filtered selection raises under the module-wide
        # ``chained_assignment = 'raise'`` pandas setting.
        return dia_sources[~nan_mask].copy()

    @pipeBase.timeMethod
    def associate_sources(self, dia_objects, dia_sources):
        """Associate the input DIASources with the catalog of DIAObjects.

        DiaObject DataFrame must be indexed on ``diaObjectId``.

        Parameters
        ----------
        dia_objects : `pandas.DataFrame`
            Catalog of DIAObjects to attempt to associate the input
            DIASources into.
        dia_sources : `pandas.DataFrame`
            DIASources to associate into the DIAObjectCollection. The
            ``diaObjectId`` column is written in place.

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            Results struct with components:

            - ``associated_dia_object_ids`` : DIAObject id assigned to each
              DIASource, in source order (`numpy.ndarray` of `numpy.uint64`).
            - ``new_dia_objects`` : Newly created DiaObjects from
              unassociated diaSources. (`pandas.DataFrame`)
            - ``n_updated_dia_objects`` : Number of previously known
              dia_objects with newly associated DIASources. (`int`).
            - ``n_new_dia_objects`` : Number of newly created DIAObjects from
              unassociated DIASources (`int`).
            - ``n_unassociated_dia_objects`` : Number of previous DIAObjects
              that were not associated to a new DIASource (`int`).
        """
        scores = self.score(
            dia_objects, dia_sources,
            self.config.maxDistArcSeconds * geom.arcseconds)
        match_result = self.match(dia_objects, dia_sources, scores)

        self._add_association_meta_data(match_result)

        return match_result

    @pipeBase.timeMethod
    def score(self, dia_objects, dia_sources, max_dist):
        """Compute a quality score for each dia_source/dia_object pair
        between this catalog of DIAObjects and the input DIASource catalog.

        Each DIASource is scored only against its nearest DIAObject. If that
        pair is separated by more than ``max_dist`` no score is computed.

        Parameters
        ----------
        dia_objects : `pandas.DataFrame`
            A contiguous catalog of DIAObjects to score against dia_sources.
            Its index values are reported as the matched object ids.
        dia_sources : `pandas.DataFrame`
            A contiguous catalog of dia_sources to "score" based on distance
            and (in the future) other metrics.
        max_dist : `lsst.geom.Angle`
            Maximum allowed distance to compute a score for a given DIAObject
            DIASource pair.

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            Results struct with components:

            - ``scores``: Euclidean distance between the unit vectors of each
              DIASource and its nearest DIAObject (array-like of `float`).
            - ``obj_idxs``: positional indexes of the matched DIAObjects in
              the catalog. (array-like of `int`)
            - ``obj_ids``: index values (ids) of the matched DIAObjects
              (array-like of `int`).

            Default values for these arrays are
            INF, -1, and 0 respectively for unassociated sources.
        """
        n_sources = len(dia_sources)
        scores = np.full(n_sources, np.inf, dtype=np.float64)
        obj_idxs = np.full(n_sources, -1, dtype=np.int64)
        obj_ids = np.full(n_sources, 0, dtype=np.int64)

        if len(dia_objects) > 0:
            spatial_tree = self._make_spatial_tree(dia_objects)
            # The angle in radians is used directly as the bound on the
            # distance between unit vectors.
            distances, nearest = spatial_tree.query(
                self._radec_to_xyz(dia_sources),
                distance_upper_bound=max_dist.asRadians())

            # The tree reports an infinite distance (and an out-of-range
            # index) for sources without a neighbor inside the bound; copy
            # only the real matches so the documented defaults are kept for
            # every other source.
            matched = np.isfinite(distances)
            scores[matched] = distances[matched]
            obj_idxs[matched] = nearest[matched]
            obj_ids[matched] = dia_objects.index.to_numpy()[nearest[matched]]

        return pipeBase.Struct(
            scores=scores,
            obj_idxs=obj_idxs,
            obj_ids=obj_ids)

    def _make_spatial_tree(self, dia_objects):
        """Create a searchable kd-tree from the input dia_object positions.

        Parameters
        ----------
        dia_objects : `pandas.DataFrame`
            A catalog of DIAObjects to create the tree from.

        Returns
        -------
        kd_tree : `scipy.spatial.cKDTree`
            Searchable kd-tree created from the positions of the DIAObjects.
        """
        return cKDTree(self._radec_to_xyz(dia_objects))

    def _radec_to_xyz(self, catalog):
        """Convert input ra/dec coordinates to spherical unit-vectors.

        Parameters
        ----------
        catalog : `pandas.DataFrame`
            Catalog to produce spherical unit-vectors from. The ``ra`` and
            ``decl`` columns are read and converted from degrees to radians.

        Returns
        -------
        vectors : `numpy.ndarray`, (N, 3)
            Output unit-vectors
        """
        ras = np.radians(catalog["ra"].to_numpy())
        # Work with the co-latitude (angle from the pole).
        colatitudes = np.pi / 2 - np.radians(catalog["decl"].to_numpy())
        sin_colatitudes = np.sin(colatitudes)

        vectors = np.empty((len(ras), 3))
        vectors[:, 0] = sin_colatitudes * np.cos(ras)
        vectors[:, 1] = sin_colatitudes * np.sin(ras)
        vectors[:, 2] = np.cos(colatitudes)

        return vectors

    @pipeBase.timeMethod
    def match(self, dia_objects, dia_sources, score_struct):
        """Match DIAsources to DIAObjects given a score and create new
        DIAObject Ids for new unassociated DIASources.

        Sources are processed from best (smallest) score to worst and each
        DIAObject accepts at most one DIASource. A DIASource whose nearest
        DIAObject has already been taken is not re-matched to another
        object; it seeds a new DIAObject whose id is the ``diaSourceId``.

        Parameters
        ----------
        dia_objects : `pandas.DataFrame`
            A catalog of DIAObjects to associate to DIASources.
        dia_sources : `pandas.DataFrame`
            A contiguous catalog of dia_sources for which the set of scores
            has been computed on with ``score``. Its ``diaObjectId`` column
            is written in place.
        score_struct : `lsst.pipe.base.Struct`
            Results struct with components:

            - ``scores``: match quality of each DIASource against its
              nearest DIAObject (array-like of `float`).
            - ``obj_ids``: ids of the matched DIAObjects
              (array-like of `int`).
            - ``obj_idxs``: indexes of the matched DIAObjects in the catalog.
              (array-like of `int`)

            Sources without a match have a non-finite score; their
            ``obj_ids`` and ``obj_idxs`` entries are ignored.

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            Results struct with components:

            - ``associated_dia_object_ids`` : DIAObject id assigned to each
              DIASource, in source order (`numpy.ndarray` of `numpy.uint64`).
            - ``new_dia_objects`` : Newly created DiaObjects from unassociated
              diaSources, indexed on ``diaObjectId``. (`pandas.DataFrame`)
            - ``n_updated_dia_objects`` : Number of previously known
              dia_objects with newly associated DIASources. (`int`).
            - ``n_new_dia_objects`` : Number of newly created DIAObjects from
              unassociated DIASources (`int`).
            - ``n_unassociated_dia_objects`` : Number of previous DIAObjects
              that were not associated to a new DIASource (`int`).
        """
        n_previous_dia_objects = len(dia_objects)
        n_dia_sources = len(dia_sources)
        used_dia_object = np.zeros(n_previous_dia_objects, dtype=bool)
        used_dia_source = np.zeros(n_dia_sources, dtype=bool)
        associated_dia_object_ids = np.zeros(n_dia_sources, dtype=np.uint64)

        # We sort from best match to worst to effectively perform a
        # "handshake" match where both the DIASources and DIAObjects agree
        # they are the best match. By sorting this way, non-finite scores
        # (those sources that have no match and will create new DIAObjects)
        # are placed at the end of the array.
        for score_idx in score_struct.scores.argsort(axis=None):
            if not np.isfinite(score_struct.scores[score_idx]):
                # Thanks to the sorting, every remaining source is also
                # unmatched; they are handled below as new DIAObjects.
                break
            dia_obj_idx = score_struct.obj_idxs[score_idx]
            if used_dia_object[dia_obj_idx]:
                continue
            used_dia_object[dia_obj_idx] = True
            used_dia_source[score_idx] = True
            associated_dia_object_ids[score_idx] = \
                score_struct.obj_ids[score_idx]
        n_updated_dia_objects = int(np.count_nonzero(used_dia_source))

        # Every source left unmatched seeds a new DIAObject that reuses the
        # source id as its object id.
        unmatched = np.logical_not(used_dia_source)
        new_object_ids = dia_sources["diaSourceId"].to_numpy()[unmatched]
        associated_dia_object_ids[unmatched] = new_object_ids
        new_dia_objects = [self._initialize_dia_object(src_id)
                           for src_id in new_object_ids]
        n_new_dia_objects = len(new_dia_objects)

        # Record the association on the sources with a single positional
        # column assignment, so the result does not depend on the index
        # labels of ``dia_sources``. An existing column keeps its dtype.
        object_id_column = pd.Series(
            associated_dia_object_ids.astype(np.int64))
        if "diaObjectId" in dia_sources.columns:
            object_id_column = object_id_column.astype(
                dia_sources["diaObjectId"].dtype)
        dia_sources["diaObjectId"] = object_id_column.values

        if len(new_dia_objects) > 0:
            new_dia_objects = pd.DataFrame(data=new_dia_objects)
        else:
            # Create a junk DiaObject to get the columns.
            tmpObj = self._initialize_dia_object(0)
            new_dia_objects = pd.DataFrame(data=new_dia_objects,
                                           columns=tmpObj.keys())
        new_dia_objects.set_index("diaObjectId", inplace=True, drop=False)

        n_unassociated_dia_objects = \
            n_previous_dia_objects - n_updated_dia_objects
        return pipeBase.Struct(
            associated_dia_object_ids=associated_dia_object_ids,
            new_dia_objects=new_dia_objects,
            n_updated_dia_objects=n_updated_dia_objects,
            n_new_dia_objects=n_new_dia_objects,
            n_unassociated_dia_objects=n_unassociated_dia_objects,)

    def _initialize_dia_object(self, objId):
        """Create a new DiaObject with values required to be initialized by the
        Apdb.

        Parameters
        ----------
        objId : `int`
            ``diaObjectId`` value for the new DiaObject.

        Returns
        -------
        diaObject : `dict`
            Newly created DiaObject with keys:

            ``diaObjectId``
                Unique DiaObjectId (`int`).
            ``pmParallaxNdata``
                Number of data points used for parallax calculation (`int`).
            ``nearbyObj1``
                Id of a nearbyObject in the Object table (`int`).
            ``nearbyObj2``
                Id of a nearbyObject in the Object table (`int`).
            ``nearbyObj3``
                Id of a nearbyObject in the Object table (`int`).
            ``flags``
                Initialized to 0 (`int`).
            ``?PSFluxNdata``
                Number of data points used to calculate point source flux
                summary statistics in each bandpass (`int`).
        """
        new_dia_object = {"diaObjectId": objId,
                          "pmParallaxNdata": 0,
                          "nearbyObj1": 0,
                          "nearbyObj2": 0,
                          "nearbyObj3": 0,
                          "flags": 0}
        # One zeroed counter per bandpass.
        for band in ("u", "g", "r", "i", "z", "y"):
            new_dia_object["%sPSFluxNdata" % band] = 0
        return new_dia_object

    def _add_association_meta_data(self, match_result):
        """Store summaries of the association step in the task metadata.

        Parameters
        ----------
        match_result : `lsst.pipe.base.Struct`
            Results struct of which the following components are read:

            - ``n_updated_dia_objects`` : Number of previously known
              dia_objects with newly associated DIASources. (`int`).
            - ``n_new_dia_objects`` : Number of newly created DIAObjects from
              unassociated DIASources (`int`).
            - ``n_unassociated_dia_objects`` : Number of previous DIAObjects
              that were not associated to a new DIASource (`int`).
        """
        self.metadata.add('numUpdatedDiaObjects',
                          match_result.n_updated_dia_objects)
        self.metadata.add('numNewDiaObjects',
                          match_result.n_new_dia_objects)
        self.metadata.add('numUnassociatedDiaObjects',
                          match_result.n_unassociated_dia_objects)