Coverage for python/lsst/meas/astrom/matcher_probabilistic.py: 22%

257 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-16 03:58 -0700

1# This file is part of meas_astrom. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22__all__ = ['ConvertCatalogCoordinatesConfig', 'MatchProbabilisticConfig', 'MatcherProbabilistic'] 

23 

24import lsst.pex.config as pexConfig 

25 

26from dataclasses import dataclass 

27import logging 

28import numpy as np 

29import pandas as pd 

30from scipy.spatial import cKDTree 

31from smatch.matcher import Matcher 

32import time 

33from typing import Callable, Set 

34 

35logger_default = logging.getLogger(__name__) 

36 

37 

def _mul_column(column: np.array, value: float):
    """Scale a column in place by a multiplicative factor.

    Parameters
    ----------
    column : `numpy.array`
        The values to scale. They are modified in place (``*=``) when a
        scaling is actually applied.
    value : `float`
        The multiplicative factor. Nothing is done if it is None or
        equal to 1.

    Returns
    -------
    column : `numpy.array`
        The same object that was passed in.
    """
    is_identity = value is None or value == 1
    if not is_identity:
        column *= value
    return column

42 

43 

def _radec_to_xyz(ra, dec):
    """Convert ra/dec coordinates to unit vectors on the sphere.

    Parameters
    ----------
    ra, dec: `numpy.ndarray`
        Arrays of right ascension/declination in degrees.

    Returns
    -------
    vectors : `numpy.ndarray`, (N, 3)
        Output unit vectors.

    Raises
    ------
    ValueError
        Raised if ``ra`` and ``dec`` do not have the same size.
    """
    if ra.size != dec.size:
        raise ValueError('ra and dec must be same size')

    longitude = np.radians(ra)
    # Angle measured from the pole rather than from the equator
    colatitude = np.pi / 2 - np.radians(dec)
    sin_colatitude = np.sin(colatitude)

    xyz = np.empty((longitude.size, 3))
    xyz[:, 0] = sin_colatitude * np.cos(longitude)
    xyz[:, 1] = sin_colatitude * np.sin(longitude)
    xyz[:, 2] = np.cos(colatitude)
    return xyz

69 

70 

@dataclass
class CatalogExtras:
    """Cache frequently-referenced (meta)data relevant for matching a catalog.

    Parameters
    ----------
    catalog : `pandas.DataFrame`
        A pandas catalog to store extra information for. Only its length
        is read here.
    select : `numpy.array`
        A numpy boolean array of the same length as catalog to be used for
        target selection. If None, all rows are selected.
    coordinate_factor : `float`
        A multiplicative factor for the catalog coordinates; stored as given.
    """

    # Number of rows in the catalog
    n: int
    # Row numbers of the selected rows
    indices: np.array
    # Boolean selection array
    select: np.array

    coordinate_factor: float = None

    def __init__(self, catalog: pd.DataFrame, select: np.array = None, coordinate_factor: float = None):
        n_rows = len(catalog)
        self.n = n_rows
        if select is None:
            # No selection supplied: every row is a candidate
            self.select = np.ones(n_rows, dtype=bool)
            self.indices = np.arange(n_rows)
        else:
            self.select = select
            self.indices = np.flatnonzero(select)
        self.coordinate_factor = coordinate_factor

95 

96 

@dataclass(frozen=True)
class ComparableCatalog:
    """A catalog of sources whose coordinate columns are in some standard
    format/units, bundled with the coordinate values themselves.

    Instances are immutable (the dataclass is frozen).

    Parameters
    ----------
    catalog : `pandas.DataFrame`
        A catalog with comparable coordinate columns.
    column_coord1 : `str`
        The first spatial coordinate column name.
    column_coord2 : `str`
        The second spatial coordinate column name.
    coord1 : `numpy.array`
        The first spatial coordinate values.
    coord2 : `numpy.array`
        The second spatial coordinate values.
    extras : `CatalogExtras`
        Extra cached (meta)data for the `catalog`.
    """

    catalog: pd.DataFrame
    column_coord1: str
    column_coord2: str
    coord1: np.array
    coord2: np.array
    extras: CatalogExtras

121 

122 

class ConvertCatalogCoordinatesConfig(pexConfig.Config):
    """Configuration for formatting (and optionally converting) the
    coordinates of a reference and a target catalog so they can be compared.
    """

    column_ref_coord1 = pexConfig.Field[str](
        default='ra',
        doc='The reference table column for the first spatial coordinate (usually x or ra).',
    )
    column_ref_coord2 = pexConfig.Field[str](
        default='dec',
        doc='The reference table column for the second spatial coordinate (usually y or dec).'
            ' Units must match column_ref_coord1.',
    )
    column_target_coord1 = pexConfig.Field[str](
        default='coord_ra',
        doc='The target table column for the first spatial coordinate (usually x or ra).'
            ' Units must match column_ref_coord1.',
    )
    column_target_coord2 = pexConfig.Field[str](
        default='coord_dec',
        doc='The target table column for the second spatial coordinate (usually y or dec).'
            ' Units must match column_ref_coord2.',
    )
    coords_spherical = pexConfig.Field[bool](
        default=True,
        doc='Whether column_*_coord[12] are spherical coordinates (ra/dec) or not (pixel x/y).',
    )
    coords_ref_factor = pexConfig.Field[float](
        default=1.0,
        doc='Multiplicative factor for reference catalog coordinates.'
            ' If coords_spherical is true, this must be the number of degrees per unit increment of '
            'column_ref_coord[12]. Otherwise, it must convert the coordinate to the same units'
            ' as the target coordinates.',
    )
    coords_target_factor = pexConfig.Field[float](
        default=1.0,
        doc='Multiplicative factor for target catalog coordinates.'
            ' If coords_spherical is true, this must be the number of degrees per unit increment of '
            'column_target_coord[12]. Otherwise, it must convert the coordinate to the same units'
            ' as the reference coordinates.',
    )
    coords_ref_to_convert = pexConfig.DictField[str, str](
        default=None,
        optional=True,
        dictCheck=lambda x: len(x) == 2,
        doc='Dict mapping sky coordinate columns to be converted to pixel columns.',
    )
    mag_zeropoint_ref = pexConfig.Field[float](
        default=31.4,
        doc='Magnitude zeropoint for reference catalog.',
    )
    # This is a boolean switch; it was previously declared as a float field
    # with a boolean default.
    return_converted_coords = pexConfig.Field[bool](
        default=True,
        doc='Whether to return converted coordinates for matching or only write them.',
    )

    def format_catalogs(
        self,
        catalog_ref: pd.DataFrame,
        catalog_target: pd.DataFrame,
        select_ref: np.array = None,
        select_target: np.array = None,
        radec_to_xy_func: Callable = None,
        **kwargs,
    ):
        """Format matched catalogs that may require coordinate conversions.

        Parameters
        ----------
        catalog_ref : `pandas.DataFrame`
            A reference catalog for comparison to `catalog_target`.
        catalog_target : `pandas.DataFrame`
            A target catalog with measurements for comparison to `catalog_ref`.
        select_ref : `numpy.ndarray`, (Nref,)
            A boolean array of len `catalog_ref`, True for valid match candidates.
        select_target : `numpy.ndarray`, (Ntarget,)
            A boolean array of len `catalog_target`, True for valid match candidates.
        radec_to_xy_func : `typing.Callable`
            Function called as
            ``radec_to_xy_func(coord1, coord2, coords_ref_factor, **kwargs)``
            with the (scaled) reference coordinates. Its return value is
            iterated once per source and each item is indexed with 0 and 1
            to obtain the two converted coordinates. Only required if
            ``coords_ref_to_convert`` is set.
        kwargs
            Additional keyword arguments to pass to radec_to_xy_func.

        Returns
        -------
        compcat_ref, compcat_target : `ComparableCatalog`
            Comparable catalogs corresponding to the input reference and target.

        Raises
        ------
        TypeError
            Raised if ``coords_ref_to_convert`` is set but
            ``radec_to_xy_func`` is not callable.

        Notes
        -----
        Coordinate columns are scaled through ``_mul_column``, which
        multiplies in place. If conversion is requested, the converted
        columns are also written into `catalog_ref`.
        """
        convert_ref = self.coords_ref_to_convert
        if convert_ref and not callable(radec_to_xy_func):
            raise TypeError('radec_to_xy_func must be callable if converting ref coords')

        # Set up objects with frequently-used attributes like selection bool array
        extras_ref, extras_target = (
            CatalogExtras(catalog, select=select, coordinate_factor=coord_factor)
            for catalog, select, coord_factor in zip(
                (catalog_ref, catalog_target),
                (select_ref, select_target),
                (self.coords_ref_factor, self.coords_target_factor),
            )
        )

        compcats = []

        # Retrieve coordinates and multiply them by scaling factors
        for catalog, extras, (column1, column2), convert in (
            (catalog_ref, extras_ref, (self.column_ref_coord1, self.column_ref_coord2), convert_ref),
            (catalog_target, extras_target, (self.column_target_coord1, self.column_target_coord2), False),
        ):
            coord1, coord2 = (
                _mul_column(catalog[column], extras.coordinate_factor)
                for column in (column1, column2)
            )
            if convert:
                # NOTE(review): the coordinates were already scaled above and
                # the factor is passed again here; confirm the expected
                # contract of radec_to_xy_func.
                xy_ref = radec_to_xy_func(coord1, coord2, self.coords_ref_factor, **kwargs)
                for idx_coord, column_out in enumerate(self.coords_ref_to_convert.values()):
                    coord = np.array([xy[idx_coord] for xy in xy_ref])
                    catalog[column_out] = coord
            # This applies to both catalogs: once the reference is converted,
            # both are described by the converted column names.
            if convert_ref:
                column1, column2 = self.coords_ref_to_convert.values()
                if self.return_converted_coords:
                    coord1, coord2 = catalog[column1], catalog[column2]
            # Hand back plain arrays rather than pandas Series
            if isinstance(coord1, pd.Series):
                coord1 = coord1.values
            if isinstance(coord2, pd.Series):
                coord2 = coord2.values

            compcats.append(ComparableCatalog(
                catalog=catalog, column_coord1=column1, column_coord2=column2,
                coord1=coord1, coord2=coord2, extras=extras,
            ))

        return tuple(compcats)

256 

257 

class MatchProbabilisticConfig(pexConfig.Config):
    """Configuration for the MatchProbabilistic matcher."""

    column_ref_order = pexConfig.Field(
        dtype=str,
        default=None,
        optional=True,
        doc='Name of column in reference catalog specifying order for matching.'
            ' Derived from columns_ref_flux if not set.',
    )

    @property
    def columns_in_ref(self) -> Set[str]:
        """The set of all reference catalog columns named by this config."""
        columns_all = [
            self.coord_format.column_ref_coord1,
            self.coord_format.column_ref_coord2,
        ]
        for columns in (
            self.columns_ref_flux,
            self.columns_ref_meas,
            self.columns_ref_select_false,
            self.columns_ref_select_true,
            self.columns_ref_copy,
        ):
            columns_all.extend(columns)
        if self.column_ref_order:
            columns_all.append(self.column_ref_order)

        return set(columns_all)

    @property
    def columns_in_target(self) -> Set[str]:
        """The set of all target catalog columns named by this config."""
        columns_all = [
            self.coord_format.column_target_coord1,
            self.coord_format.column_target_coord2,
        ]
        for columns in (
            self.columns_target_meas,
            self.columns_target_err,
            self.columns_target_select_false,
            self.columns_target_select_true,
            self.columns_target_copy,
        ):
            columns_all.extend(columns)
        return set(columns_all)

    columns_ref_copy = pexConfig.ListField(
        dtype=str,
        default=[],
        listCheck=lambda x: len(set(x)) == len(x),
        optional=True,
        doc='Reference table columns to copy unchanged into both match tables',
    )
    columns_ref_flux = pexConfig.ListField(
        dtype=str,
        default=[],
        optional=True,
        doc="List of reference flux columns to nansum total magnitudes from if column_ref_order is None",
    )
    columns_ref_meas = pexConfig.ListField(
        dtype=str,
        doc='The reference table columns to compute match likelihoods from '
            '(usually centroids and fluxes/magnitudes)',
    )
    columns_ref_select_true = pexConfig.ListField(
        dtype=str,
        default=tuple(),
        doc='Reference table columns to require to be True for selecting sources',
    )
    columns_ref_select_false = pexConfig.ListField(
        dtype=str,
        default=tuple(),
        doc='Reference table columns to require to be False for selecting sources',
    )
    columns_target_copy = pexConfig.ListField(
        dtype=str,
        default=[],
        listCheck=lambda x: len(set(x)) == len(x),
        optional=True,
        doc='Target table columns to copy unchanged into both match tables',
    )
    columns_target_meas = pexConfig.ListField(
        dtype=str,
        doc='Target table columns with measurements corresponding to columns_ref_meas',
    )
    columns_target_err = pexConfig.ListField(
        dtype=str,
        doc='Target table columns with standard errors (sigma) corresponding to columns_ref_meas',
    )
    columns_target_select_true = pexConfig.ListField(
        dtype=str,
        default=('detect_isPrimary',),
        doc='Target table columns to require to be True for selecting sources',
    )
    columns_target_select_false = pexConfig.ListField(
        dtype=str,
        default=('merge_peak_sky',),
        doc='Target table columns to require to be False for selecting sources',
    )
    coord_format = pexConfig.ConfigField(
        dtype=ConvertCatalogCoordinatesConfig,
        doc="Configuration for coordinate conversion",
    )
    mag_brightest_ref = pexConfig.Field(
        dtype=float,
        default=-np.inf,
        doc='Bright magnitude cutoff for selecting reference sources to match.'
            ' Ignored if column_ref_order is None.'
    )
    # np.inf rather than the np.Inf alias, which NumPy 2.0 removed
    mag_faintest_ref = pexConfig.Field(
        dtype=float,
        default=np.inf,
        doc='Faint magnitude cutoff for selecting reference sources to match.'
            ' Ignored if column_ref_order is None.'
    )
    match_dist_max = pexConfig.Field(
        dtype=float,
        default=0.5,
        doc='Maximum match distance. Units must be arcseconds if coords_spherical, '
            'or else match those of column_*_coord[12] multiplied by coords_*_factor.',
    )
    match_n_max = pexConfig.Field(
        dtype=int,
        default=10,
        optional=True,
        doc='Maximum number of spatial matches to consider (in ascending distance order).',
        check=lambda x: x >= 1,
    )
    match_n_finite_min = pexConfig.Field(
        dtype=int,
        default=2,
        optional=True,
        doc='Minimum number of columns with a finite value to measure match likelihood',
    )
    order_ascending = pexConfig.Field(
        dtype=bool,
        default=False,
        optional=True,
        doc='Whether to order reference match candidates in ascending order of column_ref_order '
            '(should be False if the column is a flux and True if it is a magnitude).',
    )

    def validate(self):
        """Check that the measurement/error column lists are consistent.

        Raises
        ------
        ValueError
            Raised if the numbers of target measurement or error columns
            differ from the number of reference measurement columns, or if
            there are fewer reference measurement columns than
            ``match_n_finite_min``. All problems found are reported
            together, one per line.
        """
        super().validate()
        n_ref_meas = len(self.columns_ref_meas)
        n_target_meas = len(self.columns_target_meas)
        n_target_err = len(self.columns_target_err)
        match_n_finite_min = self.match_n_finite_min
        errors = []
        if n_target_meas != n_ref_meas:
            errors.append(f"{len(self.columns_target_meas)=} != {len(self.columns_ref_meas)=}")
        if n_target_err != n_ref_meas:
            errors.append(f"{len(self.columns_target_err)=} != {len(self.columns_ref_meas)=}")
        if not (n_ref_meas >= match_n_finite_min):
            errors.append(
                f"{len(self.columns_ref_meas)=} !>= {self.match_n_finite_min=}, no matches possible"
            )
        if errors:
            raise ValueError("\n".join(errors))

417 

418 

def default_value(dtype):
    """Return the fill value for entries of a copied column with no match.

    Parameters
    ----------
    dtype
        The type to return a fill value for. It is compared by equality
        against `str`, `numpy.signedinteger` and `numpy.unsignedinteger`.

    Returns
    -------
    value
        An empty string for `str`, positive infinity for
        `numpy.signedinteger`, negative infinity for
        `numpy.unsignedinteger`, and None for anything else.
    """
    # NOTE(review): the integer comparisons are against numpy's abstract
    # classes; whether a concrete dtype such as int64 compares equal to them
    # depends on the numpy version - confirm the intended behaviour.
    if dtype == str:
        return ''
    elif dtype == np.signedinteger:
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed
        return np.inf
    elif dtype == np.unsignedinteger:
        return -np.inf
    return None

427 

428 

class MatcherProbabilistic:
    """A probabilistic, greedy catalog matcher.

    Parameters
    ----------
    config: `MatchProbabilisticConfig`
        A configuration instance.
    """

    config: MatchProbabilisticConfig

    def __init__(
        self,
        config: MatchProbabilisticConfig,
    ):
        self.config = config

    def match(
        self,
        catalog_ref: pd.DataFrame,
        catalog_target: pd.DataFrame,
        select_ref: np.array = None,
        select_target: np.array = None,
        logger: logging.Logger = None,
        logging_n_rows: int = None,
        **kwargs
    ):
        """Match catalogs.

        Parameters
        ----------
        catalog_ref : `pandas.DataFrame`
            A reference catalog to match in order of a given column (i.e. greedily).
        catalog_target : `pandas.DataFrame`
            A target catalog for matching sources from `catalog_ref`. Must contain measurements with errors.
        select_ref : `numpy.array`
            A boolean array of the same length as `catalog_ref` selecting the sources that can be matched.
        select_target : `numpy.array`
            A boolean array of the same length as `catalog_target` selecting the sources that can be matched.
        logger : `logging.Logger`
            A Logger for logging. The module-level logger is used if None.
        logging_n_rows : `int`
            The number of sources to match before printing a log message.
        kwargs
            Additional keyword arguments to pass to `format_catalogs`.

        Returns
        -------
        catalog_out_ref : `pandas.DataFrame`
            A catalog of identical length to `catalog_ref`, containing match information for rows selected by
            `select_ref` (including the matching row index in `catalog_target`).
        catalog_out_target : `pandas.DataFrame`
            A catalog of identical length to `catalog_target`, containing the indices of matching rows in
            `catalog_ref`.
        exceptions : `dict` [`int`, `Exception`]
            A dictionary keyed by `catalog_ref` row number of the exception caught when disambiguating
            the candidates for that row. Such rows are left unmatched.

        Notes
        -----
        Rows without a match have ``match_row`` set to the most negative
        64-bit integer, so ``match_row >= 0`` selects matched rows.
        """
        if logger is None:
            logger = logger_default

        t_init = time.process_time()
        config = self.config

        # Transform any coordinates, if required
        # Note: The returned objects contain the original catalogs, as well as
        # transformed coordinates, and the selection of sources for matching.
        # These might be identical to the arrays passed as kwargs, but that
        # depends on config settings.
        # For the rest of this function, the selection arrays will be used,
        # but the indices of the original, unfiltered catalog will also be
        # output, so some further indexing steps are needed.
        ref, target = config.coord_format.format_catalogs(
            catalog_ref=catalog_ref, catalog_target=catalog_target,
            select_ref=select_ref, select_target=select_target,
            **kwargs
        )

        # If no order is specified, take nansum of all flux columns for a 'total flux'
        # Note: it won't actually be a total flux if bands overlap significantly
        # (or it might define a filter with >100% efficiency
        # Also, this is done on the original dataframe as it's harder to accomplish
        # just with a recarray
        column_order = (
            catalog_ref.loc[ref.extras.select, config.column_ref_order]
            if config.column_ref_order is not None else
            np.nansum(catalog_ref.loc[ref.extras.select, config.columns_ref_flux], axis=1)
        )
        order = np.argsort(column_order if config.order_ascending else -column_order)

        n_ref_select = len(ref.extras.indices)

        coords_spherical = config.coord_format.coords_spherical
        coords_ref, coords_target = (
            (cat.coord1[cat.extras.select], cat.coord2[cat.extras.select])
            for cat in (ref, target)
        )

        # Generate K-d tree to compute distances
        logger.info('Generating cKDTree with match_n_max=%d', config.match_n_max)

        if coords_spherical:
            # Configured in arcseconds; the coordinates are in degrees
            match_dist_max = config.match_dist_max/3600.
            with Matcher(coords_target[0], coords_target[1]) as matcher:
                idxs_target_select = matcher.query_knn(
                    coords_ref[0], coords_ref[1],
                    distance_upper_bound=match_dist_max,
                    k=config.match_n_max,
                )
        # Call scipy for non-spherical case
        else:
            # For non-spherical coordinates the maximum distance is already in
            # the (scaled) coordinate units, so no angular conversion applies.
            match_dist_max = config.match_dist_max
            # cKDTree expects one row per point, i.e. arrays of shape (N, 2)
            vec_ref, vec_target = (
                np.column_stack((coords[0], coords[1]))
                for coords in (coords_ref, coords_target)
            )
            tree_obj = cKDTree(vec_target)
            _, idxs_target_select = tree_obj.query(
                vec_ref,
                distance_upper_bound=match_dist_max,
                k=config.match_n_max,
            )

        n_target_select = len(target.extras.indices)
        n_matches = np.sum(idxs_target_select != n_target_select, axis=1)
        n_matched_max = np.sum(n_matches == config.match_n_max)
        if n_matched_max > 0:
            logger.warning(
                '%d/%d (%.2f%%) selected true objects have n_matches=n_match_max(%d)',
                n_matched_max, n_ref_select, 100.*n_matched_max/n_ref_select, config.match_n_max
            )

        # Pre-allocate outputs
        # Integer arrays cannot hold NaN and casting NaN to an integer is
        # platform-dependent, so use an explicit negative sentinel for
        # "no match" (rows are later selected with ``>= 0``).
        row_unmatched = np.iinfo(np.int64).min
        target_row_match = np.full(target.extras.n, row_unmatched, dtype=np.int64)
        ref_candidate_match = np.zeros(ref.extras.n, dtype=bool)
        ref_row_match = np.full(ref.extras.n, row_unmatched, dtype=np.int64)
        ref_match_count = np.zeros(ref.extras.n, dtype=np.int32)
        ref_match_meas_finite = np.zeros(ref.extras.n, dtype=np.int32)
        ref_chisq = np.full(ref.extras.n, np.nan, dtype=float)

        # Need the original reference row indices for output
        idx_orig_ref, idx_orig_target = (np.argwhere(cat.extras.select)[:, 0] for cat in (ref, target))

        # Retrieve required columns, including any converted ones (default to original column name)
        columns_convert = config.coord_format.coords_ref_to_convert
        if columns_convert is None:
            columns_convert = {}
        data_ref = ref.catalog[
            [columns_convert.get(column, column) for column in config.columns_ref_meas]
        ].iloc[ref.extras.indices[order]]
        data_target = target.catalog[config.columns_target_meas][target.extras.select]
        errors_target = target.catalog[config.columns_target_err][target.extras.select]

        exceptions = {}
        # The kdTree uses len(inputs) as a sentinel value for no match
        matched_target = {n_target_select, }
        index_ref = idx_orig_ref[order]
        # Fill in the candidate column
        ref_candidate_match[index_ref] = True

        # Count this as the time when disambiguation begins
        t_begin = time.process_time()

        # Exclude unmatched sources
        matched_ref = idxs_target_select[order, 0] != n_target_select
        order = order[matched_ref]
        # Keep the reference measurements aligned with the filtered ``order``
        # so that row ``index_n`` below is the source being processed
        data_ref = data_ref.iloc[matched_ref]
        idx_first = idxs_target_select[order, 0]
        chi_0 = (data_target.iloc[idx_first].values - data_ref.values)/(
            errors_target.iloc[idx_first].values)
        chi_finite_0 = np.isfinite(chi_0)
        n_finite_0 = np.sum(chi_finite_0, axis=1)
        chi_0[~chi_finite_0] = 0
        chisq_sum_0 = np.sum(chi_0*chi_0, axis=1)

        logger.info('Disambiguating %d/%d matches/targets', len(order), len(ref.catalog))
        for index_n, index_row_select in enumerate(order):
            index_row = idx_orig_ref[index_row_select]
            found = idxs_target_select[index_row_select, :]
            # Unambiguous match, short-circuit some evaluations
            if (found[1] == n_target_select) and (found[0] not in matched_target):
                n_finite = n_finite_0[index_n]
                if not (n_finite >= config.match_n_finite_min):
                    continue
                idx_chisq_min = 0
                n_matched = 1
                chisq_sum = chisq_sum_0[index_n]
            else:
                # Select match candidates from nearby sources not already matched
                # Note: set lookup is apparently fast enough that this is a few percent faster than:
                # found = [x for x in found[found != n_target_select] if x not in matched_target]
                # ... at least for ~1M sources
                found = [x for x in found if x not in matched_target]
                n_found = len(found)
                if n_found == 0:
                    continue
                # This is an ndarray of n_found rows x len(data_ref/target) columns
                chi = (
                    (data_target.iloc[found].values - data_ref.iloc[index_n].values)
                    / errors_target.iloc[found].values
                )
                finite = np.isfinite(chi)
                n_finite = np.sum(finite, axis=1)
                # Require some number of finite chi_sq to match
                chisq_good = n_finite >= config.match_n_finite_min
                if not any(chisq_good):
                    continue
                try:
                    # Non-finite terms contribute nothing, as in the
                    # unambiguous case above
                    chi[~finite] = 0
                    chisq_sums = np.sum(chi*chi, axis=1)
                    # Only candidates with enough finite measurements may be
                    # chosen; compare them by chisq per finite measurement
                    idx_good = np.flatnonzero(chisq_good)
                    idx_chisq_min = idx_good[
                        np.nanargmin(chisq_sums[idx_good] / n_finite[idx_good])
                    ]
                    n_finite = n_finite[idx_chisq_min]
                    n_matched = len(chisq_good)
                    chisq_sum = chisq_sums[idx_chisq_min]
                except Exception as error:
                    # Can't foresee any exceptions, but they shouldn't prevent
                    # matching subsequent sources
                    exceptions[index_row] = error
                    # Do not record a match from stale or undefined values
                    continue
            ref_match_meas_finite[index_row] = n_finite
            ref_match_count[index_row] = n_matched
            ref_chisq[index_row] = chisq_sum
            idx_match_select = found[idx_chisq_min]
            row_target = target.extras.indices[idx_match_select]
            ref_row_match[index_row] = row_target

            target_row_match[row_target] = index_row
            matched_target.add(idx_match_select)

            if logging_n_rows and ((index_n + 1) % logging_n_rows == 0):
                t_elapsed = time.process_time() - t_begin
                logger.info(
                    'Processed %d/%d in %.2fs at sort value=%.3f',
                    index_n + 1, n_ref_select, t_elapsed, column_order[order[index_n]],
                )

        data_ref = {
            'match_candidate': ref_candidate_match,
            'match_row': ref_row_match,
            'match_count': ref_match_count,
            'match_chisq': ref_chisq,
            'match_n_chisq_finite': ref_match_meas_finite,
        }
        data_target = {
            'match_candidate': target.extras.select if target.extras.select is not None else (
                np.ones(target.extras.n, dtype=bool)),
            'match_row': target_row_match,
        }

        # Copy the requested columns into the catalog they came from, and
        # (as ``match_<column>``) into the rows of the other catalog that
        # they were matched to
        for (columns, out_original, out_matched, in_original, in_matched, matches, name_cat) in (
            (
                self.config.columns_ref_copy,
                data_ref,
                data_target,
                ref,
                target,
                target_row_match,
                'target',
            ),
            (
                self.config.columns_target_copy,
                data_target,
                data_ref,
                target,
                ref,
                ref_row_match,
                'reference',
            ),
        ):
            matched = matches >= 0
            idx_matched = matches[matched]
            logger.info('Matched %d/%d %s sources', np.sum(matched), len(matched), name_cat)

            for column in columns:
                values = in_original.catalog[column]
                out_original[column] = values
                dtype = in_original.catalog[column].dtype

                # Pandas object columns can have mixed types - check for that
                if dtype == object:
                    types = list(set((type(x) for x in values)))
                    if len(types) != 1:
                        raise RuntimeError(f'Column {column} dtype={dtype} has multiple types={types}')
                    dtype = types[0]

                value_fill = default_value(dtype)

                # Without this, the dtype would be '<U1' for an empty Unicode string
                if dtype == str:
                    dtype = f'<U{max(len(x) for x in values)}'

                column_match = np.full(in_matched.extras.n, value_fill, dtype=dtype)
                column_match[matched] = in_original.catalog[column][idx_matched]
                out_matched[f'match_{column}'] = column_match

        logger.info(
            'Completed match disambiguating in %.2fs (total %.2fs)',
            time.process_time() - t_begin,
            time.process_time() - t_init,
        )

        catalog_out_ref = pd.DataFrame(data_ref)
        catalog_out_target = pd.DataFrame(data_target)

        return catalog_out_ref, catalog_out_target, exceptions