448 catalog_ref: pd.DataFrame,
449 catalog_target: pd.DataFrame,
450 select_ref: np.array =
None,
451 select_target: np.array =
None,
452 logger: logging.Logger =
None,
453 logging_n_rows: int =
None,
460 catalog_ref : `pandas.DataFrame`
461 A reference catalog to match in order of a given column (i.e. greedily).
462 catalog_target : `pandas.DataFrame`
463 A target catalog for matching sources from `catalog_ref`. Must contain measurements with errors.
464 select_ref : `numpy.array`
465 A boolean array of the same length as `catalog_ref` selecting the sources that can be matched.
466 select_target : `numpy.array`
467 A boolean array of the same length as `catalog_target` selecting the sources that can be matched.
468 logger : `logging.Logger`
469 A Logger for logging.
470 logging_n_rows : `int`
471 The number of sources to match before printing a log message.
473 Additional keyword arguments to pass to `format_catalogs`.
477 catalog_out_ref : `pandas.DataFrame`
478 A catalog of identical length to `catalog_ref`, containing match information for rows selected by
479 `select_ref` (including the matching row index in `catalog_target`).
480 catalog_out_target : `pandas.DataFrame`
481 A catalog of identical length to `catalog_target`, containing the indices of matching rows in
483 exceptions : `dict` [`int`, `Exception`]
484 A dictionary keyed by `catalog_ref` row number of the first exception caught when matching.
487 logger = logger_default
489 t_init = time.process_time()
500 ref, target = config.coord_format.format_catalogs(
501 catalog_ref=catalog_ref, catalog_target=catalog_target,
502 select_ref=select_ref, select_target=select_target,
512 catalog_ref.loc[ref.extras.select, config.column_ref_order]
513 if config.column_ref_order
is not None else
514 np.nansum(catalog_ref.loc[ref.extras.select, config.columns_ref_flux], axis=1)
516 order = np.argsort(column_order
if config.order_ascending
else -column_order)
518 n_ref_select = len(ref.extras.indices)
520 coords_spherical = config.coord_format.coords_spherical
521 coords_ref, coords_target = (
522 (cat.coord1[cat.extras.select], cat.coord2[cat.extras.select])
523 for cat
in (ref, target)
527 logger.info(
'Generating cKDTree with match_n_max=%d', config.match_n_max)
530 match_dist_max = config.match_dist_max/3600.
531 with Matcher(coords_target[0], coords_target[1])
as matcher:
532 idxs_target_select = matcher.query_knn(
533 coords_ref[0], coords_ref[1],
534 distance_upper_bound=match_dist_max,
535 k=config.match_n_max,
540 match_dist_max = np.radians(config.match_dist_max/3600.)
542 func_convert = _radec_to_xyz
if coords_spherical
else np.vstack
543 vec_ref, vec_target = (
544 func_convert(coords[0], coords[1])
545 for coords
in (coords_ref, coords_target)
547 tree_obj = cKDTree(vec_target)
548 _, idxs_target_select = tree_obj.query(
550 distance_upper_bound=match_dist_max,
551 k=config.match_n_max,
554 n_target_select = len(target.extras.indices)
555 n_matches = np.sum(idxs_target_select != n_target_select, axis=1)
556 n_matched_max = np.sum(n_matches == config.match_n_max)
557 if n_matched_max > 0:
559 '%d/%d (%.2f%%) selected true objects have n_matches=n_match_max(%d)',
560 n_matched_max, n_ref_select, 100.*n_matched_max/n_ref_select, config.match_n_max
564 target_row_match = np.full(target.extras.n, np.nan, dtype=np.int64)
565 ref_candidate_match = np.zeros(ref.extras.n, dtype=bool)
566 ref_row_match = np.full(ref.extras.n, np.nan, dtype=np.int64)
567 ref_match_count = np.zeros(ref.extras.n, dtype=np.int32)
568 ref_match_meas_finite = np.zeros(ref.extras.n, dtype=np.int32)
569 ref_chisq = np.full(ref.extras.n, np.nan, dtype=float)
572 idx_orig_ref, idx_orig_target = (np.argwhere(cat.extras.select)[:, 0]
for cat
in (ref, target))
575 columns_convert = config.coord_format.coords_ref_to_convert
576 if columns_convert
is None:
578 data_ref = ref.catalog[
579 [columns_convert.get(column, column)
for column
in config.columns_ref_meas]
580 ].iloc[ref.extras.indices[order]]
581 data_target = target.catalog[config.columns_target_meas][target.extras.select]
582 errors_target = target.catalog[config.columns_target_err][target.extras.select]
586 matched_target = {n_target_select, }
587 index_ref = idx_orig_ref[order]
589 ref_candidate_match[index_ref] =
True
592 t_begin = time.process_time()
595 matched_ref = idxs_target_select[order, 0] != n_target_select
596 order = order[matched_ref]
597 idx_first = idxs_target_select[order, 0]
598 chi_0 = (data_target.iloc[idx_first].values - data_ref.iloc[matched_ref].values)/(
599 errors_target.iloc[idx_first].values)
600 chi_finite_0 = np.isfinite(chi_0)
601 n_finite_0 = np.sum(chi_finite_0, axis=1)
602 chi_0[~chi_finite_0] = 0
603 chisq_sum_0 = np.sum(chi_0*chi_0, axis=1)
605 logger.info(
'Disambiguating %d/%d matches/targets', len(order), len(ref.catalog))
606 for index_n, index_row_select
in enumerate(order):
607 index_row = idx_orig_ref[index_row_select]
608 found = idxs_target_select[index_row_select, :]
610 if (found[1] == n_target_select)
and (found[0]
not in matched_target):
611 n_finite = n_finite_0[index_n]
612 if not (n_finite >= config.match_n_finite_min):
616 chisq_sum = chisq_sum_0[index_n]
622 found = [x
for x
in found
if x
not in matched_target]
628 (data_target.iloc[found].values - data_ref.iloc[index_n].values)
629 / errors_target.iloc[found].values
631 finite = np.isfinite(chi)
632 n_finite = np.sum(finite, axis=1)
634 chisq_good = n_finite >= config.match_n_finite_min
635 if not any(chisq_good):
638 chisq_sum = np.zeros(n_found, dtype=float)
639 chisq_sum[chisq_good] = np.nansum(chi[chisq_good, :] ** 2, axis=1)
640 idx_chisq_min = np.nanargmin(chisq_sum / n_finite)
641 n_finite = n_finite[idx_chisq_min]
642 n_matched = len(chisq_good)
643 chisq_sum = chisq_sum[idx_chisq_min]
644 except Exception
as error:
647 exceptions[index_row] = error
648 ref_match_meas_finite[index_row] = n_finite
649 ref_match_count[index_row] = n_matched
650 ref_chisq[index_row] = chisq_sum
651 idx_match_select = found[idx_chisq_min]
652 row_target = target.extras.indices[idx_match_select]
653 ref_row_match[index_row] = row_target
655 target_row_match[row_target] = index_row
656 matched_target.add(idx_match_select)
658 if logging_n_rows
and ((index_n + 1) % logging_n_rows == 0):
659 t_elapsed = time.process_time() - t_begin
661 'Processed %d/%d in %.2fs at sort value=%.3f',
662 index_n + 1, n_ref_select, t_elapsed, column_order[order[index_n]],
666 'match_candidate': ref_candidate_match,
667 'match_row': ref_row_match,
668 'match_count': ref_match_count,
669 'match_chisq': ref_chisq,
670 'match_n_chisq_finite': ref_match_meas_finite,
673 'match_candidate': target.extras.select
if target.extras.select
is not None else (
674 np.ones(target.extras.n, dtype=bool)),
675 'match_row': target_row_match,
678 for (columns, out_original, out_matched, in_original, in_matched, matches, name_cat)
in (
698 matched = matches >= 0
699 idx_matched = matches[matched]
700 logger.info(
'Matched %d/%d %s sources', np.sum(matched), len(matched), name_cat)
702 for column
in columns:
703 values = in_original.catalog[column]
704 out_original[column] = values
705 dtype = in_original.catalog[column].dtype
709 types = list(set((type(x)
for x
in values)))
711 raise RuntimeError(f
'Column {column} dtype={dtype} has multiple types={types}')
718 dtype = f
'<U{max(len(x) for x in values)}'
720 column_match = np.full(in_matched.extras.n, value_fill, dtype=dtype)
721 column_match[matched] = in_original.catalog[column][idx_matched]
722 out_matched[f
'match_{column}'] = column_match
725 'Completed match disambiguating in %.2fs (total %.2fs)',
726 time.process_time() - t_begin,
727 time.process_time() - t_init,
730 catalog_out_ref = pd.DataFrame(data_ref)
731 catalog_out_target = pd.DataFrame(data_target)
733 return catalog_out_ref, catalog_out_target, exceptions