# This file is part of meas_astrom.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

__all__ = ['ConvertCatalogCoordinatesConfig', 'MatchProbabilisticConfig', 'MatcherProbabilistic']

import lsst.pex.config as pexConfig

from dataclasses import dataclass
import logging
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from smatch.matcher import Matcher
import time
from typing import Callable, Set

logger_default = logging.getLogger(__name__)


def _mul_column(column: np.array, value: float):
    if value is not None and value != 1:
        column *= value
    return column
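

# Note: the multiplication above is in-place, so an ndarray or pandas Series
# argument is mutated as well as returned; e.g. (illustrative)
# _mul_column(np.array([1.0, 2.0]), 2.0) returns array([2., 4.]) and leaves
# the input holding the same values.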


def _radec_to_xyz(ra, dec):
    """Convert input ra/dec coordinates to spherical unit vectors.

    Parameters
    ----------
    ra, dec : `numpy.ndarray`
        Arrays of right ascension/declination in degrees.

    Returns
    -------
    vectors : `numpy.ndarray`, (N, 3)
        Output unit vectors.
    """
    if ra.size != dec.size:
        raise ValueError('ra and dec must be same size')
    ras = np.radians(ra)
    decs = np.radians(dec)
    vectors = np.empty((ras.size, 3))

    sin_dec = np.sin(np.pi / 2 - decs)
    vectors[:, 0] = sin_dec * np.cos(ras)
    vectors[:, 1] = sin_dec * np.sin(ras)
    vectors[:, 2] = np.cos(np.pi / 2 - decs)

    return vectors
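

# Quick sanity checks (illustrative, not part of the module):
# _radec_to_xyz(np.array([0.]), np.array([0.])) -> array([[1., 0., 0.]])
# _radec_to_xyz(np.array([0.]), np.array([90.])) -> array([[0., 0., 1.]])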


@dataclass
class CatalogExtras:
    """Store frequently-referenced (meta)data relevant for matching a catalog.

    Parameters
    ----------
    catalog : `pandas.DataFrame`
        A pandas catalog to store extra information for.
    select : `numpy.array`
        A numpy boolean array of the same length as catalog to be used for
        target selection.
    coordinate_factor : `float`
        A multiplicative factor to apply to the catalog's coordinate columns.
    """

    n: int
    indices: np.array
    select: np.array

    coordinate_factor: float = None

    def __init__(self, catalog: pd.DataFrame, select: np.array = None, coordinate_factor: float = None):
        self.n = len(catalog)
        self.select = np.ones(self.n, dtype=bool) if select is None else select
        self.indices = np.flatnonzero(select) if select is not None else np.arange(self.n)
        self.coordinate_factor = coordinate_factor


@dataclass(frozen=True)
class ComparableCatalog:
    """A catalog of sources with coordinate columns in some standard format/units.

    Parameters
    ----------
    catalog : `pandas.DataFrame`
        A catalog with comparable coordinate columns.
    column_coord1 : `str`
        The first spatial coordinate column name.
    column_coord2 : `str`
        The second spatial coordinate column name.
    coord1 : `numpy.array`
        The first spatial coordinate values.
    coord2 : `numpy.array`
        The second spatial coordinate values.
    extras : `CatalogExtras`
        Extra cached (meta)data for the `catalog`.
    """

    catalog: pd.DataFrame
    column_coord1: str
    column_coord2: str
    coord1: np.array
    coord2: np.array
    extras: CatalogExtras


class ConvertCatalogCoordinatesConfig(pexConfig.Config):
    """Configuration for converting catalog coordinates for matching."""

    column_ref_coord1 = pexConfig.Field[str](
        default='ra',
        doc='The reference table column for the first spatial coordinate (usually x or ra).',
    )
    column_ref_coord2 = pexConfig.Field[str](
        default='dec',
        doc='The reference table column for the second spatial coordinate (usually y or dec).'
            ' Units must match column_ref_coord1.',
    )
    column_target_coord1 = pexConfig.Field[str](
        default='coord_ra',
        doc='The target table column for the first spatial coordinate (usually x or ra).'
            ' Units must match column_ref_coord1.',
    )
    column_target_coord2 = pexConfig.Field[str](
        default='coord_dec',
        doc='The target table column for the second spatial coordinate (usually y or dec).'
            ' Units must match column_ref_coord2.',
    )
    coords_spherical = pexConfig.Field[bool](
        default=True,
        doc='Whether column_*_coord[12] are spherical coordinates (ra/dec) or not (pixel x/y).',
    )
    coords_ref_factor = pexConfig.Field[float](
        default=1.0,
        doc='Multiplicative factor for reference catalog coordinates.'
            ' If coords_spherical is true, this must be the number of degrees per unit increment of'
            ' column_ref_coord[12]. Otherwise, it must convert the coordinate to the same units'
            ' as the target coordinates.',
    )
    coords_target_factor = pexConfig.Field[float](
        default=1.0,
        doc='Multiplicative factor for target catalog coordinates.'
            ' If coords_spherical is true, this must be the number of degrees per unit increment of'
            ' column_target_coord[12]. Otherwise, it must convert the coordinate to the same units'
            ' as the reference coordinates.',
    )
    coords_ref_to_convert = pexConfig.DictField[str, str](
        default=None,
        optional=True,
        dictCheck=lambda x: len(x) == 2,
        doc='Dict mapping sky coordinate columns to be converted to pixel columns.',
    )
    mag_zeropoint_ref = pexConfig.Field[float](
        default=31.4,
        doc='Magnitude zeropoint for reference catalog.',
    )
    return_converted_coords = pexConfig.Field[bool](
        default=True,
        doc='Whether to return converted coordinates for matching or only write them.',
    )

    def format_catalogs(
        self,
        catalog_ref: pd.DataFrame,
        catalog_target: pd.DataFrame,
        select_ref: np.array = None,
        select_target: np.array = None,
        radec_to_xy_func: Callable = None,
        **kwargs,
    ):
        """Format matched catalogs that may require coordinate conversions.

        Parameters
        ----------
        catalog_ref : `pandas.DataFrame`
            A reference catalog for comparison to `catalog_target`.
        catalog_target : `pandas.DataFrame`
            A target catalog with measurements for comparison to `catalog_ref`.
        select_ref : `numpy.ndarray`, (Nref,)
            A boolean array of len `catalog_ref`, True for valid match candidates.
        select_target : `numpy.ndarray`, (Ntarget,)
            A boolean array of len `catalog_target`, True for valid match candidates.
        radec_to_xy_func : `typing.Callable`
            Function taking equal-length ra, dec arrays (plus any additional
            keyword arguments) and returning an ndarray of (x, y) pairs.
        kwargs
            Additional keyword arguments to pass to radec_to_xy_func.

        Returns
        -------
        compcat_ref, compcat_target : `ComparableCatalog`
            Comparable catalogs corresponding to the input reference and target.
        """
        convert_ref = self.coords_ref_to_convert
        if convert_ref and not callable(radec_to_xy_func):
            raise TypeError('radec_to_xy_func must be callable if converting ref coords')

        # Set up objects with frequently-used attributes like selection bool array
        extras_ref, extras_target = (
            CatalogExtras(catalog, select=select, coordinate_factor=coord_factor)
            for catalog, select, coord_factor in zip(
                (catalog_ref, catalog_target),
                (select_ref, select_target),
                (self.coords_ref_factor, self.coords_target_factor),
            )
        )

        compcats = []

        # Retrieve coordinates and multiply them by scaling factors
        for catalog, extras, (column1, column2), convert in (
            (catalog_ref, extras_ref, (self.column_ref_coord1, self.column_ref_coord2), convert_ref),
            (catalog_target, extras_target, (self.column_target_coord1, self.column_target_coord2), False),
        ):
            coord1, coord2 = (
                _mul_column(catalog[column], extras.coordinate_factor)
                for column in (column1, column2)
            )
            if convert:
                xy_ref = radec_to_xy_func(coord1, coord2, self.coords_ref_factor, **kwargs)
                for idx_coord, column_out in enumerate(self.coords_ref_to_convert.values()):
                    coord = np.array([xy[idx_coord] for xy in xy_ref])
                    catalog[column_out] = coord
                if convert_ref:
                    column1, column2 = self.coords_ref_to_convert.values()
                    if self.return_converted_coords:
                        coord1, coord2 = catalog[column1], catalog[column2]
            if isinstance(coord1, pd.Series):
                coord1 = coord1.values
            if isinstance(coord2, pd.Series):
                coord2 = coord2.values

            compcats.append(ComparableCatalog(
                catalog=catalog, column_coord1=column1, column_coord2=column2,
                coord1=coord1, coord2=coord2, extras=extras,
            ))

        return tuple(compcats)
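

# Illustrative usage sketch for format_catalogs (df_ref/df_target are
# hypothetical pandas DataFrames with 'ra'/'dec' and 'coord_ra'/'coord_dec'
# columns):
#
#   config = ConvertCatalogCoordinatesConfig()
#   compcat_ref, compcat_target = config.format_catalogs(
#       catalog_ref=df_ref, catalog_target=df_target)
#   # compcat_ref.coord1/coord2 are the scaled coordinate arrays used for
#   # matching.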


class MatchProbabilisticConfig(pexConfig.Config):
    """Configuration for the MatchProbabilistic matcher."""

    column_ref_order = pexConfig.Field(
        dtype=str,
        default=None,
        optional=True,
        doc='Name of column in reference catalog specifying order for matching.'
            ' Derived from columns_ref_flux if not set.',
    )

    @property
    def columns_in_ref(self) -> Set[str]:
        columns_all = [
            self.coord_format.column_ref_coord1,
            self.coord_format.column_ref_coord2,
        ]
        for columns in (
            self.columns_ref_flux,
            self.columns_ref_meas,
            self.columns_ref_select_false,
            self.columns_ref_select_true,
            self.columns_ref_copy,
        ):
            columns_all.extend(columns)
        if self.column_ref_order:
            columns_all.append(self.column_ref_order)

        return set(columns_all)

    @property
    def columns_in_target(self) -> Set[str]:
        columns_all = [
            self.coord_format.column_target_coord1,
            self.coord_format.column_target_coord2,
        ]
        for columns in (
            self.columns_target_meas,
            self.columns_target_err,
            self.columns_target_select_false,
            self.columns_target_select_true,
            self.columns_target_copy,
        ):
            columns_all.extend(columns)
        return set(columns_all)

    columns_ref_copy = pexConfig.ListField(
        dtype=str,
        default=[],
        listCheck=lambda x: len(set(x)) == len(x),
        optional=True,
        doc='Reference table columns to copy unchanged into both match tables',
    )
    columns_ref_flux = pexConfig.ListField(
        dtype=str,
        default=[],
        optional=True,
        doc='List of reference flux columns to nansum into a total flux for ordering,'
            ' used only if column_ref_order is None',
    )
    columns_ref_meas = pexConfig.ListField(
        dtype=str,
        doc='The reference table columns to compute match likelihoods from'
            ' (usually centroids and fluxes/magnitudes)',
    )
    columns_ref_select_true = pexConfig.ListField(
        dtype=str,
        default=tuple(),
        doc='Reference table columns to require to be True for selecting sources',
    )
    columns_ref_select_false = pexConfig.ListField(
        dtype=str,
        default=tuple(),
        doc='Reference table columns to require to be False for selecting sources',
    )
    columns_target_copy = pexConfig.ListField(
        dtype=str,
        default=[],
        listCheck=lambda x: len(set(x)) == len(x),
        optional=True,
        doc='Target table columns to copy unchanged into both match tables',
    )
    columns_target_meas = pexConfig.ListField(
        dtype=str,
        doc='Target table columns with measurements corresponding to columns_ref_meas',
    )
    columns_target_err = pexConfig.ListField(
        dtype=str,
        doc='Target table columns with standard errors (sigma) corresponding to columns_ref_meas',
    )
    columns_target_select_true = pexConfig.ListField(
        dtype=str,
        default=('detect_isPrimary',),
        doc='Target table columns to require to be True for selecting sources',
    )
    columns_target_select_false = pexConfig.ListField(
        dtype=str,
        default=('merge_peak_sky',),
        doc='Target table columns to require to be False for selecting sources',
    )
    coord_format = pexConfig.ConfigField(
        dtype=ConvertCatalogCoordinatesConfig,
        doc="Configuration for coordinate conversion",
    )
    mag_brightest_ref = pexConfig.Field(
        dtype=float,
        default=-np.inf,
        doc='Bright magnitude cutoff for selecting reference sources to match.'
            ' Ignored if column_ref_order is None.',
    )
    mag_faintest_ref = pexConfig.Field(
        dtype=float,
        default=np.inf,
        doc='Faint magnitude cutoff for selecting reference sources to match.'
            ' Ignored if column_ref_order is None.',
    )
    match_dist_max = pexConfig.Field(
        dtype=float,
        default=0.5,
        doc='Maximum match distance. Units must be arcseconds if coords_spherical,'
            ' or else match those of column_*_coord[12] multiplied by coords_*_factor.',
    )
    match_n_max = pexConfig.Field(
        dtype=int,
        default=10,
        optional=True,
        doc='Maximum number of spatial matches to consider (in ascending distance order).',
        check=lambda x: x >= 1,
    )
    match_n_finite_min = pexConfig.Field(
        dtype=int,
        default=2,
        optional=True,
        doc='Minimum number of columns with a finite value to measure match likelihood',
    )
    order_ascending = pexConfig.Field(
        dtype=bool,
        default=False,
        optional=True,
        doc='Whether to order reference match candidates in ascending order of column_ref_order'
            ' (should be False if the column is a flux and True if it is a magnitude).',
    )

    def validate(self):
        super().validate()
        n_ref_meas = len(self.columns_ref_meas)
        n_target_meas = len(self.columns_target_meas)
        n_target_err = len(self.columns_target_err)
        match_n_finite_min = self.match_n_finite_min
        errors = []
        if n_target_meas != n_ref_meas:
            errors.append(f"{len(self.columns_target_meas)=} != {len(self.columns_ref_meas)=}")
        if n_target_err != n_ref_meas:
            errors.append(f"{len(self.columns_target_err)=} != {len(self.columns_ref_meas)=}")
        if not (n_ref_meas >= match_n_finite_min):
            errors.append(
                f"{len(self.columns_ref_meas)=} !>= {self.match_n_finite_min=}, no matches possible"
            )
        if errors:
            raise ValueError("\n".join(errors))
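

# For example (illustrative), a config listing two reference measurement
# columns but only one target error column fails validation:
#
#   config = MatchProbabilisticConfig(
#       columns_ref_meas=['ra', 'dec'],
#       columns_target_meas=['coord_ra', 'coord_dec'],
#       columns_target_err=['coord_ra_err'],
#   )
#   config.validate()  # raises ValueError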


def default_value(dtype):
    if dtype == str:
        return ''
    elif dtype == np.signedinteger:
        return np.inf
    elif dtype == np.unsignedinteger:
        return -np.inf
    return None
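

# These are sentinel fill values for unmatched rows when copying columns into
# the output match tables below: '' for strings, +/-inf for integer dtypes,
# and None otherwise (which numpy fills as NaN in float arrays).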


class MatcherProbabilistic:
    """A probabilistic, greedy catalog matcher.

    Parameters
    ----------
    config : `MatchProbabilisticConfig`
        A configuration instance.
    """

    config: MatchProbabilisticConfig

    def __init__(
        self,
        config: MatchProbabilisticConfig,
    ):
        self.config = config

    def match(
        self,
        catalog_ref: pd.DataFrame,
        catalog_target: pd.DataFrame,
        select_ref: np.array = None,
        select_target: np.array = None,
        logger: logging.Logger = None,
        logging_n_rows: int = None,
        **kwargs
    ):
        """Match catalogs.

        Parameters
        ----------
        catalog_ref : `pandas.DataFrame`
            A reference catalog to match in order of a given column (i.e. greedily).
        catalog_target : `pandas.DataFrame`
            A target catalog for matching sources from `catalog_ref`. Must contain
            measurements with errors.
        select_ref : `numpy.array`
            A boolean array of the same length as `catalog_ref` selecting the sources
            that can be matched.
        select_target : `numpy.array`
            A boolean array of the same length as `catalog_target` selecting the
            sources that can be matched.
        logger : `logging.Logger`
            A Logger for logging.
        logging_n_rows : `int`
            The number of sources to match before printing a log message.
        kwargs
            Additional keyword arguments to pass to `format_catalogs`.

        Returns
        -------
        catalog_out_ref : `pandas.DataFrame`
            A catalog of identical length to `catalog_ref`, containing match information
            for rows selected by `select_ref` (including the matching row index in
            `catalog_target`).
        catalog_out_target : `pandas.DataFrame`
            A catalog of identical length to `catalog_target`, containing the indices of
            matching rows in `catalog_ref`.
        exceptions : `dict` [`int`, `Exception`]
            A dictionary keyed by `catalog_ref` row number of the first exception caught
            when matching.
        """
        if logger is None:
            logger = logger_default

        t_init = time.process_time()
        config = self.config

        # Transform any coordinates, if required
        # Note: The returned objects contain the original catalogs, as well as
        # transformed coordinates, and the selection of sources for matching.
        # These might be identical to the arrays passed as kwargs, but that
        # depends on config settings.
        # For the rest of this function, the selection arrays will be used,
        # but the indices of the original, unfiltered catalog will also be
        # output, so some further indexing steps are needed.
        ref, target = config.coord_format.format_catalogs(
            catalog_ref=catalog_ref, catalog_target=catalog_target,
            select_ref=select_ref, select_target=select_target,
            **kwargs
        )

        # If no order is specified, take the nansum of all flux columns as a 'total flux'
        # Note: it won't actually be a total flux if bands overlap significantly
        # (or it might define a filter with >100% efficiency).
        # Also, this is done on the original dataframe as it's harder to accomplish
        # with just a recarray
        column_order = (
            catalog_ref.loc[ref.extras.select, config.column_ref_order]
            if config.column_ref_order is not None else
            np.nansum(catalog_ref.loc[ref.extras.select, config.columns_ref_flux], axis=1)
        )
        order = np.argsort(column_order if config.order_ascending else -column_order)

        n_ref_select = len(ref.extras.indices)

        coords_spherical = config.coord_format.coords_spherical
        coords_ref, coords_target = (
            (cat.coord1[cat.extras.select], cat.coord2[cat.extras.select])
            for cat in (ref, target)
        )

        # Find up to match_n_max nearby target sources for each selected reference source
        logger.info('Matching with match_n_max=%d', config.match_n_max)

        if coords_spherical:
            match_dist_max = config.match_dist_max/3600.
            with Matcher(coords_target[0], coords_target[1]) as matcher:
                idxs_target_select = matcher.query_knn(
                    coords_ref[0], coords_ref[1],
                    distance_upper_bound=match_dist_max,
                    k=config.match_n_max,
                )
        # Call scipy for the non-spherical case
        # The spherical case won't trigger here, but the implementation is left for comparison, if needed
        else:
            match_dist_max = (
                np.radians(config.match_dist_max/3600.) if coords_spherical
                else config.match_dist_max
            )
            # Convert ra/dec sky coordinates to spherical vectors for accurate distances;
            # otherwise stack the coordinates into (N, 2) points for the tree
            func_convert = (
                _radec_to_xyz if coords_spherical
                else (lambda coord1, coord2: np.vstack((coord1, coord2)).T)
            )
            vec_ref, vec_target = (
                func_convert(coords[0], coords[1])
                for coords in (coords_ref, coords_target)
            )
            tree_obj = cKDTree(vec_target)
            _, idxs_target_select = tree_obj.query(
                vec_ref,
                distance_upper_bound=match_dist_max,
                k=config.match_n_max,
            )

        n_target_select = len(target.extras.indices)
        n_matches = np.sum(idxs_target_select != n_target_select, axis=1)
        n_matched_max = np.sum(n_matches == config.match_n_max)
        if n_matched_max > 0:
            logger.warning(
                '%d/%d (%.2f%%) selected true objects have n_matches=match_n_max(%d)',
                n_matched_max, n_ref_select, 100.*n_matched_max/n_ref_select, config.match_n_max,
            )

        # Pre-allocate outputs
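        # Note: np.full with np.nan and an integer dtype casts the fill to the
        # minimum int64 value on most platforms, so unmatched rows stay
        # negative and the `matches >= 0` checks below remain valid.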
        target_row_match = np.full(target.extras.n, np.nan, dtype=np.int64)
        ref_candidate_match = np.zeros(ref.extras.n, dtype=bool)
        ref_row_match = np.full(ref.extras.n, np.nan, dtype=np.int64)
        ref_match_count = np.zeros(ref.extras.n, dtype=np.int32)
        ref_match_meas_finite = np.zeros(ref.extras.n, dtype=np.int32)
        ref_chisq = np.full(ref.extras.n, np.nan, dtype=float)

        # Need the original reference row indices for output
        idx_orig_ref, idx_orig_target = (np.argwhere(cat.extras.select)[:, 0] for cat in (ref, target))

        # Retrieve required columns, including any converted ones (default to original column name)
        columns_convert = config.coord_format.coords_ref_to_convert
        if columns_convert is None:
            columns_convert = {}
        data_ref = ref.catalog[
            [columns_convert.get(column, column) for column in config.columns_ref_meas]
        ].iloc[ref.extras.indices[order]]
        data_target = target.catalog[config.columns_target_meas][target.extras.select]
        errors_target = target.catalog[config.columns_target_err][target.extras.select]

        exceptions = {}
        # The kdTree uses len(inputs) as a sentinel value for no match
        matched_target = {n_target_select, }
        index_ref = idx_orig_ref[order]
        # Fill in the candidate column
        ref_candidate_match[index_ref] = True

        # Count this as the time when disambiguation begins
        t_begin = time.process_time()

        # Exclude unmatched sources
        matched_ref = idxs_target_select[order, 0] != n_target_select
        order = order[matched_ref]
        idx_first = idxs_target_select[order, 0]
        chi_0 = (data_target.iloc[idx_first].values - data_ref.iloc[matched_ref].values)/(
            errors_target.iloc[idx_first].values)
        chi_finite_0 = np.isfinite(chi_0)
        n_finite_0 = np.sum(chi_finite_0, axis=1)
        chi_0[~chi_finite_0] = 0
        chisq_sum_0 = np.sum(chi_0*chi_0, axis=1)

        logger.info('Disambiguating %d/%d matches/targets', len(order), len(ref.catalog))
        for index_n, index_row_select in enumerate(order):
            index_row = idx_orig_ref[index_row_select]
            found = idxs_target_select[index_row_select, :]
            # Unambiguous match; short-circuit some evaluations
            if (found[1] == n_target_select) and (found[0] not in matched_target):
                n_finite = n_finite_0[index_n]
                if not (n_finite >= config.match_n_finite_min):
                    continue
                idx_chisq_min = 0
                n_matched = 1
                chisq_sum = chisq_sum_0[index_n]
            else:
                # Select match candidates from nearby sources not already matched
                # Note: set lookup is apparently fast enough that this is a few percent faster than:
                # found = [x for x in found[found != n_target_select] if x not in matched_target]
                # ... at least for ~1M sources
                found = [x for x in found if x not in matched_target]
                n_found = len(found)
                if n_found == 0:
                    continue
                # This is an ndarray of n_found rows x len(data_ref/target) columns
                chi = (
                    (data_target.iloc[found].values - data_ref.iloc[index_n].values)
                    / errors_target.iloc[found].values
                )
                finite = np.isfinite(chi)
                n_finite = np.sum(finite, axis=1)
                # Require some number of finite chi_sq to match
                chisq_good = n_finite >= config.match_n_finite_min
                if not any(chisq_good):
                    continue
                try:
                    chisq_sum = np.zeros(n_found, dtype=float)
                    chisq_sum[chisq_good] = np.nansum(chi[chisq_good, :] ** 2, axis=1)
                    idx_chisq_min = np.nanargmin(chisq_sum / n_finite)
                    n_finite = n_finite[idx_chisq_min]
                    n_matched = len(chisq_good)
                    chisq_sum = chisq_sum[idx_chisq_min]
                except Exception as error:
                    # Can't foresee any exceptions, but they shouldn't prevent
                    # matching subsequent sources
                    exceptions[index_row] = error
            ref_match_meas_finite[index_row] = n_finite
            ref_match_count[index_row] = n_matched
            ref_chisq[index_row] = chisq_sum
            idx_match_select = found[idx_chisq_min]
            row_target = target.extras.indices[idx_match_select]
            ref_row_match[index_row] = row_target

            target_row_match[row_target] = index_row
            matched_target.add(idx_match_select)

            if logging_n_rows and ((index_n + 1) % logging_n_rows == 0):
                t_elapsed = time.process_time() - t_begin
                logger.info(
                    'Processed %d/%d in %.2fs at sort value=%.3f',
                    index_n + 1, n_ref_select, t_elapsed, column_order[order[index_n]],
                )

        data_ref = {
            'match_candidate': ref_candidate_match,
            'match_row': ref_row_match,
            'match_count': ref_match_count,
            'match_chisq': ref_chisq,
            'match_n_chisq_finite': ref_match_meas_finite,
        }
        data_target = {
            'match_candidate': target.extras.select if target.extras.select is not None else (
                np.ones(target.extras.n, dtype=bool)),
            'match_row': target_row_match,
        }

        for (columns, out_original, out_matched, in_original, in_matched, matches, name_cat) in (
            (
                self.config.columns_ref_copy,
                data_ref,
                data_target,
                ref,
                target,
                target_row_match,
                'target',
            ),
            (
                self.config.columns_target_copy,
                data_target,
                data_ref,
                target,
                ref,
                ref_row_match,
                'reference',
            ),
        ):
            matched = matches >= 0
            idx_matched = matches[matched]
            logger.info('Matched %d/%d %s sources', np.sum(matched), len(matched), name_cat)

            for column in columns:
                values = in_original.catalog[column]
                out_original[column] = values
                dtype = in_original.catalog[column].dtype

                # Pandas object columns can have mixed types - check for that
                if dtype == object:
                    types = list(set((type(x) for x in values)))
                    if len(types) != 1:
                        raise RuntimeError(f'Column {column} dtype={dtype} has multiple types={types}')
                    dtype = types[0]

                value_fill = default_value(dtype)

                # Without this, the dtype would be '<U1' for an empty Unicode string
                if dtype == str:
                    dtype = f'<U{max(len(x) for x in values)}'

                column_match = np.full(in_matched.extras.n, value_fill, dtype=dtype)
                column_match[matched] = in_original.catalog[column][idx_matched]
                out_matched[f'match_{column}'] = column_match

        logger.info(
            'Completed match disambiguating in %.2fs (total %.2fs)',
            time.process_time() - t_begin,
            time.process_time() - t_init,
        )

        catalog_out_ref = pd.DataFrame(data_ref)
        catalog_out_target = pd.DataFrame(data_target)

        return catalog_out_ref, catalog_out_target, exceptions
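

# Illustrative end-to-end usage sketch (the DataFrames and column names are
# hypothetical):
#
#   config = MatchProbabilisticConfig(
#       columns_ref_meas=['ra', 'dec'],
#       columns_target_meas=['coord_ra', 'coord_dec'],
#       columns_target_err=['coord_ra_err', 'coord_dec_err'],
#   )
#   matcher = MatcherProbabilistic(config)
#   cat_ref_out, cat_target_out, exceptions = matcher.match(df_ref, df_target)
#   # cat_ref_out['match_row'] holds the matched row index in catalog_target,
#   # or a negative sentinel for unmatched rows.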