Coverage for python/lsst/afw/table/multiMatch.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of afw.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
21import collections.abc
23import numpy
25import lsst.geom
26from ._schemaMapper import SchemaMapper
27from ._table import CoordKey, SourceRecord
30class MultiMatch:
31 """Initialize a multi-catalog match.
33 Parameters
34 ----------
35 schema : `lsst.afw.table.Schema`
36 Schema shared by all catalogs to be included in the match.
37 dataIdFormat : `dict`
38 Set of name: type for all data ID keys (e.g. {"visit":int,
39 "ccd":int}).
40 coordField : `str`, optional
41 Prefix for _ra and _dec fields that contain the
42 coordinates to use for the match.
43 idField : `str`, optional
44 Name of the field in schema that contains unique object
45 IDs.
46 radius : `lsst.geom.Angle`, optional
47 Maximum separation for a match. Defaults to 0.5 arcseconds.
48 RecordClass : `lsst.afw.table.BaseRecord`
49 Type of record to expect in catalogs to be matched.
50 """
52 def __init__(self, schema, dataIdFormat, coordField="coord", idField="id", radius=None,
53 RecordClass=SourceRecord):
54 if radius is None:
55 radius = 0.5*lsst.geom.arcseconds
56 elif not isinstance(radius, lsst.geom.Angle):
57 raise ValueError("'radius' argument must be an Angle")
58 self.radius = radius
59 self.mapper = SchemaMapper(schema)
60 self.mapper.addMinimalSchema(schema, True)
61 self.coordKey = CoordKey(schema[coordField])
62 self.idKey = schema.find(idField).key
63 self.dataIdKeys = {}
64 outSchema = self.mapper.editOutputSchema()
65 outSchema.setAliasMap(self.mapper.getInputSchema().getAliasMap())
66 self.objectKey = outSchema.addField(
67 "object", type=numpy.int64, doc="Unique ID for joined sources")
68 for name, dataType in dataIdFormat.items():
69 self.dataIdKeys[name] = outSchema.addField(
70 name, type=dataType, doc="'%s' data ID component")
71 # self.result will be a catalog containing the union of all matched records, with an 'object' ID
72 # column that can be used to group matches. Sources that have ambiguous matches may appear
73 # multiple times.
74 self.result = None
75 # self.reference will be a subset of self.result, with exactly one record for each group of matches
76 # (we'll use the one from the first catalog matched into this group)
77 # We'll use this to match against each subsequent catalog.
78 self.reference = None
79 # A set of ambiguous objects that we may want to ultimately remove from
80 # the final merged catalog.
81 self.ambiguous = set()
82 # Table used to allocate new records for the ouput catalog.
83 self.table = RecordClass.Table.make(self.mapper.getOutputSchema())
84 # Counter used to assign the next object ID
85 self.nextObjId = 1
87 def makeRecord(self, inputRecord, dataId, objId):
88 """Create a new result record from the given input record, using the
89 given data ID and object ID to fill in additional columns.
91 Parameters
92 ----------
93 inputRecord : `lsst.afw.table.source.sourceRecord`
94 Record to use as the reference for the new result.
95 dataId : `DataId` or `dict`
96 Data id describing the data.
97 objId : `int`
98 Object id of the object to be added.
100 Returns
101 -------
102 outputRecord : `lsst.afw.table.source.sourceRecord`
103 Newly generated record.
104 """
105 outputRecord = self.table.copyRecord(inputRecord, self.mapper)
106 for name, key in self.dataIdKeys.items():
107 outputRecord.set(key, dataId[name])
108 outputRecord.set(self.objectKey, objId)
109 return outputRecord
111 def add(self, catalog, dataId):
112 """Add a new catalog to the match, corresponding to the given data ID.
113 The new catalog is appended to the `self.result` and
114 `self.reference` catalogs.
116 Parameters
117 ----------
118 catalog : `lsst.afw.table.base.Catalog`
119 Catalog to be added to the match result.
120 dataId : `DataId` or `dict`
121 Data id for the catalog to be added.
122 """
123 if self.result is None:
124 self.result = self.table.Catalog(self.table)
125 for record in catalog:
126 self.result.append(self.makeRecord(
127 record, dataId, objId=self.nextObjId))
128 self.nextObjId += 1
129 self.reference = self.result.copy(deep=False)
130 return
131 catalog.sort(self.idKey) # pre-sort for speedy by-id access later.
132 # Will remove from this set as objects are matched.
133 unmatchedIds = {record.get(self.idKey) for record in catalog}
134 # Temporary dict mapping new source ID to a set of associated objects.
135 newToObj = {}
136 matches = lsst.afw.table.matchRaDec(self.reference, catalog, self.radius)
137 matchedRefIds = set()
138 matchedCatIds = set()
139 for refRecord, newRecord, distance in matches:
140 objId = refRecord.get(self.objectKey)
141 if objId in matchedRefIds:
142 # We've already matched this object against another new source,
143 # mark it as ambiguous.
144 self.ambiguous.add(objId)
145 matchedRefIds.add(objId)
146 if newRecord.get(self.idKey) in matchedCatIds:
147 # We've already matched this new source to one or more other objects
148 # Mark all involved objects as ambiguous
149 self.ambiguous.add(objId)
150 self.ambiguous |= newToObj.get(newRecord.get(self.idKey), set())
151 matchedCatIds.add(newRecord.get(self.idKey))
152 unmatchedIds.discard(newRecord.get(self.idKey))
153 # Populate the newToObj dict (setdefault trick is an idiom for
154 # appending to a dict-of-sets)
155 newToObj.setdefault(newRecord.get(self.idKey), set()).add(objId)
156 # Add a new result record for this match.
157 self.result.append(self.makeRecord(newRecord, dataId, objId))
158 # Add any unmatched sources from the new catalog as new objects to both
159 # the joined result catalog and the reference catalog.
160 for objId in unmatchedIds:
161 newRecord = catalog.find(objId, self.idKey)
162 resultRecord = self.makeRecord(newRecord, dataId, self.nextObjId)
163 self.nextObjId += 1
164 self.result.append(resultRecord)
165 self.reference.append(resultRecord)
167 def finish(self, removeAmbiguous=True):
168 """Return the final match catalog, after sorting it by object, copying
169 it to ensure contiguousness, and optionally removing ambiguous
170 matches.
172 After calling finish(), the in-progress state of the matcher
173 is returned to the state it was just after construction, with
174 the exception of the object ID counter (which is not reset).
176 Parameters
177 ----------
178 removeAmbiguous : `bool`, optional
179 Should ambiguous matches be removed from the match
180 catalog? Defaults to True.
182 Returns
183 -------
184 result : `lsst.afw.table.base.Catalog`
185 Final match catalog, sorted by object.
186 """
187 if removeAmbiguous:
188 result = self.table.Catalog(self.table)
189 for record in self.result:
190 if record.get(self.objectKey) not in self.ambiguous:
191 result.append(record)
192 else:
193 result = self.result
194 result.sort(self.objectKey)
195 result = result.copy(deep=True)
196 self.result = None
197 self.reference = None
198 self.ambiguous = set()
199 return result
202class GroupView(collections.abc.Mapping):
203 """A mapping (i.e. dict-like object) that provides convenient
204 operations on the concatenated catalogs returned by a MultiMatch
205 object.
207 A GroupView provides access to a catalog of grouped objects, in
208 which the grouping is indicated by a field for which all records
209 in a group have the same value. Once constructed, it allows
210 operations similar to those supported by SQL "GROUP BY", such as
211 filtering and aggregate calculation.
213 Parameters
214 ----------
215 schema : `lsst.afw.table.Schema`
216 Catalog schema to use for the grouped object catalog.
217 ids : `List`
218 List of identifying keys for the groups in the catalog.
219 groups : `List`
220 List of catalog subsets associated with each key in ids.
221 """
223 @classmethod
224 def build(cls, catalog, groupField="object"):
225 """Construct a GroupView from a concatenated catalog.
227 Parameters
228 ----------
229 catalog : `lsst.afw.table.base.Catalog`
230 Input catalog, containing records grouped by a field in
231 which all records in the same group have the same value.
232 Must be sorted by the group field.
233 groupField : `str`, optional
234 Name or Key for the field that indicates groups. Defaults
235 to "object".
237 Returns
238 -------
239 groupCatalog : `lsst.afw.table.multiMatch.GroupView`
240 Constructed GroupView from the input concatenated catalog.
241 """
242 groupKey = catalog.schema.find(groupField).key
243 ids, indices = numpy.unique(catalog.get(groupKey), return_index=True)
244 groups = numpy.zeros(len(ids), dtype=object)
245 ends = list(indices[1:]) + [len(catalog)]
246 for n, (i1, i2) in enumerate(zip(indices, ends)):
247 # casts are a work-around for DM-8557
248 groups[n] = catalog[int(i1):int(i2)]
249 assert (groups[n].get(groupKey) == ids[n]).all()
250 return cls(catalog.schema, ids, groups)
252 def __init__(self, schema, ids, groups):
253 self.schema = schema
254 self.ids = ids
255 self.groups = groups
256 self.count = sum(len(cat) for cat in self.groups)
258 def __len__(self):
259 return len(self.ids)
261 def __iter__(self):
262 return iter(self.ids)
264 def __getitem__(self, key):
265 index = numpy.searchsorted(self.ids, key)
266 if self.ids[index] != key:
267 raise KeyError("Group with ID {0} not found".format(key))
268 return self.groups[index]
270 def where(self, predicate):
271 """Return a new GroupView that contains only groups for which the
272 given predicate function returns True.
274 The predicate function is called once for each group, and
275 passed a single argument: the subset catalog for that group.
277 Parameters
278 ----------
279 predicate :
280 Function to identify which groups should be included in
281 the output.
283 Returns
284 -------
285 outGroupView : `lsst.afw.table.multiMatch.GroupView`
286 Subset GroupView containing only groups that match the
287 predicate.
288 """
289 mask = numpy.zeros(len(self), dtype=bool)
290 for i in range(len(self)):
291 mask[i] = predicate(self.groups[i])
292 return type(self)(self.schema, self.ids[mask], self.groups[mask])
294 def aggregate(self, function, field=None, dtype=float):
295 """Run an aggregate function on each group, returning an array with
296 one element for each group.
298 Parameters
299 ----------
300 function :
301 Callable object that computes the aggregate value. If
302 `field` is None, called with the entire subset catalog as an
303 argument. If `field` is not None, called with an array view
304 into that field.
305 field : `str`, optional
306 A string name or Key object that indicates a single field the aggregate
307 is computed over.
308 dtype :
309 Data type of the output array.
311 Returns
312 -------
313 result : Array of `dtype`
314 Aggregated values for each group.
315 """
316 result = numpy.zeros(len(self), dtype=dtype)
317 if field is not None:
318 key = self.schema.find(field).key
320 def f(cat):
321 return function(cat.get(key))
322 else:
323 f = function
324 for i in range(len(self)):
325 result[i] = f(self.groups[i])
326 return result
328 def apply(self, function, field=None, dtype=float):
329 """Run a non-aggregate function on each group, returning an array with
330 one element for each record.
332 Parameters
333 ----------
334 function :
335 Callable object that computes the aggregate value. If field is None,
336 called with the entire subset catalog as an argument. If field is not
337 None, called with an array view into that field.
338 field : `str`
339 A string name or Key object that indicates a single field the aggregate
340 is computed over.
341 dtype :
342 Data type for the output array.
344 Returns
345 -------
346 result : `numpy.array` of `dtype`
347 Result of the function calculated on an element-by-element basis.
348 """
349 result = numpy.zeros(self.count, dtype=dtype)
350 if field is not None:
351 key = self.schema.find(field).key
353 def f(cat):
354 return function(cat.get(key))
355 else:
356 f = function
357 last = 0
358 for i in range(len(self)):
359 next = last + len(self.groups[i])
360 result[last:next] = f(self.groups[i])
361 last = next
362 return result