Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of afw. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21import collections.abc 

22 

23import numpy 

24 

25import lsst.geom 

26from ._schemaMapper import SchemaMapper 

27from ._table import CoordKey, SourceRecord 

28 

29 

class MultiMatch:
    """Initialize a multi-catalog match.

    Parameters
    ----------
    schema : `lsst.afw.table.Schema`
        Schema shared by all catalogs to be included in the match.
    dataIdFormat : `dict`
        Set of name: type for all data ID keys (e.g. {"visit":int,
        "ccd":int}).
    coordField : `str`, optional
        Prefix for _ra and _dec fields that contain the
        coordinates to use for the match.
    idField : `str`, optional
        Name of the field in schema that contains unique object
        IDs.
    radius : `lsst.geom.Angle`, optional
        Maximum separation for a match.  Defaults to 0.5 arcseconds.
    RecordClass : `lsst.afw.table.BaseRecord`
        Type of record to expect in catalogs to be matched.
    """

    def __init__(self, schema, dataIdFormat, coordField="coord", idField="id", radius=None,
                 RecordClass=SourceRecord):
        if radius is None:
            radius = 0.5*lsst.geom.arcseconds
        elif not isinstance(radius, lsst.geom.Angle):
            raise ValueError("'radius' argument must be an Angle")
        self.radius = radius
        self.mapper = SchemaMapper(schema)
        self.mapper.addMinimalSchema(schema, True)
        self.coordKey = CoordKey(schema[coordField])
        self.idKey = schema.find(idField).key
        self.dataIdKeys = {}
        outSchema = self.mapper.editOutputSchema()
        outSchema.setAliasMap(self.mapper.getInputSchema().getAliasMap())
        self.objectKey = outSchema.addField(
            "object", type=numpy.int64, doc="Unique ID for joined sources")
        for name, dataType in dataIdFormat.items():
            # Interpolate the component name into the field doc; the original
            # left a literal, unformatted "%s" placeholder in every doc string.
            self.dataIdKeys[name] = outSchema.addField(
                name, type=dataType, doc="'%s' data ID component" % name)
        # self.result will be a catalog containing the union of all matched records, with an 'object' ID
        # column that can be used to group matches.  Sources that have ambiguous matches may appear
        # multiple times.
        self.result = None
        # self.reference will be a subset of self.result, with exactly one record for each group of matches
        # (we'll use the one from the first catalog matched into this group).
        # We'll use this to match against each subsequent catalog.
        self.reference = None
        # A set of ambiguous objects that we may want to ultimately remove from
        # the final merged catalog.
        self.ambiguous = set()
        # Table used to allocate new records for the output catalog.
        self.table = RecordClass.Table.make(self.mapper.getOutputSchema())
        # Counter used to assign the next object ID.
        self.nextObjId = 1

    def makeRecord(self, inputRecord, dataId, objId):
        """Create a new result record from the given input record, using the
        given data ID and object ID to fill in additional columns.

        Parameters
        ----------
        inputRecord : `lsst.afw.table.source.sourceRecord`
            Record to use as the reference for the new result.
        dataId : `DataId` or `dict`
            Data id describing the data.
        objId : `int`
            Object id of the object to be added.

        Returns
        -------
        outputRecord : `lsst.afw.table.source.sourceRecord`
            Newly generated record.
        """
        outputRecord = self.table.copyRecord(inputRecord, self.mapper)
        for name, key in self.dataIdKeys.items():
            outputRecord.set(key, dataId[name])
        outputRecord.set(self.objectKey, objId)
        return outputRecord

    def add(self, catalog, dataId):
        """Add a new catalog to the match, corresponding to the given data ID.
        The new catalog is appended to the `self.result` and
        `self.reference` catalogs.

        Parameters
        ----------
        catalog : `lsst.afw.table.base.Catalog`
            Catalog to be added to the match result.
        dataId : `DataId` or `dict`
            Data id for the catalog to be added.
        """
        if self.result is None:
            # First catalog added: every source starts its own new object.
            self.result = self.table.Catalog(self.table)
            for record in catalog:
                self.result.append(self.makeRecord(
                    record, dataId, objId=self.nextObjId))
                self.nextObjId += 1
            self.reference = self.result.copy(deep=False)
            return
        catalog.sort(self.idKey)  # pre-sort for speedy by-id access later.
        # Will remove from this set as objects are matched.
        unmatchedIds = {record.get(self.idKey) for record in catalog}
        # Temporary dict mapping new source ID to a set of associated objects.
        newToObj = {}
        # NOTE(review): relies on the full lsst.afw.table package being
        # importable at call time (this module is part of it) — confirm.
        matches = lsst.afw.table.matchRaDec(self.reference, catalog, self.radius)
        matchedRefIds = set()
        matchedCatIds = set()
        for refRecord, newRecord, distance in matches:
            objId = refRecord.get(self.objectKey)
            if objId in matchedRefIds:
                # We've already matched this object against another new source,
                # mark it as ambiguous.
                self.ambiguous.add(objId)
            matchedRefIds.add(objId)
            if newRecord.get(self.idKey) in matchedCatIds:
                # We've already matched this new source to one or more other
                # objects.  Mark all involved objects as ambiguous.
                self.ambiguous.add(objId)
                self.ambiguous |= newToObj.get(newRecord.get(self.idKey), set())
            matchedCatIds.add(newRecord.get(self.idKey))
            unmatchedIds.discard(newRecord.get(self.idKey))
            # Populate the newToObj dict (setdefault trick is an idiom for
            # appending to a dict-of-sets).
            newToObj.setdefault(newRecord.get(self.idKey), set()).add(objId)
            # Add a new result record for this match.
            self.result.append(self.makeRecord(newRecord, dataId, objId))
        # Add any unmatched sources from the new catalog as new objects to both
        # the joined result catalog and the reference catalog.
        for srcId in unmatchedIds:
            newRecord = catalog.find(srcId, self.idKey)
            resultRecord = self.makeRecord(newRecord, dataId, self.nextObjId)
            self.nextObjId += 1
            self.result.append(resultRecord)
            self.reference.append(resultRecord)

    def finish(self, removeAmbiguous=True):
        """Return the final match catalog, after sorting it by object, copying
        it to ensure contiguousness, and optionally removing ambiguous
        matches.

        After calling finish(), the in-progress state of the matcher
        is returned to the state it was just after construction, with
        the exception of the object ID counter (which is not reset).

        Parameters
        ----------
        removeAmbiguous : `bool`, optional
            Should ambiguous matches be removed from the match
            catalog?  Defaults to True.

        Returns
        -------
        result : `lsst.afw.table.base.Catalog`
            Final match catalog, sorted by object.
        """
        if removeAmbiguous:
            result = self.table.Catalog(self.table)
            for record in self.result:
                if record.get(self.objectKey) not in self.ambiguous:
                    result.append(record)
        else:
            result = self.result
        result.sort(self.objectKey)
        # Deep-copy so the returned catalog is contiguous.
        result = result.copy(deep=True)
        # Reset in-progress state (self.nextObjId deliberately keeps running).
        self.result = None
        self.reference = None
        self.ambiguous = set()
        return result

200 

201 

class GroupView(collections.abc.Mapping):
    """A mapping (i.e. dict-like object) that provides convenient
    operations on the concatenated catalogs returned by a MultiMatch
    object.

    A GroupView provides access to a catalog of grouped objects, in
    which the grouping is indicated by a field for which all records
    in a group have the same value.  Once constructed, it allows
    operations similar to those supported by SQL "GROUP BY", such as
    filtering and aggregate calculation.

    Parameters
    ----------
    schema : `lsst.afw.table.Schema`
        Catalog schema to use for the grouped object catalog.
    ids : `List`
        List of identifying keys for the groups in the catalog.
    groups : `List`
        List of catalog subsets associated with each key in ids.
    """

    @classmethod
    def build(cls, catalog, groupField="object"):
        """Construct a GroupView from a concatenated catalog.

        Parameters
        ----------
        catalog : `lsst.afw.table.base.Catalog`
            Input catalog, containing records grouped by a field in
            which all records in the same group have the same value.
            Must be sorted by the group field.
        groupField : `str`, optional
            Name or Key for the field that indicates groups.  Defaults
            to "object".

        Returns
        -------
        groupCatalog : `lsst.afw.table.multiMatch.GroupView`
            Constructed GroupView from the input concatenated catalog.
        """
        groupKey = catalog.schema.find(groupField).key
        # return_index gives the start of each run of equal group values
        # (catalog must already be sorted by groupField).
        ids, indices = numpy.unique(catalog.get(groupKey), return_index=True)
        groups = numpy.zeros(len(ids), dtype=object)
        ends = list(indices[1:]) + [len(catalog)]
        for n, (i1, i2) in enumerate(zip(indices, ends)):
            # casts are a work-around for DM-8557
            groups[n] = catalog[int(i1):int(i2)]
            assert (groups[n].get(groupKey) == ids[n]).all()
        return cls(catalog.schema, ids, groups)

    def __init__(self, schema, ids, groups):
        self.schema = schema  # schema shared by all group sub-catalogs
        self.ids = ids  # sorted array of group identifiers
        self.groups = groups  # per-group sub-catalogs, parallel to self.ids
        # Total number of records across all groups.
        self.count = sum(len(cat) for cat in self.groups)

    def __len__(self):
        """Return the number of groups."""
        return len(self.ids)

    def __iter__(self):
        """Iterate over group IDs."""
        return iter(self.ids)

    def __getitem__(self, key):
        """Return the subset catalog for the group with the given ID.

        Raises
        ------
        KeyError
            If no group with the given ID exists.
        """
        index = numpy.searchsorted(self.ids, key)
        # Bounds check: for a key greater than every ID, searchsorted returns
        # len(self.ids); the unguarded lookup used to raise IndexError instead
        # of the KeyError required by the Mapping contract (breaking `in`).
        if index >= len(self.ids) or self.ids[index] != key:
            raise KeyError("Group with ID {0} not found".format(key))
        return self.groups[index]

    def where(self, predicate):
        """Return a new GroupView that contains only groups for which the
        given predicate function returns True.

        The predicate function is called once for each group, and
        passed a single argument: the subset catalog for that group.

        Parameters
        ----------
        predicate :
            Function to identify which groups should be included in
            the output.

        Returns
        -------
        outGroupView : `lsst.afw.table.multiMatch.GroupView`
            Subset GroupView containing only groups that match the
            predicate.
        """
        mask = numpy.zeros(len(self), dtype=bool)
        for i in range(len(self)):
            mask[i] = predicate(self.groups[i])
        return type(self)(self.schema, self.ids[mask], self.groups[mask])

    def aggregate(self, function, field=None, dtype=float):
        """Run an aggregate function on each group, returning an array with
        one element for each group.

        Parameters
        ----------
        function :
            Callable object that computes the aggregate value.  If
            `field` is None, called with the entire subset catalog as an
            argument.  If `field` is not None, called with an array view
            into that field.
        field : `str`, optional
            A string name or Key object that indicates a single field the
            aggregate is computed over.
        dtype :
            Data type of the output array.

        Returns
        -------
        result : Array of `dtype`
            Aggregated values for each group.
        """
        result = numpy.zeros(len(self), dtype=dtype)
        if field is not None:
            key = self.schema.find(field).key

            def f(cat):
                return function(cat.get(key))
        else:
            f = function
        for i in range(len(self)):
            result[i] = f(self.groups[i])
        return result

    def apply(self, function, field=None, dtype=float):
        """Run a non-aggregate function on each group, returning an array with
        one element for each record.

        Parameters
        ----------
        function :
            Callable object that computes the aggregate value.  If field is
            None, called with the entire subset catalog as an argument.  If
            field is not None, called with an array view into that field.
        field : `str`
            A string name or Key object that indicates a single field the
            aggregate is computed over.
        dtype :
            Data type for the output array.

        Returns
        -------
        result : `numpy.array` of `dtype`
            Result of the function calculated on an element-by-element basis.
        """
        result = numpy.zeros(self.count, dtype=dtype)
        if field is not None:
            key = self.schema.find(field).key

            def f(cat):
                return function(cat.get(key))
        else:
            f = function
        begin = 0
        for i in range(len(self)):
            # `end` rather than `next` to avoid shadowing the builtin.
            end = begin + len(self.groups[i])
            result[begin:end] = f(self.groups[i])
            begin = end
        return result