Coverage for python/lsst/meas/algorithms/ingestIndexReferenceTask.py: 42%

139 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-09-15 03:09 -0700

1# This file is part of meas_algorithms. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22 

# TODO DM-31698: post-gen2 removal notes
# `DatasetConfig`, `ConvertReferenceCatalogBase`, and `ConvertReferenceCatalogConfig`
# should all be moved to `convertReferenceCatalog.py` once gen2 butler
# has been removed.

# Public names exported by this module.
__all__ = ["DatasetConfig", "ConvertReferenceCatalogBase", "ConvertReferenceCatalogConfig"]

29 

30import abc 

31import os.path 

32 

33import astropy.units 

34 

35import lsst.pex.config as pexConfig 

36import lsst.pipe.base as pipeBase 

37import lsst.geom 

38import lsst.sphgeom 

39import lsst.afw.table as afwTable 

40from lsst.daf.base import PropertyList 

41from .indexerRegistry import IndexerRegistry 

42from .readTextCatalogTask import ReadTextCatalogTask 

43from .loadReferenceObjects import ReferenceObjectLoader 

44from . import convertRefcatManager 

45 

# The most recent Indexed Reference Catalog on-disk format version.
# Per the ``DatasetConfig.format_version`` documentation in this module:
# version 0 had Jy as flux units, version 1 had nJy as flux units.
LATEST_FORMAT_VERSION = 1

48 

49 

def addRefCatMetadata(catalog):
    """Stamp a new (not yet populated) reference catalog with the format
    version.

    Parameters
    ----------
    catalog : `lsst.afw.table.SimpleCatalog`
        Catalog to which the metadata is attached; modified in-place. When
        the catalog has no metadata yet, a new `PropertyList` is created
        for it.
    """
    existing = catalog.getMetadata()
    # Reuse whatever metadata is already attached; only create a container
    # when the catalog has none.
    metadata = PropertyList() if existing is None else existing
    metadata.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
    catalog.setMetadata(metadata)

64 

65 

class DatasetConfig(pexConfig.Config):
    """Describe the on-disk storage format of a persisted reference
    catalog.
    """

    format_version = pexConfig.Field(
        doc=(
            "Version number of the persisted on-disk storage format.\n"
            "Version 0 had Jy as flux units (default 0 for unversioned catalogs).\n"
            "Version 1 had nJy as flux units."
        ),
        dtype=int,
        # Must always stay 0, so that catalogs persisted without a version
        # are interpreted as version 0.
        default=0,
    )
    ref_dataset_name = pexConfig.Field(
        doc="Name of this reference catalog to be used in the butler registry.",
        dtype=str,
        # TODO DM-31817: remove this default value.
        default='cal_ref_cat',
    )
    indexer = IndexerRegistry.makeField(
        doc='Name of indexer algoritm to use. Default is HTM',
        default='HTM',
    )

87 

88 

class ConvertReferenceCatalogConfig(pexConfig.Config):
    """Configuration for converting an external catalog into a reference
    catalog: which input columns to read, how to interpret them, and how
    the conversion is executed.
    """

    # --- Execution and output format -------------------------------------
    dataset_config = pexConfig.ConfigField(
        doc="Configuration for reading the ingested data",
        dtype=DatasetConfig,
    )
    n_processes = pexConfig.Field(
        doc=("Number of python processes to use when ingesting."),
        dtype=int,
        default=1,
    )
    manager = pexConfig.ConfigurableField(
        doc="Multiprocessing manager to perform the actual conversion of values, file-by-file.",
        target=convertRefcatManager.ConvertRefcatManager,
    )
    file_reader = pexConfig.ConfigurableField(
        doc='Task to use to read the files. Default is to expect text files.',
        target=ReadTextCatalogTask,
    )

    # --- Coordinates -----------------------------------------------------
    ra_name = pexConfig.Field(
        doc="Name of RA column (values in decimal degrees)",
        dtype=str,
    )
    dec_name = pexConfig.Field(
        doc="Name of Dec column (values in decimal degrees)",
        dtype=str,
    )
    ra_err_name = pexConfig.Field(
        doc="Name of RA error column",
        dtype=str,
        optional=True,
    )
    dec_err_name = pexConfig.Field(
        doc="Name of Dec error column",
        dtype=str,
        optional=True,
    )
    coord_err_unit = pexConfig.Field(
        doc="Unit of RA/Dec error fields (astropy.unit.Unit compatible)",
        dtype=str,
        optional=True,
    )

    # --- Photometry ------------------------------------------------------
    mag_column_list = pexConfig.ListField(
        doc=(
            "The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information. At least one entry is required."
        ),
        dtype=str,
    )
    mag_err_column_map = pexConfig.DictField(
        doc="A map of magnitude column name (key) to magnitude error column (value).",
        keytype=str,
        itemtype=str,
        default={},
    )

    # --- Flags and identifier --------------------------------------------
    is_photometric_name = pexConfig.Field(
        doc='Name of column stating if satisfactory for photometric calibration (optional).',
        dtype=str,
        optional=True,
    )
    is_resolved_name = pexConfig.Field(
        doc='Name of column stating if the object is resolved (optional).',
        dtype=str,
        optional=True,
    )
    is_variable_name = pexConfig.Field(
        doc='Name of column stating if the object is measured to be variable (optional).',
        dtype=str,
        optional=True,
    )
    id_name = pexConfig.Field(
        doc='Name of column to use as an identifier (optional).',
        dtype=str,
        optional=True,
    )

    # --- Proper motion ---------------------------------------------------
    pm_ra_name = pexConfig.Field(
        doc="Name of proper motion RA column",
        dtype=str,
        optional=True,
    )
    pm_dec_name = pexConfig.Field(
        doc="Name of proper motion Dec column",
        dtype=str,
        optional=True,
    )
    pm_ra_err_name = pexConfig.Field(
        doc="Name of proper motion RA error column",
        dtype=str,
        optional=True,
    )
    pm_dec_err_name = pexConfig.Field(
        doc="Name of proper motion Dec error column",
        dtype=str,
        optional=True,
    )
    pm_scale = pexConfig.Field(
        doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
        dtype=float,
        default=1.0,
    )

    # --- Parallax --------------------------------------------------------
    parallax_name = pexConfig.Field(
        doc="Name of parallax column",
        dtype=str,
        optional=True,
    )
    parallax_err_name = pexConfig.Field(
        doc="Name of parallax error column",
        dtype=str,
        optional=True,
    )
    parallax_scale = pexConfig.Field(
        doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
        dtype=float,
        default=1.0,
    )

    # --- Epoch -----------------------------------------------------------
    epoch_name = pexConfig.Field(
        doc="Name of epoch column",
        dtype=str,
        optional=True,
    )
    epoch_format = pexConfig.Field(
        doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
        dtype=str,
        optional=True,
    )
    epoch_scale = pexConfig.Field(
        doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
        dtype=str,
        optional=True,
    )

    # --- Pass-through columns --------------------------------------------
    extra_col_names = pexConfig.ListField(
        doc='Extra columns to add to the reference catalog.',
        dtype=str,
        default=[],
    )

    def setDefaults(self):
        """Set the defaults applied to newly ingested reference catalogs."""
        # Newly ingested reference catalogs always have the latest format_version.
        self.dataset_config.format_version = LATEST_FORMAT_VERSION
        # gen3 refcats are all depth=7
        self.dataset_config.indexer['HTM'].depth = 7

    def validate(self):
        """Check that the configured column names form a consistent set.

        Raises
        ------
        ValueError
            Raised if a required field is missing, if the keys of
            ``mag_err_column_map`` do not match ``mag_column_list``, or if
            a group of related fields is only partially set.
        lsst.pex.config.FieldValidationError
            Raised if ``coord_err_unit`` is set but astropy does not
            recognize it as a unit.
        """
        pexConfig.Config.validate(self)

        def checkAllOrNone(*fieldNames):
            """Raise ValueError unless the named fields are either all
            set or all unset (None or blank).
            """
            populated = [fieldName for fieldName in fieldNames if getattr(self, fieldName)]
            if len(populated) == 0 or len(populated) == len(fieldNames):
                return
            if len(fieldNames) == 2:
                prefix = "Both or neither"
            else:
                prefix = "All or none"
            raise ValueError("{} of {} must be set, but only {} are set".format(
                prefix, ", ".join(fieldNames), ", ".join(populated)))

        # Coordinates and at least one magnitude column are mandatory.
        if not self.ra_name or not self.dec_name or not self.mag_column_list:
            raise ValueError(
                "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")

        # When an error map is supplied it must cover exactly the magnitude columns.
        if self.mag_err_column_map:
            errColumns = set(self.mag_err_column_map.keys())
            if set(self.mag_column_list) != errColumns:
                raise ValueError(
                    "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
                        sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))

        checkAllOrNone("ra_err_name", "dec_err_name", "coord_err_unit")
        if self.coord_err_unit is not None:
            # 'silent' makes astropy return an UnrecognizedUnit instead of raising.
            parsedUnit = astropy.units.Unit(self.coord_err_unit, parse_strict='silent')
            if isinstance(parsedUnit, astropy.units.UnrecognizedUnit):
                msg = f"{self.coord_err_unit} is not a valid astropy unit string."
                raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg)

        checkAllOrNone("epoch_name", "epoch_format", "epoch_scale")
        checkAllOrNone("pm_ra_name", "pm_dec_name")
        checkAllOrNone("pm_ra_err_name", "pm_dec_err_name")
        checkAllOrNone("parallax_name", "parallax_err_name")

        havePmErr = bool(self.pm_ra_err_name)
        havePm = bool(self.pm_ra_name)
        if havePmErr and not havePm:
            raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')

        needsEpoch = havePm or bool(self.parallax_name)
        if needsEpoch and not self.epoch_name:
            raise ValueError(
                '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')

265 

266 

class ConvertReferenceCatalogBase(pipeBase.Task, abc.ABC):
    """Base class for producing and loading indexed reference catalogs,
    shared between gen2 and gen3.

    This implements an indexing scheme based on hierarchical triangular
    mesh (HTM). The term index really means breaking the catalog into
    localized chunks called shards. In this case each shard contains
    the entries from the catalog in a single HTM trixel.

    For producing catalogs this task makes the following assumptions
    about the input catalogs:

    - RA, Dec are in decimal degrees.
    - Epoch is available in a column, in a format supported by astropy.time.Time.
    - There are no off-diagonal covariance terms, such as covariance
      between RA and Dec, or between PM RA and PM Dec. Support for such
      covariance would have to be added to the config, including consideration
      of the units in the input catalog.
    """
    canMultiprocess = False
    ConfigClass = ConvertReferenceCatalogConfig

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Instantiate the indexer selected in the dataset config, passing it
        # the active sub-config of that registry field.
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def run(self, inputFiles):
        """Index a set of files comprising a reference catalog.

        Outputs are persisted in the butler repository.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read. The first entry is read to
            determine the output schema.
        """
        self._preRun()
        schema, key_map = self._saveMasterSchema(inputFiles[0])
        # create an HTM we can interrogate about pixel ids
        htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth())
        filenames = self._getButlerFilenames(htm)
        worker = self.config.manager.target(filenames,
                                            self.config,
                                            self.file_reader,
                                            self.indexer,
                                            schema,
                                            key_map,
                                            htm.universe()[0],
                                            addRefCatMetadata,
                                            self.log)
        result = worker.run(inputFiles)

        self._persistConfig()
        self._postRun(result)

    def _preRun(self):
        """Any setup that has to be performed at the start of ``run``, but that
        cannot be performed during ``__init__`` (e.g. making directories).
        """
        pass

    def _postRun(self, result):
        """Any tasks that have to happen at the end of ``run``.

        Parameters
        ----------
        result
            The result returned from ``worker.run()``.
        """
        pass

    def _getButlerFilenames(self, htm):
        """Get filenames from the butler for each output htm pixel.

        Parameters
        ----------
        htm : `lsst.sphgeom.HtmPixelization`
            The HTM pixelization scheme to be used to build filenames.

        Returns
        -------
        filenames : `dict` [`int`, `str`]
            Map of HTM pixel id to the filename that pixel is written to.
        """
        start, end = htm.universe()[0]
        # path manipulation because butler.get() per pixel will take forever
        examplePath = self._getOnePixelFilename(start)
        directory = os.path.dirname(examplePath)
        extension = os.path.splitext(examplePath)[1]
        # Format only the basename: applying ``%`` to the full path would
        # treat any literal '%' in the output directory as a format
        # specifier and either raise or produce a wrong filename.
        return {pixelId: os.path.join(directory, f"{pixelId}{extension}")
                for pixelId in range(start, end)}

    def makeSchema(self, dtype):
        """Make the schema to use in constructing the persisted catalogs.

        Parameters
        ----------
        dtype : `numpy.dtype`
            Data type describing each entry in ``config.extra_col_names``
            for the catalogs being ingested.

        Returns
        -------
        schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
            A tuple containing two items:
            - The schema for the output source catalog.
            - A map of catalog keys to use in filling the record
        """
        # make a schema with the standard fields
        schema = ReferenceObjectLoader.makeMinimalSchema(
            filterNameList=self.config.mag_column_list,
            addCentroid=False,
            addIsPhotometric=bool(self.config.is_photometric_name),
            addIsResolved=bool(self.config.is_resolved_name),
            addIsVariable=bool(self.config.is_variable_name),
            coordErrDim=2 if bool(self.config.ra_err_name) else 0,
            addProperMotion=2 if bool(self.config.pm_ra_name) else 0,
            properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0,
            addParallax=bool(self.config.parallax_name),
        )
        # These standard fields are left out of the key map.
        keysToSkip = {"id", "centroid_x", "centroid_y", "hasCentroid"}
        key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames()
                   if fieldName not in keysToSkip}

        def addField(name):
            """Add the extra column ``name`` to the schema and return its key."""
            if dtype[name].kind == 'U':
                # dealing with a string like thing. Need to get type and size.
                at_size = dtype[name].itemsize
                return schema.addField(name, type=str, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for col in self.config.extra_col_names:
            key_map[col] = addField(col)
        return schema, key_map

    def _saveMasterSchema(self, filename):
        """Generate and save the master catalog schema.

        Parameters
        ----------
        filename : `str`
            An input file to read to get the input dtype.

        Returns
        -------
        schema : `lsst.afw.table.Schema`
            The schema built by ``makeSchema`` for the output catalogs.
        key_map : `dict`
            The key map built by ``makeSchema``.
        """
        arr = self.file_reader.run(filename)
        schema, key_map = self.makeSchema(arr.dtype)

        # Persist an empty catalog that carries only the schema and the
        # format-version metadata.
        catalog = afwTable.SimpleCatalog(schema)
        addRefCatMetadata(catalog)
        self._writeMasterSchema(catalog)
        return schema, key_map

    @abc.abstractmethod
    def _getOnePixelFilename(self, start):
        """Return one example filename to help construct the rest of the
        per-htm pixel filenames.

        Parameters
        ----------
        start : `int`
            The first HTM index in this HTM pixelization.

        Returns
        -------
        filename : `str`
            Path to a single file that would be written to the output location.
        """
        pass

    @abc.abstractmethod
    def _persistConfig(self):
        """Write the config that was used to generate the refcat.
        """
        pass

    @abc.abstractmethod
    def _writeMasterSchema(self, catalog):
        """Butler put the master catalog schema.

        Parameters
        ----------
        catalog : `lsst.afw.table.SimpleCatalog`
            An empty catalog with a fully-defined schema that matches the
            schema used in each of the HTM pixel files.
        """
        pass