Coverage for python/lsst/meas/algorithms/ingestIndexReferenceTask.py: 42%

139 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-08-20 02:04 -0700

1# This file is part of meas_algorithms. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22 

23# TODO DM-31698: post-gen2 removal notes 

24# `DatasetConfig`, `ConvertReferenceCatalogBase`, and `ConvertReferenceCatalogConfig` 

25# should all be moved to `convertReferenceCatalog.py` once gen2 butler 

26# has been removed. 

27 

28__all__ = ["DatasetConfig", "ConvertReferenceCatalogBase", "ConvertReferenceCatalogConfig"] 

29 

30import abc 

31import os.path 

32 

33import astropy.units 

34 

35import lsst.pex.config as pexConfig 

36import lsst.pipe.base as pipeBase 

37import lsst.geom 

38import lsst.sphgeom 

39import lsst.afw.table as afwTable 

40from lsst.daf.base import PropertyList 

41from .indexerRegistry import IndexerRegistry 

42from .readTextCatalogTask import ReadTextCatalogTask 

43from .loadReferenceObjects import ReferenceObjectLoader 

44from . import convertRefcatManager 

45 

46# The most recent Indexed Reference Catalog on-disk format version. 
# Stored under the "REFCAT_FORMAT_VERSION" metadata key by ``addRefCatMetadata``
# and used as the default ``format_version`` for newly converted catalogs
# (version 1 corresponds to nJy flux units, per ``DatasetConfig.format_version``).

47LATEST_FORMAT_VERSION = 1 

48 

49 

def addRefCatMetadata(catalog):
    """Stamp a new (not yet populated) reference catalog with its format
    version.

    The catalog's existing metadata object is reused when present;
    otherwise a fresh `lsst.daf.base.PropertyList` is created. In both
    cases the ``REFCAT_FORMAT_VERSION`` key is set to
    ``LATEST_FORMAT_VERSION`` and the metadata is (re)attached.

    Parameters
    ----------
    catalog : `lsst.afw.table.SimpleCatalog`
        Catalog to which metadata should be attached. Will be modified
        in-place.
    """
    existing = catalog.getMetadata()
    # Only an absent (None) metadata object is replaced; an existing one,
    # even if empty, is kept and updated.
    metadata = PropertyList() if existing is None else existing
    metadata.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
    catalog.setMetadata(metadata)

64 

65 

class DatasetConfig(pexConfig.Config):
    """The description of the on-disk storage format for the persisted
    reference catalog.
    """
    # Format version of the persisted catalog; determines the flux units
    # (see the field doc below).
    format_version = pexConfig.Field(
        dtype=int,
        doc="Version number of the persisted on-disk storage format."
            "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
            "\nVersion 1 had nJy as flux units.",
        default=0  # This needs to always be 0, so that unversioned catalogs are interpreted as version 0.
    )
    # Name under which the reference catalog is known to the butler.
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        # TODO DM-31817: remove this default value.
        default='cal_ref_cat',
        doc="Name of this reference catalog to be used in the butler registry.",
    )
    # Registry-backed choice of the spatial indexer used to shard the catalog.
    indexer = IndexerRegistry.makeField(
        default='HTM',
        doc='Name of indexer algorithm to use. Default is HTM',
    )

87 

88 

class ConvertReferenceCatalogConfig(pexConfig.Config):
    """Configuration for `ConvertReferenceCatalogBase`.

    Names the columns of the input catalog files that hold coordinates,
    magnitudes, proper motions, parallaxes, epochs and flags, and selects
    the file reader and conversion manager used to process the files.
    `validate` enforces the consistency rules between these fields.
    """
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    n_processes = pexConfig.Field(
        dtype=int,
        doc=("Number of python processes to use when ingesting."),
        default=1
    )
    manager = pexConfig.ConfigurableField(
        target=convertRefcatManager.ConvertRefcatManager,
        doc="Multiprocessing manager to perform the actual conversion of values, file-by-file."
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files. Default is to expect text files.'
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column (values in decimal degrees)",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column (values in decimal degrees)",
    )
    ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA error column",
        optional=True,
    )
    dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec error column",
        optional=True,
    )
    coord_err_unit = pexConfig.Field(
        dtype=str,
        doc="Unit of RA/Dec error fields (astropy.units.Unit compatible)",
        optional=True
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information. At least one entry is required."
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    pm_ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA column",
        optional=True,
    )
    pm_dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec column",
        optional=True,
    )
    pm_ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA error column",
        optional=True,
    )
    pm_dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec error column",
        optional=True,
    )
    pm_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
        default=1.0,
    )
    parallax_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax column",
        optional=True,
    )
    parallax_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax error column",
        optional=True,
    )
    parallax_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
        default=1.0,
    )
    epoch_name = pexConfig.Field(
        dtype=str,
        doc="Name of epoch column",
        optional=True,
    )
    epoch_format = pexConfig.Field(
        dtype=str,
        doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
        optional=True,
    )
    epoch_scale = pexConfig.Field(
        dtype=str,
        doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
        optional=True,
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def setDefaults(self):
        """Override the `DatasetConfig` defaults for newly converted catalogs.
        """
        # Newly ingested reference catalogs always have the latest format_version.
        self.dataset_config.format_version = LATEST_FORMAT_VERSION
        # gen3 refcats are all depth=7
        self.dataset_config.indexer['HTM'].depth = 7

    def validate(self):
        """Check that the configured column names are mutually consistent.

        Raises
        ------
        ValueError
            Raised if a required field is missing, if the keys of
            ``mag_err_column_map`` do not match ``mag_column_list``, or if
            a group of fields that must be set together is only partially
            set.
        lsst.pex.config.FieldValidationError
            Raised if ``coord_err_unit`` is not a unit string recognized by
            `astropy.units.Unit`.
        """
        pexConfig.Config.validate(self)

        def assertAllOrNone(*names):
            """Raise ValueError unless all the named fields are set or are
            all none (or blank)
            """
            # Truthiness is used on purpose: both None and "" count as unset.
            setNames = [name for name in names if bool(getattr(self, name))]
            if len(setNames) in (len(names), 0):
                return
            prefix = "Both or neither" if len(names) == 2 else "All or none"
            raise ValueError("{} of {} must be set, but only {} are set".format(
                prefix, ", ".join(names), ", ".join(setNames)))

        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
        # An empty error map is allowed; a non-empty one must cover exactly
        # the configured magnitude columns.
        if self.mag_err_column_map and set(self.mag_column_list) != set(self.mag_err_column_map.keys()):
            raise ValueError(
                "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
                    sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))
        assertAllOrNone("ra_err_name", "dec_err_name", "coord_err_unit")
        if self.coord_err_unit is not None:
            # parse_strict='silent' makes astropy return an UnrecognizedUnit
            # instead of raising, so the failure can be reported as a config
            # validation error tied to this field.
            result = astropy.units.Unit(self.coord_err_unit, parse_strict='silent')
            if isinstance(result, astropy.units.UnrecognizedUnit):
                msg = f"{self.coord_err_unit} is not a valid astropy unit string."
                raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg)

        assertAllOrNone("epoch_name", "epoch_format", "epoch_scale")
        assertAllOrNone("pm_ra_name", "pm_dec_name")
        assertAllOrNone("pm_ra_err_name", "pm_dec_err_name")
        assertAllOrNone("parallax_name", "parallax_err_name")
        if self.pm_ra_err_name and not self.pm_ra_name:
            raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
        if (self.pm_ra_name or self.parallax_name) and not self.epoch_name:
            raise ValueError(
                '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')

265 

266 

class ConvertReferenceCatalogBase(pipeBase.Task, abc.ABC):
    """Base class for producing and loading indexed reference catalogs,
    shared between gen2 and gen3.

    This implements an indexing scheme based on hierarchical triangular
    mesh (HTM). The term index really means breaking the catalog into
    localized chunks called shards. In this case each shard contains
    the entries from the catalog in a single HTM trixel

    For producing catalogs this task makes the following assumptions
    about the input catalogs:

    - RA, Dec are in decimal degrees.
    - Epoch is available in a column, in a format supported by astropy.time.Time.
    - There are no off-diagonal covariance terms, such as covariance
      between RA and Dec, or between PM RA and PM Dec. Support for such
      covariance would have to be added to the config, including consideration
      of the units in the input catalog.

    Subclasses must implement `_getOnePixelFilename`, `_persistConfig` and
    `_writeMasterSchema`; `_preRun` and `_postRun` are optional hooks.
    """
    canMultiprocess = False
    ConfigClass = ConvertReferenceCatalogConfig

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Instantiate the indexer selected in the dataset config, using that
        # indexer's active configuration.
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def run(self, inputFiles):
        """Index a set of files comprising a reference catalog.

        Outputs are persisted in the butler repository.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read. The first file is read up front
            to derive the output schema, so the list must not be empty.
        """
        self._preRun()
        schema, key_map = self._saveMasterSchema(inputFiles[0])
        # create an HTM we can interrogate about pixel ids
        htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth())
        filenames = self._getButlerFilenames(htm)
        worker = self.config.manager.target(filenames,
                                            self.config,
                                            self.file_reader,
                                            self.indexer,
                                            schema,
                                            key_map,
                                            htm.universe()[0],
                                            addRefCatMetadata,
                                            self.log)
        result = worker.run(inputFiles)

        self._persistConfig()
        self._postRun(result)

    def _preRun(self):
        """Any setup that has to be performed at the start of ``run``, but that
        cannot be performed during ``__init__`` (e.g. making directories).
        """
        pass

    def _postRun(self, result):
        """Any tasks that have to happen at the end of ``run``.

        Parameters
        ----------
        result
            The result returned from ``worker.run()``.
        """
        pass

    def _getButlerFilenames(self, htm):
        """Get filenames from the butler for each output htm pixel.

        Parameters
        ----------
        htm : `lsst.sphgeom.HtmPixelization`
            The HTM pixelization scheme to be used to build filenames.

        Returns
        -------
        filenames : `dict` [`int`, `str`]
            Map of HTM pixel id to the filename that pixel is written to.
        """
        start, end = htm.universe()[0]
        # path manipulation because butler.get() per pixel will take forever
        path = self._getOnePixelFilename(start)
        directory = os.path.dirname(path)
        extension = os.path.splitext(path)[1]
        # Only the pixel id is formatted; the directory and extension are
        # joined verbatim so that a literal "%" in the output path cannot be
        # misread as a format directive.
        return {pixelId: os.path.join(directory, f"{pixelId}{extension}")
                for pixelId in range(start, end)}

    def makeSchema(self, dtype):
        """Make the schema to use in constructing the persisted catalogs.

        Parameters
        ----------
        dtype : `numpy.dtype`
            Data type describing each entry in ``config.extra_col_names``
            for the catalogs being ingested.

        Returns
        -------
        schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
            A tuple containing two items:
            - The schema for the output source catalog.
            - A map of catalog keys to use in filling the record
        """
        # make a schema with the standard fields
        schema = ReferenceObjectLoader.makeMinimalSchema(
            filterNameList=self.config.mag_column_list,
            addCentroid=False,
            addIsPhotometric=bool(self.config.is_photometric_name),
            addIsResolved=bool(self.config.is_resolved_name),
            addIsVariable=bool(self.config.is_variable_name),
            coordErrDim=2 if bool(self.config.ra_err_name) else 0,
            addProperMotion=2 if bool(self.config.pm_ra_name) else 0,
            properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0,
            addParallax=bool(self.config.parallax_name),
        )
        # These standard fields are deliberately left out of the key map.
        keysToSkip = {"id", "centroid_x", "centroid_y", "hasCentroid"}
        key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames()
                   if fieldName not in keysToSkip}

        def addField(name):
            """Add the extra column ``name`` to the schema and return its key."""
            if dtype[name].kind == 'U':
                # dealing with a string like thing. Need to get type and size.
                at_size = dtype[name].itemsize
                return schema.addField(name, type=str, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for col in self.config.extra_col_names:
            key_map[col] = addField(col)
        return schema, key_map

    def _saveMasterSchema(self, filename):
        """Generate and save the master catalog schema.

        Parameters
        ----------
        filename : `str`
            An input file to read to get the input dtype.

        Returns
        -------
        schema : `lsst.afw.table.Schema`
            The schema built by `makeSchema` from the file's dtype.
        key_map : `dict`
            The field-name to key map built by `makeSchema`.
        """
        arr = self.file_reader.run(filename)
        schema, key_map = self.makeSchema(arr.dtype)

        # An empty catalog carrying only the schema and format-version
        # metadata is what gets persisted as the master schema.
        catalog = afwTable.SimpleCatalog(schema)
        addRefCatMetadata(catalog)
        self._writeMasterSchema(catalog)
        return schema, key_map

    @abc.abstractmethod
    def _getOnePixelFilename(self, start):
        """Return one example filename to help construct the rest of the
        per-htm pixel filenames.

        Parameters
        ----------
        start : `int`
            The first HTM index in this HTM pixelization.

        Returns
        -------
        filename : `str`
            Path to a single file that would be written to the output location.
        """
        pass

    @abc.abstractmethod
    def _persistConfig(self):
        """Write the config that was used to generate the refcat.
        """
        pass

    @abc.abstractmethod
    def _writeMasterSchema(self, catalog):
        """Butler put the master catalog schema.

        Parameters
        ----------
        catalog : `lsst.afw.table.SimpleCatalog`
            An empty catalog with a fully-defined schema that matches the
            schema used in each of the HTM pixel files.
        """
        pass