Coverage for python/lsst/meas/algorithms/convertReferenceCatalog.py: 32%

171 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-09 03:22 -0800

1# This file is part of meas_algorithms. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22""" 

23Convert an external reference catalog into the hierarchical triangular mesh 

24(HTM) sharded LSST-style format, to be ingested into the butler. 

25""" 

26 

27__all__ = ["ConvertReferenceCatalogTask", "ConvertReferenceCatalogConfig", "DatasetConfig"] 

28 

29import argparse 

30import glob 

31import os 

32import pathlib 

33import logging 

34 

35import astropy 

36 

37import lsst.afw.table 

38import lsst.pipe.base 

39import lsst.pex.config as pexConfig 

40from lsst.daf.base import PropertyList 

41 

42from .indexerRegistry import IndexerRegistry 

43from .readTextCatalogTask import ReadTextCatalogTask 

44from . import convertRefcatManager 

45from . import ReferenceObjectLoader 

46 

47# The most recent Indexed Reference Catalog on-disk format version. 

48LATEST_FORMAT_VERSION = 1 

49 

50 

51def addRefCatMetadata(catalog): 

52 """Add metadata to a new (not yet populated) reference catalog. 

53 

54 Parameters 

55 ---------- 

56 catalog : `lsst.afw.table.SimpleCatalog` 

57 Catalog to which metadata should be attached. Will be modified 

58 in-place. 

59 """ 

60 md = catalog.getMetadata() 

61 if md is None: 

62 md = PropertyList() 

63 md.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION) 

64 catalog.setMetadata(md) 

65 

66 

67class DatasetConfig(pexConfig.Config): 

68 """Description of the on-disk storage format for the converted reference 

69 catalog. 

70 """ 

71 format_version = pexConfig.Field( 

72 dtype=int, 

73 doc="Version number of the persisted on-disk storage format." 

74 "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)." 

75 "\nVersion 1 had nJy as flux units.", 

76 default=0 # This needs to always be 0, so that unversioned catalogs are interpreted as version 0. 

77 ) 

78 ref_dataset_name = pexConfig.Field( 

79 dtype=str, 

80 doc="Name of this reference catalog; this should match the name used during butler ingest.", 

81 ) 

82 indexer = IndexerRegistry.makeField( 

83 default='HTM', 

84 doc='Name of indexer algoritm to use. Default is HTM', 

85 ) 

86 

87 

88class ConvertReferenceCatalogConfig(pexConfig.Config): 

89 dataset_config = pexConfig.ConfigField( 

90 dtype=DatasetConfig, 

91 doc="Configuration for reading the ingested data", 

92 ) 

93 n_processes = pexConfig.Field( 

94 dtype=int, 

95 doc=("Number of python processes to use when ingesting."), 

96 default=1 

97 ) 

98 manager = pexConfig.ConfigurableField( 

99 target=convertRefcatManager.ConvertRefcatManager, 

100 doc="Multiprocessing manager to perform the actual conversion of values, file-by-file." 

101 ) 

102 file_reader = pexConfig.ConfigurableField( 

103 target=ReadTextCatalogTask, 

104 doc='Task to use to read the files. Default is to expect text files.' 

105 ) 

106 ra_name = pexConfig.Field( 

107 dtype=str, 

108 doc="Name of RA column (values in decimal degrees)", 

109 ) 

110 dec_name = pexConfig.Field( 

111 dtype=str, 

112 doc="Name of Dec column (values in decimal degrees)", 

113 ) 

114 ra_err_name = pexConfig.Field( 

115 dtype=str, 

116 doc="Name of RA error column", 

117 optional=True, 

118 ) 

119 dec_err_name = pexConfig.Field( 

120 dtype=str, 

121 doc="Name of Dec error column", 

122 optional=True, 

123 ) 

124 coord_err_unit = pexConfig.Field( 

125 dtype=str, 

126 doc="Unit of RA/Dec error fields (astropy.unit.Unit compatible)", 

127 optional=True 

128 ) 

129 mag_column_list = pexConfig.ListField( 

130 dtype=str, 

131 doc="The values in the reference catalog are assumed to be in AB magnitudes. " 

132 "List of column names to use for photometric information. At least one entry is required." 

133 ) 

134 mag_err_column_map = pexConfig.DictField( 

135 keytype=str, 

136 itemtype=str, 

137 default={}, 

138 doc="A map of magnitude column name (key) to magnitude error column (value)." 

139 ) 

140 is_photometric_name = pexConfig.Field( 

141 dtype=str, 

142 optional=True, 

143 doc='Name of column stating if satisfactory for photometric calibration (optional).' 

144 ) 

145 is_resolved_name = pexConfig.Field( 

146 dtype=str, 

147 optional=True, 

148 doc='Name of column stating if the object is resolved (optional).' 

149 ) 

150 is_variable_name = pexConfig.Field( 

151 dtype=str, 

152 optional=True, 

153 doc='Name of column stating if the object is measured to be variable (optional).' 

154 ) 

155 id_name = pexConfig.Field( 

156 dtype=str, 

157 optional=True, 

158 doc='Name of column to use as an identifier (optional).' 

159 ) 

160 pm_ra_name = pexConfig.Field( 

161 dtype=str, 

162 doc="Name of proper motion RA column", 

163 optional=True, 

164 ) 

165 pm_dec_name = pexConfig.Field( 

166 dtype=str, 

167 doc="Name of proper motion Dec column", 

168 optional=True, 

169 ) 

170 pm_ra_err_name = pexConfig.Field( 

171 dtype=str, 

172 doc="Name of proper motion RA error column", 

173 optional=True, 

174 ) 

175 pm_dec_err_name = pexConfig.Field( 

176 dtype=str, 

177 doc="Name of proper motion Dec error column", 

178 optional=True, 

179 ) 

180 pm_scale = pexConfig.Field( 

181 dtype=float, 

182 doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year", 

183 default=1.0, 

184 ) 

185 parallax_name = pexConfig.Field( 

186 dtype=str, 

187 doc="Name of parallax column", 

188 optional=True, 

189 ) 

190 parallax_err_name = pexConfig.Field( 

191 dtype=str, 

192 doc="Name of parallax error column", 

193 optional=True, 

194 ) 

195 parallax_scale = pexConfig.Field( 

196 dtype=float, 

197 doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec", 

198 default=1.0, 

199 ) 

200 epoch_name = pexConfig.Field( 

201 dtype=str, 

202 doc="Name of epoch column", 

203 optional=True, 

204 ) 

205 epoch_format = pexConfig.Field( 

206 dtype=str, 

207 doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'", 

208 optional=True, 

209 ) 

210 epoch_scale = pexConfig.Field( 

211 dtype=str, 

212 doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'", 

213 optional=True, 

214 ) 

215 extra_col_names = pexConfig.ListField( 

216 dtype=str, 

217 default=[], 

218 doc='Extra columns to add to the reference catalog.' 

219 ) 

220 

221 def setDefaults(self): 

222 # Newly ingested reference catalogs always have the latest format_version. 

223 self.dataset_config.format_version = LATEST_FORMAT_VERSION 

224 # gen3 refcats are all depth=7 

225 self.dataset_config.indexer['HTM'].depth = 7 

226 

227 def validate(self): 

228 super().validate() 

229 

230 def assertAllOrNone(*names): 

231 """Raise ValueError unless all the named fields are set or are 

232 all none (or blank). 

233 """ 

234 setNames = [name for name in names if bool(getattr(self, name))] 

235 if len(setNames) in (len(names), 0): 

236 return 

237 prefix = "Both or neither" if len(names) == 2 else "All or none" 

238 raise ValueError("{} of {} must be set, but only {} are set".format( 

239 prefix, ", ".join(names), ", ".join(setNames))) 

240 

241 if not (self.ra_name and self.dec_name and self.mag_column_list): 

242 raise ValueError( 

243 "ra_name and dec_name and at least one entry in mag_column_list must be supplied.") 

244 if self.mag_err_column_map and set(self.mag_column_list) != set(self.mag_err_column_map.keys()): 

245 raise ValueError( 

246 "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format( 

247 sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list))) 

248 assertAllOrNone("ra_err_name", "dec_err_name", "coord_err_unit") 

249 if self.coord_err_unit is not None: 

250 result = astropy.units.Unit(self.coord_err_unit, parse_strict='silent') 

251 if isinstance(result, astropy.units.UnrecognizedUnit): 

252 msg = f"{self.coord_err_unit} is not a valid astropy unit string." 

253 raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg) 

254 

255 assertAllOrNone("epoch_name", "epoch_format", "epoch_scale") 

256 assertAllOrNone("pm_ra_name", "pm_dec_name") 

257 assertAllOrNone("pm_ra_err_name", "pm_dec_err_name") 

258 assertAllOrNone("parallax_name", "parallax_err_name") 

259 if self.pm_ra_err_name and not self.pm_ra_name: 

260 raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified') 

261 if (self.pm_ra_name or self.parallax_name) and not self.epoch_name: 

262 raise ValueError( 

263 '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified') 

264 

265 

266class ConvertReferenceCatalogTask(lsst.pipe.base.Task): 

267 """Class for producing HTM-indexed reference catalogs from external 

268 catalog data. 

269 

270 This implements an indexing scheme based on hierarchical triangular 

271 mesh (HTM). The term index really means breaking the catalog into 

272 localized chunks called shards. In this case each shard contains 

273 the entries from the catalog in a single HTM trixel 

274 

275 For producing catalogs this task makes the following assumptions 

276 about the input catalogs: 

277 

278 - RA, Dec are in decimal degrees. 

279 - Epoch is available in a column, in a format supported by astropy.time.Time. 

280 - There are no off-diagonal covariance terms, such as covariance 

281 between RA and Dec, or between PM RA and PM Dec. Support for such 

282 covariance would have to be added to to the config, including consideration 

283 of the units in the input catalog. 

284 

285 Parameters 

286 ---------- 

287 output_dir : `str` 

288 The path to write the output files to, in a subdirectory defined by 

289 ``DatasetConfig.ref_dataset_name``. 

290 """ 

291 canMultiprocess = False 

292 ConfigClass = ConvertReferenceCatalogConfig 

293 _DefaultName = 'ConvertReferenceCatalogTask' 

294 

295 def __init__(self, *, output_dir=None, **kwargs): 

296 super().__init__(**kwargs) 

297 if output_dir is None: 

298 raise RuntimeError("Must specify output_dir.") 

299 self.base_dir = output_dir 

300 self.output_dir = os.path.join(output_dir, self.config.dataset_config.ref_dataset_name) 

301 self.ingest_table_file = os.path.join(self.base_dir, "filename_to_htm.ecsv") 

302 self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name]( 

303 self.config.dataset_config.indexer.active) 

304 self.makeSubtask('file_reader') 

305 

306 def run(self, inputFiles): 

307 """Index a set of files comprising a reference catalog. 

308 

309 Outputs are persisted in the butler repository. 

310 

311 Parameters 

312 ---------- 

313 inputFiles : `list` 

314 A list of file paths to read. 

315 """ 

316 # Create the output path, if it doesn't exist; fail if the path exists: 

317 # we don't want to accidentally append to existing files. 

318 pathlib.Path(self.output_dir).mkdir(exist_ok=False) 

319 

320 schema, key_map = self._writeMasterSchema(inputFiles[0]) 

321 # create an HTM we can interrogate about pixel ids 

322 htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth()) 

323 filenames = self._getOutputFilenames(htm) 

324 worker = self.config.manager.target(filenames, 

325 self.config, 

326 self.file_reader, 

327 self.indexer, 

328 schema, 

329 key_map, 

330 htm.universe()[0], 

331 addRefCatMetadata, 

332 self.log) 

333 result = worker.run(inputFiles) 

334 

335 self._writeConfig() 

336 self._writeIngestHelperFile(result) 

337 

338 def _writeIngestHelperFile(self, result): 

339 """Write the astropy table containing the htm->filename relationship, 

340 used for the ``butler ingest-files`` command after this task completes. 

341 """ 

342 dimension = f"htm{self.config.dataset_config.indexer.active.depth}" 

343 table = astropy.table.Table(names=("filename", dimension), dtype=('str', 'int')) 

344 for key in result: 

345 table.add_row((result[key], key)) 

346 table.write(self.ingest_table_file) 

347 

348 def _writeConfig(self): 

349 """Write the config that was used to generate the refcat.""" 

350 filename = os.path.join(self.output_dir, "config.py") 

351 with open(filename, 'w') as file: 

352 self.config.dataset_config.saveToStream(file) 

353 

354 def _getOutputFilenames(self, htm): 

355 """Get filenames from the butler for each output htm pixel. 

356 

357 Parameters 

358 ---------- 

359 htm : `lsst.sphgeom.HtmPixelization` 

360 The HTM pixelization scheme to be used to build filenames. 

361 

362 Returns 

363 ------- 

364 filenames : `list [str]` 

365 List of filenames to write each HTM pixel to. 

366 """ 

367 filenames = {} 

368 start, end = htm.universe()[0] 

369 path = os.path.join(self.output_dir, f"{self.indexer.htm}.fits") 

370 base = os.path.join(os.path.dirname(path), "%d"+os.path.splitext(path)[1]) 

371 for pixelId in range(start, end): 

372 filenames[pixelId] = base % pixelId 

373 

374 return filenames 

375 

376 def makeSchema(self, dtype): 

377 """Make the schema to use in constructing the persisted catalogs. 

378 

379 Parameters 

380 ---------- 

381 dtype : `numpy.dtype` 

382 Data type describing each entry in ``config.extra_col_names`` 

383 for the catalogs being ingested. 

384 

385 Returns 

386 ------- 

387 schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`) 

388 A tuple containing two items: 

389 - The schema for the output source catalog. 

390 - A map of catalog keys to use in filling the record 

391 """ 

392 # make a schema with the standard fields 

393 schema = ReferenceObjectLoader.makeMinimalSchema( 

394 filterNameList=self.config.mag_column_list, 

395 addCentroid=False, 

396 addIsPhotometric=bool(self.config.is_photometric_name), 

397 addIsResolved=bool(self.config.is_resolved_name), 

398 addIsVariable=bool(self.config.is_variable_name), 

399 coordErrDim=2 if bool(self.config.ra_err_name) else 0, 

400 addProperMotion=2 if bool(self.config.pm_ra_name) else 0, 

401 properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0, 

402 addParallax=bool(self.config.parallax_name), 

403 ) 

404 keysToSkip = set(("id", "centroid_x", "centroid_y", "hasCentroid")) 

405 key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames() 

406 if fieldName not in keysToSkip} 

407 

408 def addField(name): 

409 if dtype[name].kind == 'U': 

410 # dealing with a string like thing. Need to get type and size. 

411 at_size = dtype[name].itemsize 

412 return schema.addField(name, type=str, size=at_size) 

413 else: 

414 at_type = dtype[name].type 

415 return schema.addField(name, at_type) 

416 

417 for col in self.config.extra_col_names: 

418 key_map[col] = addField(col) 

419 return schema, key_map 

420 

421 def _writeMasterSchema(self, inputfile): 

422 """Generate and save the master catalog schema. 

423 

424 Parameters 

425 ---------- 

426 inputfile : `str` 

427 An input file to read to get the input dtype. 

428 """ 

429 arr = self.file_reader.run(inputfile) 

430 schema, key_map = self.makeSchema(arr.dtype) 

431 

432 catalog = lsst.afw.table.SimpleCatalog(schema) 

433 addRefCatMetadata(catalog) 

434 outputfile = os.path.join(self.output_dir, "master_schema.fits") 

435 catalog.writeFits(outputfile) 

436 return schema, key_map 

437 

438 def _reduce_kwargs(self): 

439 # Need to be able to pickle this class to use the multiprocess manager. 

440 kwargs = super()._reduce_kwargs() 

441 kwargs['output_dir'] = self.base_dir 

442 return kwargs 

443 

444 

445def build_argparser(): 

446 """Construct an argument parser for the ``convertReferenceCatalog`` script. 

447 

448 Returns 

449 ------- 

450 argparser : `argparse.ArgumentParser` 

451 The argument parser that defines the ``convertReferenceCatalog`` 

452 command-line interface. 

453 """ 

454 parser = argparse.ArgumentParser( 

455 description=__doc__, 

456 formatter_class=argparse.RawDescriptionHelpFormatter, 

457 epilog='More information is available at https://pipelines.lsst.io.' 

458 ) 

459 parser.add_argument("outputDir", 

460 help="Path to write the output shard files, configs, and `ingest-files` table to.") 

461 parser.add_argument("configFile", 

462 help="File containing the ConvertReferenceCatalogConfig fields.") 

463 # Use a "+"-list here, so we can produce a more useful error if the user 

464 # uses an unquoted glob that gets shell expanded. 

465 parser.add_argument("fileglob", nargs="+", 

466 help="Quoted glob for the files to be read in and converted." 

467 " Example (note required quotes to prevent shell expansion):" 

468 ' "gaia_source/csv/GaiaSource*"') 

469 return parser 

470 

471 

472def run_convert(outputDir, configFile, fileglob): 

473 """Run `ConvertReferenceCatalogTask` on the input arguments. 

474 

475 Parameters 

476 ---------- 

477 outputDir : `str` 

478 Path to write the output files to. 

479 configFile : `str` 

480 File specifying the ``ConvertReferenceCatalogConfig`` fields. 

481 fileglob : `str` 

482 Quoted glob for the files to be read in and converted. 

483 """ 

484 # We have to initialize the logger manually when running from the commandline. 

485 logging.basicConfig(level=logging.INFO, format="{name} {levelname}: {message}", style="{") 

486 

487 config = ConvertReferenceCatalogTask.ConfigClass() 

488 config.load(configFile) 

489 converter = ConvertReferenceCatalogTask(output_dir=outputDir, config=config) 

490 files = glob.glob(fileglob) 

491 converter.run(files) 

492 with open(os.path.join(outputDir, "convertReferenceCatalogConfig.py"), "w") as outfile: 

493 converter.config.saveToStream(outfile) 

494 msg = ("Completed refcat conversion.\n\n" 

495 "Ingest the resulting files with the following commands, substituting the path\n" 

496 "to your butler repo for `REPO`, and the ticket number you are tracking this\n" 

497 "ingest on for `DM-NNNNN`:\n" 

498 f"\n butler register-dataset-type REPO {config.dataset_config.ref_dataset_name} " 

499 "SimpleCatalog htm7" 

500 "\n butler ingest-files -t direct REPO gaia_dr2 refcats/DM-NNNNN " 

501 f"{converter.ingest_table_file}" 

502 "\n butler collection-chain REPO --mode extend refcats refcats/DM-NNNNN") 

503 print(msg) 

504 

505 

506def main(): 

507 args = build_argparser().parse_args() 

508 if len(args.fileglob) > 1: 

509 raise RuntimeError("Final argument must be a quoted file glob, not a shell-expanded list of files.") 

510 # Fileglob comes out as a length=1 list, so we can test it above. 

511 run_convert(args.outputDir, args.configFile, args.fileglob[0])