Coverage for python/lsst/meas/algorithms/ingestIndexReferenceTask.py: 65%

148 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-06 01:41 -0700

1# This file is part of meas_algorithms. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22 

23# TODO DM-31698: post-gen2 removal notes 

24# `DatasetConfig`, `ConvertReferenceCatalogBase`, and `ConvertReferenceCatalogConfig` 

25# should all be moved to to `convertReferenceCatalog.py` once gen2 butler 

26# has been removed. 

27 

28__all__ = ["DatasetConfig", "ConvertReferenceCatalogBase", "ConvertReferenceCatalogConfig"] 

29 

30import abc 

31import os.path 

32 

33import astropy.units 

34 

35import lsst.pex.config as pexConfig 

36import lsst.pipe.base as pipeBase 

37import lsst.geom 

38import lsst.sphgeom 

39import lsst.afw.table as afwTable 

40from lsst.daf.base import PropertyList 

41from .indexerRegistry import IndexerRegistry 

42from .readTextCatalogTask import ReadTextCatalogTask 

43from .loadReferenceObjects import ReferenceObjectLoader 

44from . import convertRefcatManager 

45 

# The most recent Indexed Reference Catalog on-disk format version.
LATEST_FORMAT_VERSION = 1


def addRefCatMetadata(catalog):
    """Attach on-disk format-version metadata to a new (not yet populated)
    reference catalog.

    Parameters
    ----------
    catalog : `lsst.afw.table.SimpleCatalog`
        Catalog to which metadata should be attached.  Will be modified
        in-place.
    """
    metadata = catalog.getMetadata()
    if metadata is None:
        # No metadata attached yet: start from an empty PropertyList.
        metadata = PropertyList()
    metadata.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
    catalog.setMetadata(metadata)

64 

65 

class IngestReferenceRunner(pipeBase.TaskRunner):
    """Task runner for the reference catalog ingester (gen2 version).

    Data IDs are ignored so the runner should just run the task on the parsed command.
    """

    def run(self, parsedCmd):
        """Run the task on the parsed command line.

        Several arguments need to be collected to send on to the task methods.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command.

        Returns
        -------
        results : `lsst.pipe.base.Struct` or `None`
            An empty struct if ``self.doReturnResults``, else `None`.
        """
        # Construct the task directly (data IDs are irrelevant here) and
        # persist its config before doing any work.
        ingestTask = self.TaskClass(config=self.config, log=self.log, butler=parsedCmd.butler)
        ingestTask.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

        ingestTask.run(parsedCmd.files)
        if self.doReturnResults:
            return pipeBase.Struct()

95 

96 

class DatasetConfig(pexConfig.Config):
    """The description of the on-disk storage format for the persisted
    reference catalog.
    """
    format_version = pexConfig.Field(
        dtype=int,
        doc="Version number of the persisted on-disk storage format."
        "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
        "\nVersion 1 had nJy as flux units.",
        default=0  # This needs to always be 0, so that unversioned catalogs are interpreted as version 0.
    )
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        # TODO DM-31817: remove this default value.
        default='cal_ref_cat',
        doc="Name of this reference catalog to be used in the butler registry.",
    )
    indexer = IndexerRegistry.makeField(
        # Fixed typo: "algoritm" -> "algorithm" in the user-facing help text.
        default='HTM',
        doc='Name of indexer algorithm to use. Default is HTM',
    )

118 

119 

class ConvertReferenceCatalogConfig(pexConfig.Config):
    """Configuration for converting an external catalog into an indexed
    LSST reference catalog: column-name mappings, unit scalings, and the
    on-disk dataset/indexer description.
    """
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    n_processes = pexConfig.Field(
        dtype=int,
        doc=("Number of python processes to use when ingesting."),
        default=1
    )
    manager = pexConfig.ConfigurableField(
        target=convertRefcatManager.ConvertRefcatManager,
        doc="Multiprocessing manager to perform the actual conversion of values, file-by-file."
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files. Default is to expect text files.'
    )
    # Required position columns (validated in validate() below).
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column (values in decimal degrees)",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column (values in decimal degrees)",
    )
    # Optional coordinate errors; ra_err_name, dec_err_name, and
    # coord_err_unit must be supplied together or not at all.
    ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA error column",
        optional=True,
    )
    dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec error column",
        optional=True,
    )
    coord_err_unit = pexConfig.Field(
        dtype=str,
        doc="Unit of RA/Dec error fields (astropy.unit.Unit compatible)",
        optional=True
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information.  At least one entry is required."
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    # Optional per-object boolean flag columns.
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    # Optional proper-motion columns; pm_ra_name/pm_dec_name (and the error
    # pair) must be supplied together or not at all, and require epoch_name.
    pm_ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA column",
        optional=True,
    )
    pm_dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec column",
        optional=True,
    )
    pm_ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA error column",
        optional=True,
    )
    pm_dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec error column",
        optional=True,
    )
    pm_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
        default=1.0,
    )
    # Optional parallax columns; name and error must be supplied together,
    # and require epoch_name.
    parallax_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax column",
        optional=True,
    )
    parallax_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax error column",
        optional=True,
    )
    parallax_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
        default=1.0,
    )
    # Optional epoch columns; all three must be supplied together.
    epoch_name = pexConfig.Field(
        dtype=str,
        doc="Name of epoch column",
        optional=True,
    )
    epoch_format = pexConfig.Field(
        dtype=str,
        doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
        optional=True,
    )
    epoch_scale = pexConfig.Field(
        dtype=str,
        doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
        optional=True,
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def setDefaults(self):
        # Newly ingested reference catalogs always have the latest format_version.
        self.dataset_config.format_version = LATEST_FORMAT_VERSION
        # gen3 refcats are all depth=7
        self.dataset_config.indexer['HTM'].depth = 7

    def validate(self):
        pexConfig.Config.validate(self)

        def assertAllOrNone(*names):
            """Raise ValueError unless all the named fields are set or are
            all none (or blank)
            """
            # A field counts as "set" if it is truthy (non-None, non-blank).
            setNames = [name for name in names if bool(getattr(self, name))]
            if len(setNames) in (len(names), 0):
                return
            prefix = "Both or neither" if len(names) == 2 else "All or none"
            raise ValueError("{} of {} must be set, but only {} are set".format(
                prefix, ", ".join(names), ", ".join(setNames)))

        # Positions and at least one magnitude column are mandatory.
        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
        # If magnitude errors are given, they must cover exactly the
        # magnitude columns being ingested.
        if self.mag_err_column_map and set(self.mag_column_list) != set(self.mag_err_column_map.keys()):
            raise ValueError(
                "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
                    sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))
        assertAllOrNone("ra_err_name", "dec_err_name", "coord_err_unit")
        if self.coord_err_unit is not None:
            # parse_strict='silent' returns an UnrecognizedUnit instead of
            # raising, so we can produce a config-specific error message.
            result = astropy.units.Unit(self.coord_err_unit, parse_strict='silent')
            if isinstance(result, astropy.units.UnrecognizedUnit):
                msg = f"{self.coord_err_unit} is not a valid astropy unit string."
                raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg)

        assertAllOrNone("epoch_name", "epoch_format", "epoch_scale")
        assertAllOrNone("pm_ra_name", "pm_dec_name")
        assertAllOrNone("pm_ra_err_name", "pm_dec_err_name")
        assertAllOrNone("parallax_name", "parallax_err_name")
        # Proper-motion errors are meaningless without the proper motions
        # themselves.
        if self.pm_ra_err_name and not self.pm_ra_name:
            raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
        # Proper motion and parallax require an epoch to be applied at.
        if (self.pm_ra_name or self.parallax_name) and not self.epoch_name:
            raise ValueError(
                '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')

296 

297 

class ConvertReferenceCatalogBase(pipeBase.Task, abc.ABC):
    """Base class for producing and loading indexed reference catalogs,
    shared between gen2 and gen3.

    This implements an indexing scheme based on hierarchical triangular
    mesh (HTM). The term index really means breaking the catalog into
    localized chunks called shards. In this case each shard contains
    the entries from the catalog in a single HTM trixel

    For producing catalogs this task makes the following assumptions
    about the input catalogs:
    - RA, Dec are in decimal degrees.
    - Epoch is available in a column, in a format supported by astropy.time.Time.
    - There are no off-diagonal covariance terms, such as covariance
      between RA and Dec, or between PM RA and PM Dec. Support for such
      covariance would have to be added to to the config, including consideration
      of the units in the input catalog.
    """
    # This task manages its own worker processes (config.n_processes), so the
    # command-line framework must not parallelize it as well.
    canMultiprocess = False
    ConfigClass = ConvertReferenceCatalogConfig

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Instantiate the configured indexer (e.g. HTM at the configured depth).
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def run(self, inputFiles):
        """Index a set of files comprising a reference catalog.

        Outputs are persisted in the butler repository.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read.
        """
        self._preRun()
        # The schema is derived from the first input file; all input files
        # are assumed to share the same column layout.
        schema, key_map = self._saveMasterSchema(inputFiles[0])
        # create an HTM we can interrogate about pixel ids
        htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth())
        filenames = self._getButlerFilenames(htm)
        # The manager does the per-file conversion work, possibly in
        # multiple processes; it writes one shard file per HTM pixel.
        worker = self.config.manager.target(filenames,
                                            self.config,
                                            self.file_reader,
                                            self.indexer,
                                            schema,
                                            key_map,
                                            htm.universe()[0],
                                            addRefCatMetadata,
                                            self.log)
        result = worker.run(inputFiles)

        self._persistConfig()
        self._postRun(result)

    def _preRun(self):
        """Any setup that has to be performed at the start of ``run``, but that
        cannot be performed during ``__init__`` (e.g. making directories).

        Subclasses may override; the default is a no-op.
        """
        pass

    def _postRun(self, result):
        """Any tasks that have to happen at the end of ``run``.

        Subclasses may override; the default is a no-op.

        Parameters
        ----------
        result
            The result returned from ``worker.run()``.
        """
        pass

    def _getButlerFilenames(self, htm):
        """Get filenames from the butler for each output htm pixel.

        Parameters
        ----------
        htm : `lsst.sphgeom.HtmPixelization`
            The HTM pixelization scheme to be used to build filenames.

        Returns
        -------
        filenames : `dict` [`int`, `str`]
            Map of HTM pixel ID to the filename that pixel's shard will be
            written to.
        """
        filenames = {}
        start, end = htm.universe()[0]
        # path manipulation because butler.get() per pixel will take forever
        # One example path from the butler, then substitute each pixel ID
        # into the "%d" template derived from it.
        path = self._getOnePixelFilename(start)
        base = os.path.join(os.path.dirname(path), "%d"+os.path.splitext(path)[1])
        for pixelId in range(start, end):
            filenames[pixelId] = base % pixelId

        return filenames

    def makeSchema(self, dtype):
        """Make the schema to use in constructing the persisted catalogs.

        Parameters
        ----------
        dtype : `numpy.dtype`
            Data type describing each entry in ``config.extra_col_names``
            for the catalogs being ingested.

        Returns
        -------
        schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
            A tuple containing two items:
            - The schema for the output source catalog.
            - A map of catalog keys to use in filling the record
        """
        # make a schema with the standard fields
        schema = ReferenceObjectLoader.makeMinimalSchema(
            filterNameList=self.config.mag_column_list,
            addCentroid=False,
            addIsPhotometric=bool(self.config.is_photometric_name),
            addIsResolved=bool(self.config.is_resolved_name),
            addIsVariable=bool(self.config.is_variable_name),
            coordErrDim=2 if bool(self.config.ra_err_name) else 0,
            addProperMotion=2 if bool(self.config.pm_ra_name) else 0,
            properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0,
            addParallax=bool(self.config.parallax_name),
        )
        # These schema fields are excluded from the key map handed to the
        # conversion worker; they are not filled directly from input columns.
        keysToSkip = set(("id", "centroid_x", "centroid_y", "hasCentroid"))
        key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames()
                   if fieldName not in keysToSkip}

        def addField(name):
            # Add one extra input column to the schema, preserving its dtype.
            if dtype[name].kind == 'U':
                # dealing with a string like thing. Need to get type and size.
                at_size = dtype[name].itemsize
                return schema.addField(name, type=str, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for col in self.config.extra_col_names:
            key_map[col] = addField(col)
        return schema, key_map

    def _saveMasterSchema(self, filename):
        """Generate and save the master catalog schema.

        Parameters
        ----------
        filename : `str`
            An input file to read to get the input dtype.

        Returns
        -------
        schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
            The same (schema, key map) pair returned by ``makeSchema``.
        """
        arr = self.file_reader.run(filename)
        schema, key_map = self.makeSchema(arr.dtype)

        # Persist an empty, metadata-tagged catalog so readers can discover
        # the schema and format version without loading any shard.
        catalog = afwTable.SimpleCatalog(schema)
        addRefCatMetadata(catalog)
        self._writeMasterSchema(catalog)
        return schema, key_map

    @abc.abstractmethod
    def _getOnePixelFilename(self, start):
        """Return one example filename to help construct the rest of the
        per-htm pixel filenames.

        Parameters
        ----------
        start : `int`
            The first HTM index in this HTM pixelization.

        Returns
        -------
        filename : `str`
            Path to a single file that would be written to the output location.
        """
        pass

    @abc.abstractmethod
    def _persistConfig(self):
        """Write the config that was used to generate the refcat.
        """
        pass

    @abc.abstractmethod
    def _writeMasterSchema(self, catalog):
        """Butler put the master catalog schema.

        Parameters
        ----------
        catalog : `lsst.afw.table.SimpleCatalog`
            An empty catalog with a fully-defined schema that matches the
            schema used in each of the HTM pixel files.
        """
        pass