# This file is part of meas_algorithms.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

__all__ = ["IngestIndexManager"]

import os.path
import itertools
import multiprocessing

import astropy.time
import astropy.units as u
import numpy as np

import lsst.geom
import lsst.sphgeom
import lsst.afw.table as afwTable
from lsst.afw.image import fluxErrFromABMagErr


# global shared counter to keep track of source ids
# (multiprocess sharing is most easily done with a global)
COUNTER = 0
# global shared counter to keep track of number of files processed.
FILE_PROGRESS = 0
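# NOTE: run() rebinds both of these to multiprocessing.Value instances,
# so that worker processes can update them under a shared lock.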


class IngestIndexManager:
    """
    Ingest a reference catalog from external files into a butler repository,
    using a multiprocessing Pool to speed up the work.

    Parameters
    ----------
    filenames : `dict` [`int`, `str`]
        The mapping from HTM pixel id to the output filename for that pixel.
    config : `lsst.meas.algorithms.IngestIndexedReferenceConfig`
        The Task configuration holding the field names.
    file_reader : `lsst.pipe.base.Task`
        The file reader to use to load the files.
    indexer : `lsst.meas.algorithms.HtmIndexer`
        The class used to compute the HTM pixel per coordinate.
    schema : `lsst.afw.table.Schema`
        The schema of the output catalog.
    key_map : `dict` [`str`, `lsst.afw.table.Key`]
        The mapping from output field names to keys in the Schema.
    htmRange : `tuple` [`int`]
        The start and end HTM pixel ids.
    addRefCatMetadata : callable
        A function called to add extra metadata to each output Catalog.
    log : `lsst.log.Log`
        The log to send messages to.
    """
    _flags = ['photometric', 'resolved', 'variable']

    def __init__(self, filenames, config, file_reader, indexer,
                 schema, key_map, htmRange, addRefCatMetadata, log):
        self.filenames = filenames
        self.config = config
        self.file_reader = file_reader
        self.indexer = indexer
        self.schema = schema
        self.key_map = key_map
        self.htmRange = htmRange
        self.addRefCatMetadata = addRefCatMetadata
        self.log = log

    def run(self, inputFiles):
        """Index a set of input files from a reference catalog, and write the
        output to the appropriate filenames, in parallel.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read data from.
        """
        global COUNTER, FILE_PROGRESS
        self.nInputFiles = len(inputFiles)

        with multiprocessing.Manager() as manager:
            COUNTER = multiprocessing.Value('i', 0)
            FILE_PROGRESS = multiprocessing.Value('i', 0)
            fileLocks = manager.dict()
            self.log.info("Creating %s file locks.", self.htmRange[1] - self.htmRange[0])
            for i in range(self.htmRange[0], self.htmRange[1]):
                fileLocks[i] = manager.Lock()
            self.log.info("File locks created.")
            with multiprocessing.Pool(self.config.n_processes) as pool:
                pool.starmap(self._ingestOneFile, zip(inputFiles, itertools.repeat(fileLocks)))

    def _ingestOneFile(self, filename, fileLocks):
        """Read and process one file, and write its records to the correct
        indexed files, while handling exceptions in a useful way so that they
        don't get swallowed by the multiprocess pool.

        Parameters
        ----------
        filename : `str`
            The file to process.
        fileLocks : `dict` [`int`, `multiprocessing.Lock`]
            A Lock for each HTM pixel; each pixel gets one file written, and
            we need to block when one process is accessing that file.
        """
        global FILE_PROGRESS
        inputData = self.file_reader.run(filename)
        fluxes = self._getFluxes(inputData)
        matchedPixels = self.indexer.indexPoints(inputData[self.config.ra_name],
                                                 inputData[self.config.dec_name])
        pixel_ids = set(matchedPixels)
        for pixelId in pixel_ids:
            with fileLocks[pixelId]:
                self._doOnePixel(inputData, matchedPixels, pixelId, fluxes)
        with FILE_PROGRESS.get_lock():
            oldPercent = 100 * FILE_PROGRESS.value / self.nInputFiles
            FILE_PROGRESS.value += 1
            percent = 100 * FILE_PROGRESS.value / self.nInputFiles
            # only log each "new percent"
            if np.floor(percent) - np.floor(oldPercent) >= 1:
                self.log.info("Completed %d / %d files: %d %% complete",
                              FILE_PROGRESS.value,
                              self.nInputFiles,
                              percent)

    def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes):
        """Process one HTM pixel, appending to an existing catalog or creating
        a new catalog, as needed.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The data from one input file.
        matchedPixels : `numpy.ndarray`
            The row-matched pixel indexes corresponding to ``inputData``.
        pixelId : `int`
            The pixel index we are currently processing.
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        idx = np.where(matchedPixels == pixelId)[0]
        catalog = self.getCatalog(pixelId, self.schema, len(idx))
        for outputRow, inputRow in zip(catalog[-len(idx):], inputData[idx]):
            self._fillRecord(outputRow, inputRow)

        global COUNTER
        with COUNTER.get_lock():
            self._setIds(inputData[idx], catalog)

        for name, array in fluxes.items():
            catalog[self.key_map[name]][-len(idx):] = array[idx]
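
        # The caller holds this pixel's file lock, so it is safe to
        # overwrite the file that getCatalog read above.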
        catalog.writeFits(self.filenames[pixelId])

    def _setIds(self, inputData, catalog):
        """Fill the ``id`` field of ``catalog`` with a running index, filling
        the last values up to the length of ``inputData``.

        Fill with ``self.config.id_name`` if specified, otherwise use the
        global running counter value.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data that is being processed.
        catalog : `lsst.afw.table.SimpleCatalog`
            The output catalog in which to fill the ids.
        """
        global COUNTER
        size = len(inputData)
        if self.config.id_name:
            catalog['id'][-size:] = inputData[self.config.id_name]
        else:
            idEnd = COUNTER.value + size
            catalog['id'][-size:] = np.arange(COUNTER.value, idEnd)
            COUNTER.value = idEnd

    def getCatalog(self, pixelId, schema, nNewElements):
        """Get a catalog from disk or create it if it doesn't exist.

        Parameters
        ----------
        pixelId : `int`
            HTM pixel identifier of the catalog to retrieve.
        schema : `lsst.afw.table.Schema`
            Schema to use when creating the catalog, if it does not exist.
        nNewElements : `int`
            The number of new elements that will be added to the catalog,
            so space can be preallocated.

        Returns
        -------
        catalog : `lsst.afw.table.SimpleCatalog`
            The new or read-and-resized catalog specified by ``pixelId``.
        """
        # This is safe, because we lock on this file before getCatalog is called.
        if os.path.isfile(self.filenames[pixelId]):
            catalog = afwTable.SimpleCatalog.readFits(self.filenames[pixelId])
            catalog.resize(len(catalog) + nNewElements)
            return catalog.copy(deep=True)  # ensure contiguity, so that column-assignment works
        catalog = afwTable.SimpleCatalog(schema)
        catalog.resize(nNewElements)
        self.addRefCatMetadata(catalog)
        return catalog

    @staticmethod
    def computeCoord(row, ra_name, dec_name):
        """Create an ICRS coord. from a row of a catalog being ingested.

        Parameters
        ----------
        row : `numpy.ndarray`
            Row from catalog being ingested.
        ra_name : `str`
            Name of RA key in catalog being ingested.
        dec_name : `str`
            Name of Dec key in catalog being ingested.

        Returns
        -------
        coord : `lsst.geom.SpherePoint`
            ICRS coordinate.
        """
        return lsst.geom.SpherePoint(row[ra_name], row[dec_name], lsst.geom.degrees)

    def _setCoordErr(self, record, row):
        """Set coordinate error in a record of an indexed catalog.

        The errors are read from the specified columns, and installed
        in the appropriate columns of the output.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : `numpy.ndarray`
            Row from catalog being ingested.
        """
        if self.config.ra_err_name:  # IngestIndexedReferenceConfig.validate ensures all or none
            record.set(self.key_map["coord_raErr"], np.radians(row[self.config.ra_err_name]))
            record.set(self.key_map["coord_decErr"], np.radians(row[self.config.dec_err_name]))

    def _setFlags(self, record, row):
        """Set flags in an output record.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : `numpy.ndarray`
            Row from catalog being ingested.
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(self.key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _getFluxes(self, inputData):
        """Compute the flux fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        result = {}
        for item in self.config.mag_column_list:
            result[item+'_flux'] = (inputData[item]*u.ABmag).to_value(u.nJy)
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                # TODO: multiply by 1e9 here until we have a replacement (see DM-16903)
                # NOTE: copy the arrays because the numpy strides may not be useable by C++.
                fluxErr = fluxErrFromABMagErr(inputData[error_col_name].copy(),
                                              inputData[err_key].copy())*1e9
                result[err_key+'_fluxErr'] = fluxErr
        return result

    def _setProperMotion(self, record, row):
        """Set proper motion fields in a record of an indexed catalog.

        The proper motions are read from the specified columns,
        scaled appropriately, and installed in the appropriate
        columns of the output.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        if self.config.pm_ra_name is None:  # IngestIndexedReferenceConfig.validate ensures all or none
            return
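        # config.pm_scale presumably converts the catalog's native proper
        # motion units to mas/yr; radPerOriginal is then radians per native
        # unit, since 1 mas = np.radians(1)/(3600*1000).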
        radPerOriginal = np.radians(self.config.pm_scale)/(3600*1000)
        record.set(self.key_map["pm_ra"], row[self.config.pm_ra_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["pm_dec"], row[self.config.pm_dec_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["epoch"], self._epochToMjdTai(row[self.config.epoch_name]))
        if self.config.pm_ra_err_name is not None:  # pm_dec_err_name also, by validation
            record.set(self.key_map["pm_raErr"], row[self.config.pm_ra_err_name]*radPerOriginal)
            record.set(self.key_map["pm_decErr"], row[self.config.pm_dec_err_name]*radPerOriginal)

    def _setParallax(self, record, row):
        """Set the parallax fields in a record of a refcat.
        """
        if self.config.parallax_name is None:
            return
        scale = self.config.parallax_scale*lsst.geom.milliarcseconds
        record.set(self.key_map['parallax'], row[self.config.parallax_name]*scale)
        record.set(self.key_map['parallaxErr'], row[self.config.parallax_err_name]*scale)

    def _epochToMjdTai(self, nativeEpoch):
        """Convert an epoch in native format to TAI MJD (a float).
        """
        return astropy.time.Time(nativeEpoch, format=self.config.epoch_format,
                                 scale=self.config.epoch_scale).tai.mjd

    def _setExtra(self, record, row):
        """Set extra data fields in a record of an indexed catalog.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string-like entries,
            # numpy stores them as its own internal type, a numpy.str_
            # object, a consequence of how numpy stores string-like objects
            # in fixed-width column arrays. This checks whether any of the
            # values to be added to the catalog are numpy string types, and
            # if so, casts them to a Python str, which is what the wrapped
            # C++ records expect.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(self.key_map[extra_col], value)

    def _fillRecord(self, record, row):
        """Fill a record in an indexed catalog to be persisted.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        record.setCoord(self.computeCoord(row, self.config.ra_name, self.config.dec_name))

        self._setCoordErr(record, row)
        self._setFlags(record, row)
        self._setProperMotion(record, row)
        self._setParallax(record, row)
        self._setExtra(record, row)
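

# A minimal usage sketch (the variable names here are hypothetical; in
# practice the ingestion Task whose config class appears above constructs
# all of these arguments and drives the manager):
#
#     manager = IngestIndexManager(filenames, config, file_reader, indexer,
#                                  schema, key_map, htmRange,
#                                  addRefCatMetadata, log)
#     manager.run(inputFiles)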