Coverage for python/lsst/meas/algorithms/ingestIndexManager.py : 52%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of meas_algorithms.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22__all__ = ["IngestIndexManager", "IngestGaiaManager"]
from ctypes import c_int
import itertools
import multiprocessing
import os.path

import astropy.time
import astropy.units as u
import numpy as np

import lsst.afw.table as afwTable
import lsst.geom
import lsst.sphgeom
from lsst.afw.image import fluxErrFromABMagErr
38# global shared counter to keep track of source ids
39# (multiprocess sharing is most easily done with a global)
40COUNTER = multiprocessing.Value(c_int, 0)
41# global shared counter to keep track of number of files processed.
42FILE_PROGRESS = multiprocessing.Value(c_int, 0)
class IngestIndexManager:
    """
    Ingest a reference catalog from external files into a butler repository,
    using a multiprocessing Pool to speed up the work.

    Parameters
    ----------
    filenames : `dict` [`int`, `str`]
        The HTM pixel id and filenames to ingest the catalog into.
    config : `lsst.meas.algorithms.IngestIndexedReferenceConfig`
        The Task configuration holding the field names.
    file_reader : `lsst.pipe.base.Task`
        The file reader to use to load the files.
    indexer : `lsst.meas.algorithms.HtmIndexer`
        The class used to compute the HTM pixel per coordinate.
    schema : `lsst.afw.table.Schema`
        The schema of the output catalog.
    key_map : `dict` [`str`, `lsst.afw.table.Key`]
        The mapping from output field names to keys in the Schema.
    htmRange : `tuple` [`int`]
        The start and end HTM pixel ids.
    addRefCatMetadata : callable
        A function called to add extra metadata to each output Catalog.
    log : `lsst.log.Log`
        The log to send messages to.
    """
    # Names of optional boolean flag fields copied to the output catalog
    # by ``_setFlags`` when present in the output schema.
    _flags = ['photometric', 'resolved', 'variable']

    def __init__(self, filenames, config, file_reader, indexer,
                 schema, key_map, htmRange, addRefCatMetadata, log):
        self.filenames = filenames
        self.config = config
        self.file_reader = file_reader
        self.indexer = indexer
        self.schema = schema
        self.key_map = key_map
        self.htmRange = htmRange
        self.addRefCatMetadata = addRefCatMetadata
        self.log = log
        if self.config.coord_err_unit is not None:
            # cache this to speed up coordinate conversions
            self.coord_err_unit = u.Unit(self.config.coord_err_unit)

    def run(self, inputFiles):
        """Index a set of input files from a reference catalog, and write the
        output to the appropriate filenames, in parallel.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read data from.
        """
        global COUNTER, FILE_PROGRESS
        self.nInputFiles = len(inputFiles)

        with multiprocessing.Manager() as manager:
            COUNTER.value = 0
            FILE_PROGRESS.value = 0
            fileLocks = manager.dict()
            self.log.info("Creating %s file locks.", self.htmRange[1] - self.htmRange[0])
            for i in range(self.htmRange[0], self.htmRange[1]):
                fileLocks[i] = manager.Lock()
            self.log.info("File locks created.")
            with multiprocessing.Pool(self.config.n_processes) as pool:
                pool.starmap(self._ingestOneFile, zip(inputFiles, itertools.repeat(fileLocks)))

    def _ingestOneFile(self, filename, fileLocks):
        """Read and process one file, and write its records to the correct
        indexed files, while handling exceptions in a useful way so that they
        don't get swallowed by the multiprocess pool.

        Parameters
        ----------
        filename : `str`
            The file to process.
        fileLocks : `dict` [`int`, `multiprocessing.Lock`]
            A Lock for each HTM pixel; each pixel gets one file written, and
            we need to block when one process is accessing that file.
        """
        global FILE_PROGRESS
        inputData = self.file_reader.run(filename)
        fluxes = self._getFluxes(inputData)
        coordErr = self._getCoordErr(inputData)
        matchedPixels = self.indexer.indexPoints(inputData[self.config.ra_name],
                                                 inputData[self.config.dec_name])
        pixel_ids = set(matchedPixels)
        for pixelId in pixel_ids:
            with fileLocks[pixelId]:
                self._doOnePixel(inputData, matchedPixels, pixelId, fluxes, coordErr)
        with FILE_PROGRESS.get_lock():
            oldPercent = 100 * FILE_PROGRESS.value / self.nInputFiles
            FILE_PROGRESS.value += 1
            percent = 100 * FILE_PROGRESS.value / self.nInputFiles
            # only log each "new percent"
            if np.floor(percent) - np.floor(oldPercent) >= 1:
                self.log.info("Completed %d / %d files: %d %% complete ",
                              FILE_PROGRESS.value,
                              self.nInputFiles,
                              percent)

    def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes, coordErr):
        """Process one HTM pixel, appending to an existing catalog or creating
        a new catalog, as needed.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The data from one input file.
        matchedPixels : `numpy.ndarray`
            The row-matched pixel indexes corresponding to ``inputData``.
        pixelId : `int`
            The pixel index we are currently processing.
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr, coord_decErr, and
            coord_ra_dec_Cov fields in the output catalog (in radians).
        """
        idx = np.where(matchedPixels == pixelId)[0]
        catalog = self.getCatalog(pixelId, self.schema, len(idx))
        for outputRow, inputRow in zip(catalog[-len(idx):], inputData[idx]):
            self._fillRecord(outputRow, inputRow)

        global COUNTER
        with COUNTER.get_lock():
            self._setIds(inputData[idx], catalog)

        # set fluxes from the pre-computed array
        for name, array in fluxes.items():
            catalog[self.key_map[name]][-len(idx):] = array[idx]

        # set coordinate errors from the pre-computed array
        for name, array in coordErr.items():
            catalog[name][-len(idx):] = array[idx]

        catalog.writeFits(self.filenames[pixelId])

    def _setIds(self, inputData, catalog):
        """Fill the `id` field of catalog with a running index, filling the
        last values up to the length of ``inputData``.

        Fill with `self.config.id_name` if specified, otherwise use the
        global running counter value.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data that is being processed.
        catalog : `lsst.afw.table.SimpleCatalog`
            The output catalog to fill the ids.
        """
        global COUNTER
        size = len(inputData)
        if self.config.id_name:
            catalog['id'][-size:] = inputData[self.config.id_name]
        else:
            idEnd = COUNTER.value + size
            catalog['id'][-size:] = np.arange(COUNTER.value, idEnd)
            COUNTER.value = idEnd

    def getCatalog(self, pixelId, schema, nNewElements):
        """Get a catalog from disk or create it if it doesn't exist.

        Parameters
        ----------
        pixelId : `int`
            HTM pixel identifier of the catalog to retrieve.
        schema : `lsst.afw.table.Schema`
            Schema to use in catalog creation if it does not exist.
        nNewElements : `int`
            The number of new elements that will be added to the catalog,
            so space can be preallocated.

        Returns
        -------
        catalog : `lsst.afw.table.SimpleCatalog`
            The new or read-and-resized catalog specified by ``pixelId``.
        """
        # This is safe, because we lock on this file before getCatalog is called.
        if os.path.isfile(self.filenames[pixelId]):
            catalog = afwTable.SimpleCatalog.readFits(self.filenames[pixelId])
            catalog.resize(len(catalog) + nNewElements)
            return catalog.copy(deep=True)  # ensure contiguity, so that column-assignment works
        catalog = afwTable.SimpleCatalog(schema)
        catalog.resize(nNewElements)
        self.addRefCatMetadata(catalog)
        return catalog

    @staticmethod
    def computeCoord(row, ra_name, dec_name):
        """Create an ICRS coord. from a row of a catalog being ingested.

        Parameters
        ----------
        row : `numpy.ndarray`
            Row from catalog being ingested.
        ra_name : `str`
            Name of RA key in catalog being ingested.
        dec_name : `str`
            Name of Dec key in catalog being ingested.

        Returns
        -------
        coord : `lsst.geom.SpherePoint`
            ICRS coordinate.
        """
        return lsst.geom.SpherePoint(row[ra_name], row[dec_name], lsst.geom.degrees)

    def _getCoordErr(self, inputData):
        """Compute the ra/dec error fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute coordinate errors for.

        Returns
        -------
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr, coord_decErr, fields
            in the output catalog (in radians).

        Notes
        -----
        This does not currently handle the ra/dec covariance field,
        ``coord_ra_dec_Cov``. That field may require extra work, as its units
        may be more complicated in external catalogs.
        """
        result = {}
        # coord_err_unit is only cached in __init__ when the config sets a
        # coordinate error unit; skip the conversion entirely otherwise.
        if hasattr(self, "coord_err_unit"):
            result['coord_raErr'] = u.Quantity(inputData[self.config.ra_err_name],
                                               self.coord_err_unit).to_value(u.radian)
            result['coord_decErr'] = u.Quantity(inputData[self.config.dec_err_name],
                                                self.coord_err_unit).to_value(u.radian)
        return result

    def _setFlags(self, record, row):
        """Set flags in an output record.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : `numpy.ndarray`
            Row from catalog being ingested.
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(self.key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _getFluxes(self, inputData):
        """Compute the flux fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        result = {}
        for item in self.config.mag_column_list:
            result[item+'_flux'] = (inputData[item]*u.ABmag).to_value(u.nJy)
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                # TODO: multiply by 1e9 here until we have a replacement (see DM-16903)
                # NOTE: copy the arrays because the numpy strides may not be useable by C++.
                fluxErr = fluxErrFromABMagErr(inputData[error_col_name].copy(),
                                              inputData[err_key].copy())*1e9
                result[err_key+'_fluxErr'] = fluxErr
        return result

    def _setProperMotion(self, record, row):
        """Set proper motion fields in a record of an indexed catalog.

        The proper motions are read from the specified columns,
        scaled appropriately, and installed in the appropriate
        columns of the output.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        if self.config.pm_ra_name is None:  # IngestIndexedReferenceConfig.validate ensures all or none
            return
        radPerOriginal = np.radians(self.config.pm_scale)/(3600*1000)
        record.set(self.key_map["pm_ra"], row[self.config.pm_ra_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["pm_dec"], row[self.config.pm_dec_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["epoch"], self._epochToMjdTai(row[self.config.epoch_name]))
        if self.config.pm_ra_err_name is not None:  # pm_dec_err_name also, by validation
            record.set(self.key_map["pm_raErr"], row[self.config.pm_ra_err_name]*radPerOriginal)
            record.set(self.key_map["pm_decErr"], row[self.config.pm_dec_err_name]*radPerOriginal)

    def _setParallax(self, record, row):
        """Set the parallax fields in a record of a refcat.
        """
        if self.config.parallax_name is None:
            return
        scale = self.config.parallax_scale*lsst.geom.milliarcseconds
        record.set(self.key_map['parallax'], row[self.config.parallax_name]*scale)
        record.set(self.key_map['parallaxErr'], row[self.config.parallax_err_name]*scale)

    def _epochToMjdTai(self, nativeEpoch):
        """Convert an epoch in native format to TAI MJD (a float).
        """
        return astropy.time.Time(nativeEpoch, format=self.config.epoch_format,
                                 scale=self.config.epoch_scale).tai.mjd

    def _setExtra(self, record, row):
        """Set extra data fields in a record of an indexed catalog.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string like entires,
            # numpy stores this as its own internal type, a numpy.str_
            # object. This seems to be a consequence of how numpy stores
            # string like objects in fixed column arrays. This checks
            # if any of the values to be added to the catalog are numpy
            # string types, and if they are, casts them to a python string
            # which is what the python c++ records expect
            if isinstance(value, np.str_):
                value = str(value)
            record.set(self.key_map[extra_col], value)

    def _fillRecord(self, record, row):
        """Fill a record in an indexed catalog to be persisted.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        record.setCoord(self.computeCoord(row, self.config.ra_name, self.config.dec_name))

        self._setFlags(record, row)
        self._setProperMotion(record, row)
        self._setParallax(record, row)
        self._setExtra(record, row)
class IngestGaiaManager(IngestIndexManager):
    """Special-case ingest manager to deal with Gaia fluxes.
    """
    def _getFluxes(self, inputData):
        """Compute the flux fields that will go into the output catalog.

        Gaia provides instrumental fluxes (electron/second) plus per-band
        zeropoints instead of AB magnitudes, so this overrides the base
        class implementation.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        result = {}

        def gaiaFluxToFlux(flux, zeroPoint):
            """Equations 5.19 and 5.30 from the Gaia calibration document define the
            conversion from Gaia electron/second fluxes to AB magnitudes.
            https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            """
            result = ((zeroPoint + -2.5 * np.log10(flux))*u.ABmag).to_value(u.nJy)
            # set 0 instrumental fluxes to 0 (instead of NaN/inf from the math)
            result[flux == 0] = 0
            return result

        # Some fluxes are 0, so log10(flux) can give warnings. We handle the
        # zeros explicitly, so the warnings are irrelevant.
        with np.errstate(invalid='ignore', divide='ignore'):
            # The constants below come from table 5.3 in this document:
            # https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            result['phot_g_mean_flux'] = gaiaFluxToFlux(inputData['phot_g_mean_flux'], 25.7934)
            result['phot_bp_mean_flux'] = gaiaFluxToFlux(inputData['phot_bp_mean_flux'], 25.3806)
            result['phot_rp_mean_flux'] = gaiaFluxToFlux(inputData['phot_rp_mean_flux'], 25.1161)

        result['phot_g_mean_fluxErr'] = result['phot_g_mean_flux'] / inputData['phot_g_mean_flux_over_error']
        result['phot_bp_mean_fluxErr'] = result['phot_bp_mean_flux'] / inputData['phot_bp_mean_flux_over_error']
        result['phot_rp_mean_fluxErr'] = result['phot_rp_mean_flux'] / inputData['phot_rp_mean_flux_over_error']

        return result