python/lsst/meas/algorithms/ingestIndexManager.py
# This file is part of meas_algorithms.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

__all__ = ["IngestIndexManager", "IngestGaiaManager"]

import os.path
import itertools
import multiprocessing

import astropy.time
import astropy.units as u
import numpy as np

import lsst.geom
import lsst.sphgeom
import lsst.afw.table as afwTable
from lsst.afw.image import fluxErrFromABMagErr

# global shared counter to keep track of source ids
# (multiprocess sharing is most easily done with a global)
COUNTER = 0
# global shared counter to keep track of number of files processed.
FILE_PROGRESS = 0


class IngestIndexManager:
    """
    Ingest a reference catalog from external files into a butler repository,
    using a multiprocessing Pool to speed up the work.

    Parameters
    ----------
    filenames : `dict` [`int`, `str`]
        A mapping from HTM pixel id to the filename that pixel's catalog
        will be ingested into.
    config : `lsst.meas.algorithms.IngestIndexedReferenceConfig`
        The Task configuration holding the field names.
    file_reader : `lsst.pipe.base.Task`
        The file reader to use to load the files.
    indexer : `lsst.meas.algorithms.HtmIndexer`
        The class used to compute the HTM pixel per coordinate.
    schema : `lsst.afw.table.Schema`
        The schema of the output catalog.
    key_map : `dict` [`str`, `lsst.afw.table.Key`]
        The mapping from output field names to keys in the Schema.
    htmRange : `tuple` [`int`]
        The start and end HTM pixel ids.
    addRefCatMetadata : callable
        A function called to add extra metadata to each output Catalog.
    log : `lsst.log.Log`
        The log to send messages to.
    """
    _flags = ['photometric', 'resolved', 'variable']

    def __init__(self, filenames, config, file_reader, indexer,
                 schema, key_map, htmRange, addRefCatMetadata, log):
        self.filenames = filenames
        self.config = config
        self.file_reader = file_reader
        self.indexer = indexer
        self.schema = schema
        self.key_map = key_map
        self.htmRange = htmRange
        self.addRefCatMetadata = addRefCatMetadata
        self.log = log
        # cache this to speed up coordinate conversions
        self.coord_err_unit = u.Unit(self.config.coord_err_unit)
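
    # A minimal usage sketch (hypothetical variable values; in practice the
    # ingest driver task constructs the config, schema, indexer, and file
    # reader, and owns the output filenames):
    #
    #     manager = IngestIndexManager(filenames, config, file_reader,
    #                                  indexer, schema, key_map,
    #                                  htmRange=(firstPixel, lastPixel),
    #                                  addRefCatMetadata=addRefCatMetadata,
    #                                  log=log)
    #     manager.run(inputFiles)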

    def run(self, inputFiles):
        """Index a set of input files from a reference catalog, and write the
        output to the appropriate filenames, in parallel.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read data from.
        """
        global COUNTER, FILE_PROGRESS
        self.nInputFiles = len(inputFiles)

        with multiprocessing.Manager() as manager:
            COUNTER = multiprocessing.Value('i', 0)
            FILE_PROGRESS = multiprocessing.Value('i', 0)
            fileLocks = manager.dict()
            self.log.info("Creating %s file locks.", self.htmRange[1] - self.htmRange[0])
            for i in range(self.htmRange[0], self.htmRange[1]):
                fileLocks[i] = manager.Lock()
            self.log.info("File locks created.")
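            # itertools.repeat pairs the same shared lock dict with every
            # input file, so starmap expands each (filename, fileLocks)
            # tuple into a call to self._ingestOneFile(filename, fileLocks).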
            with multiprocessing.Pool(self.config.n_processes) as pool:
                pool.starmap(self._ingestOneFile, zip(inputFiles, itertools.repeat(fileLocks)))

    def _ingestOneFile(self, filename, fileLocks):
        """Read and process one file, and write its records to the correct
        indexed files, while handling exceptions in a useful way so that they
        don't get swallowed by the multiprocess pool.

        Parameters
        ----------
        filename : `str`
            The file to process.
        fileLocks : `dict` [`int`, `multiprocessing.Lock`]
            A Lock for each HTM pixel; each pixel gets one file written, and
            we need to block when one process is accessing that file.
        """
        global FILE_PROGRESS
        inputData = self.file_reader.run(filename)
        fluxes = self._getFluxes(inputData)
        coordErr = self._getCoordErr(inputData)
        matchedPixels = self.indexer.indexPoints(inputData[self.config.ra_name],
                                                 inputData[self.config.dec_name])
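        # matchedPixels[i] is the HTM pixel id of input row i; group the rows
        # by pixel so that each output file is locked and written only once.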
        pixel_ids = set(matchedPixels)
        for pixelId in pixel_ids:
            with fileLocks[pixelId]:
                self._doOnePixel(inputData, matchedPixels, pixelId, fluxes, coordErr)
        with FILE_PROGRESS.get_lock():
            oldPercent = 100 * FILE_PROGRESS.value / self.nInputFiles
            FILE_PROGRESS.value += 1
            percent = 100 * FILE_PROGRESS.value / self.nInputFiles
            # only log each "new percent"
            if np.floor(percent) - np.floor(oldPercent) >= 1:
                self.log.info("Completed %d / %d files: %d %% complete ",
                              FILE_PROGRESS.value,
                              self.nInputFiles,
                              percent)

    def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes, coordErr):
        """Process one HTM pixel, appending to an existing catalog or creating
        a new catalog, as needed.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The data from one input file.
        matchedPixels : `numpy.ndarray`
            The row-matched pixel indexes corresponding to ``inputData``.
        pixelId : `int`
            The pixel index we are currently processing.
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr, coord_decErr, and
            coord_ra_dec_Cov fields in the output catalog (in radians).
        """
        idx = np.where(matchedPixels == pixelId)[0]
        catalog = self.getCatalog(pixelId, self.schema, len(idx))
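        # getCatalog resized the catalog to hold len(idx) new (empty) records
        # at the end, so only that tail needs to be filled.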
        for outputRow, inputRow in zip(catalog[-len(idx):], inputData[idx]):
            self._fillRecord(outputRow, inputRow)

        global COUNTER
        with COUNTER.get_lock():
            self._setIds(inputData[idx], catalog)

        # set fluxes from the pre-computed array
        for name, array in fluxes.items():
            catalog[self.key_map[name]][-len(idx):] = array[idx]

        # set coordinate errors from the pre-computed array
        for name, array in coordErr.items():
            catalog[name][-len(idx):] = array[idx]

        catalog.writeFits(self.filenames[pixelId])

    def _setIds(self, inputData, catalog):
        """Fill the ``id`` field of the last ``len(inputData)`` records of
        ``catalog`` with a running index.

        Use the ``self.config.id_name`` column if specified, otherwise draw
        values from the global running counter.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data that is being processed.
        catalog : `lsst.afw.table.SimpleCatalog`
            The output catalog to fill the ids.
        """
        global COUNTER
        size = len(inputData)
        if self.config.id_name:
            catalog['id'][-size:] = inputData[self.config.id_name]
        else:
            idEnd = COUNTER.value + size
            catalog['id'][-size:] = np.arange(COUNTER.value, idEnd)
            COUNTER.value = idEnd
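            # e.g., if COUNTER.value was 100 and size is 3, the new records
            # get ids 100, 101, 102 and COUNTER.value becomes 103.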

    def getCatalog(self, pixelId, schema, nNewElements):
        """Get a catalog from disk or create it if it doesn't exist.

        Parameters
        ----------
        pixelId : `int`
            Identifier of the catalog to retrieve.
        schema : `lsst.afw.table.Schema`
            Schema to use in catalog creation if the catalog does not exist.
        nNewElements : `int`
            The number of new elements that will be added to the catalog,
            so space can be preallocated.

        Returns
        -------
        catalog : `lsst.afw.table.SimpleCatalog`
            The new or read-and-resized catalog specified by ``pixelId``.
        """
        # This is safe, because we lock on this file before getCatalog is called.
        if os.path.isfile(self.filenames[pixelId]):
            catalog = afwTable.SimpleCatalog.readFits(self.filenames[pixelId])
            catalog.resize(len(catalog) + nNewElements)
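            # resize can leave the appended records in a separate,
            # non-contiguous block; the deep copy below compacts the catalog
            # so that column assignment works.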
            return catalog.copy(deep=True)  # ensure contiguity, so that column-assignment works
        catalog = afwTable.SimpleCatalog(schema)
        catalog.resize(nNewElements)
        self.addRefCatMetadata(catalog)
        return catalog

    @staticmethod
    def computeCoord(row, ra_name, dec_name):
        """Create an ICRS coordinate from a row of a catalog being ingested.

        Parameters
        ----------
        row : `numpy.ndarray`
            Row from catalog being ingested.
        ra_name : `str`
            Name of RA key in catalog being ingested.
        dec_name : `str`
            Name of Dec key in catalog being ingested.

        Returns
        -------
        coord : `lsst.geom.SpherePoint`
            ICRS coordinate.
        """
        return lsst.geom.SpherePoint(row[ra_name], row[dec_name], lsst.geom.degrees)

    def _getCoordErr(self, inputData):
        """Compute the ra/dec error fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute coordinate errors for.

        Returns
        -------
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr and coord_decErr
            fields in the output catalog (in radians).

        Notes
        -----
        This does not currently handle the ra/dec covariance field,
        ``coord_ra_dec_Cov``. That field may require extra work, as its units
        may be more complicated in external catalogs.
        """
        result = {}
        result['coord_raErr'] = u.Quantity(inputData[self.config.ra_err_name],
                                           self.coord_err_unit).to_value(u.radian)
        result['coord_decErr'] = u.Quantity(inputData[self.config.dec_err_name],
                                            self.coord_err_unit).to_value(u.radian)
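        # e.g., with coord_err_unit = "milliarcsecond", an input error of
        # 10 mas becomes u.Quantity(10, u.mas).to_value(u.radian),
        # approximately 4.85e-8 rad.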
        return result

    def _setFlags(self, record, row):
        """Set flags in an output record.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : `numpy.ndarray`
            Row from catalog being ingested.
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(self.key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _getFluxes(self, inputData):
        """Compute the flux fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        result = {}
        for item in self.config.mag_column_list:
            result[item+'_flux'] = (inputData[item]*u.ABmag).to_value(u.nJy)
        if len(self.config.mag_err_column_map) > 0:
            for err_key, error_col_name in self.config.mag_err_column_map.items():
                # TODO: multiply by 1e9 here until we have a replacement (see DM-16903)
                # NOTE: copy the arrays because the numpy strides may not be useable by C++.
                fluxErr = fluxErrFromABMagErr(inputData[error_col_name].copy(),
                                              inputData[err_key].copy())*1e9
                result[err_key+'_fluxErr'] = fluxErr
        return result

    def _setProperMotion(self, record, row):
        """Set proper motion fields in a record of an indexed catalog.

        The proper motions are read from the specified columns,
        scaled appropriately, and installed in the appropriate
        columns of the output.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        if self.config.pm_ra_name is None:  # IngestIndexedReferenceConfig.validate ensures all or none
            return
        radPerOriginal = np.radians(self.config.pm_scale)/(3600*1000)
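        # e.g. with pm_scale = 1 the input is in mas/yr, and
        # np.radians(1)/(3600*1000) is about 4.848e-9 rad per mas.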
        record.set(self.key_map["pm_ra"], row[self.config.pm_ra_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["pm_dec"], row[self.config.pm_dec_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["epoch"], self._epochToMjdTai(row[self.config.epoch_name]))
        if self.config.pm_ra_err_name is not None:  # pm_dec_err_name also, by validation
            record.set(self.key_map["pm_raErr"], row[self.config.pm_ra_err_name]*radPerOriginal)
            record.set(self.key_map["pm_decErr"], row[self.config.pm_dec_err_name]*radPerOriginal)

    def _setParallax(self, record, row):
        """Set the parallax fields in a record of a refcat.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        if self.config.parallax_name is None:
            return
        scale = self.config.parallax_scale*lsst.geom.milliarcseconds
        record.set(self.key_map['parallax'], row[self.config.parallax_name]*scale)
        record.set(self.key_map['parallaxErr'], row[self.config.parallax_err_name]*scale)

    def _epochToMjdTai(self, nativeEpoch):
        """Convert an epoch in native format to TAI MJD (a float).
        """
        return astropy.time.Time(nativeEpoch, format=self.config.epoch_format,
                                 scale=self.config.epoch_scale).tai.mjd
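    # e.g. astropy.time.Time(2015.5, format="jyear", scale="tai").tai.mjd
    # returns 57205.875; other formats/scales convert analogously.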

    def _setExtra(self, record, row):
        """Set extra data fields in a record of an indexed catalog.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string-like entries,
            # numpy stores them as its own internal type, a numpy.str_
            # object, a consequence of how numpy stores string-like objects
            # in fixed-column arrays. This checks whether any of the values
            # to be added to the catalog are numpy string types, and if so,
            # casts them to a Python str, which is what the C++ records
            # expect.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(self.key_map[extra_col], value)

    def _fillRecord(self, record, row):
        """Fill a record in an indexed catalog to be persisted.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        record.setCoord(self.computeCoord(row, self.config.ra_name, self.config.dec_name))

        self._setFlags(record, row)
        self._setProperMotion(record, row)
        self._setParallax(record, row)
        self._setExtra(record, row)


class IngestGaiaManager(IngestIndexManager):
    """Special-case ingest manager to deal with Gaia fluxes.
    """
    def _getFluxes(self, inputData):
        result = {}

        def gaiaFluxToFlux(flux, zeroPoint):
            """Equations 5.19 and 5.30 from the Gaia calibration document
            define the conversion from Gaia electron/second fluxes to AB
            magnitudes.
            https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            """
            result = ((zeroPoint - 2.5 * np.log10(flux))*u.ABmag).to_value(u.nJy)
            # set 0 instrumental fluxes to 0 (instead of NaN/inf from the math)
            result[flux == 0] = 0
            return result

        # Some fluxes are 0, so log10(flux) can give warnings. We handle the
        # zeros explicitly, so the warnings are irrelevant.
        with np.errstate(invalid='ignore', divide='ignore'):
            # The constants below come from table 5.3 in this document:
            # https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            result['phot_g_mean_flux'] = gaiaFluxToFlux(inputData['phot_g_mean_flux'], 25.7934)
            result['phot_bp_mean_flux'] = gaiaFluxToFlux(inputData['phot_bp_mean_flux'], 25.3806)
            result['phot_rp_mean_flux'] = gaiaFluxToFlux(inputData['phot_rp_mean_flux'], 25.1161)

            result['phot_g_mean_fluxErr'] = result['phot_g_mean_flux'] / inputData['phot_g_mean_flux_over_error']
            result['phot_bp_mean_fluxErr'] = result['phot_bp_mean_flux'] / inputData['phot_bp_mean_flux_over_error']
            result['phot_rp_mean_fluxErr'] = result['phot_rp_mean_flux'] / inputData['phot_rp_mean_flux_over_error']
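
        # e.g. a G-band flux of 1e5 e-/s corresponds to an AB magnitude of
        # 25.7934 - 2.5*log10(1e5) = 13.2934, i.e. roughly 1.75e7 nJy.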
        return result