Coverage for python/lsst/meas/algorithms/ingestIndexManager.py: 52%

# This file is part of meas_algorithms.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

__all__ = ["IngestIndexManager", "IngestGaiaManager"]

import os.path
import itertools
import multiprocessing

import astropy.time
import astropy.units as u
import numpy as np

import lsst.geom
import lsst.sphgeom
import lsst.afw.table as afwTable
from lsst.afw.image import fluxErrFromABMagErr


# global shared counter to keep track of source ids
# (multiprocess sharing is most easily done with a global)
COUNTER = 0
# global shared counter to keep track of number of files processed.
FILE_PROGRESS = 0
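# NOTE: ``run`` rebinds both of these module globals to
# ``multiprocessing.Value`` instances, so that worker processes can update
# them under a shared lock.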


class IngestIndexManager:
    """
    Ingest a reference catalog from external files into a butler repository,
    using a multiprocessing Pool to speed up the work.

    Parameters
    ----------
    filenames : `dict` [`int`, `str`]
        Mapping from HTM pixel id to the output filename for that pixel.
    config : `lsst.meas.algorithms.IngestIndexedReferenceConfig`
        The Task configuration holding the field names.
    file_reader : `lsst.pipe.base.Task`
        The file reader to use to load the files.
    indexer : `lsst.meas.algorithms.HtmIndexer`
        The class used to compute the HTM pixel per coordinate.
    schema : `lsst.afw.table.Schema`
        The schema of the output catalog.
    key_map : `dict` [`str`, `lsst.afw.table.Key`]
        The mapping from output field names to keys in the Schema.
    htmRange : `tuple` [`int`]
        The start and end HTM pixel ids.
    addRefCatMetadata : callable
        A function called to add extra metadata to each output Catalog.
    log : `lsst.log.Log`
        The log to send messages to.
    """
    _flags = ['photometric', 'resolved', 'variable']

    def __init__(self, filenames, config, file_reader, indexer,
                 schema, key_map, htmRange, addRefCatMetadata, log):
        self.filenames = filenames
        self.config = config
        self.file_reader = file_reader
        self.indexer = indexer
        self.schema = schema
        self.key_map = key_map
        self.htmRange = htmRange
        self.addRefCatMetadata = addRefCatMetadata
        self.log = log
        if self.config.coord_err_unit is not None:
            # cache this to speed up coordinate conversions
            self.coord_err_unit = u.Unit(self.config.coord_err_unit)

    def run(self, inputFiles):
        """Index a set of input files from a reference catalog, and write the
        output to the appropriate filenames, in parallel.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read data from.
        """
        global COUNTER, FILE_PROGRESS
        self.nInputFiles = len(inputFiles)

        with multiprocessing.Manager() as manager:
            COUNTER = multiprocessing.Value('i', 0)
            FILE_PROGRESS = multiprocessing.Value('i', 0)
            fileLocks = manager.dict()
            self.log.info("Creating %s file locks.", self.htmRange[1] - self.htmRange[0])
            for i in range(self.htmRange[0], self.htmRange[1]):
                fileLocks[i] = manager.Lock()
            self.log.info("File locks created.")
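            # Each worker receives the full shared lock dict; starmap blocks
            # until every input file has been ingested.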
            with multiprocessing.Pool(self.config.n_processes) as pool:
                pool.starmap(self._ingestOneFile, zip(inputFiles, itertools.repeat(fileLocks)))

    def _ingestOneFile(self, filename, fileLocks):
        """Read and process one file, and write its records to the correct
        indexed files, while handling exceptions in a useful way so that they
        don't get swallowed by the multiprocess pool.

        Parameters
        ----------
        filename : `str`
            The file to process.
        fileLocks : `dict` [`int`, `multiprocessing.Lock`]
            A Lock for each HTM pixel; each pixel gets one file written, and
            we need to block when one process is accessing that file.
        """
        global FILE_PROGRESS
        inputData = self.file_reader.run(filename)
        fluxes = self._getFluxes(inputData)
        coordErr = self._getCoordErr(inputData)
        matchedPixels = self.indexer.indexPoints(inputData[self.config.ra_name],
                                                 inputData[self.config.dec_name])
        pixel_ids = set(matchedPixels)
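        # Handle each pixel in turn, holding that pixel's file lock so that no
        # other process writes the same shard concurrently.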
        for pixelId in pixel_ids:
            with fileLocks[pixelId]:
                self._doOnePixel(inputData, matchedPixels, pixelId, fluxes, coordErr)
        with FILE_PROGRESS.get_lock():
            oldPercent = 100 * FILE_PROGRESS.value / self.nInputFiles
            FILE_PROGRESS.value += 1
            percent = 100 * FILE_PROGRESS.value / self.nInputFiles
            # only log each "new percent"
            if np.floor(percent) - np.floor(oldPercent) >= 1:
                self.log.info("Completed %d / %d files: %d %% complete ",
                              FILE_PROGRESS.value,
                              self.nInputFiles,
                              percent)

    def _doOnePixel(self, inputData, matchedPixels, pixelId, fluxes, coordErr):
        """Process one HTM pixel, appending to an existing catalog or creating
        a new catalog, as needed.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The data from one input file.
        matchedPixels : `numpy.ndarray`
            The row-matched pixel indexes corresponding to ``inputData``.
        pixelId : `int`
            The pixel index we are currently processing.
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr, coord_decErr, and
            coord_ra_dec_Cov fields in the output catalog (in radians).
        """
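        # Rows of this input file that fall in the pixel currently being written.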
        idx = np.where(matchedPixels == pixelId)[0]
        catalog = self.getCatalog(pixelId, self.schema, len(idx))
        for outputRow, inputRow in zip(catalog[-len(idx):], inputData[idx]):
            self._fillRecord(outputRow, inputRow)

        global COUNTER
        with COUNTER.get_lock():
            self._setIds(inputData[idx], catalog)

        # set fluxes from the pre-computed array
        for name, array in fluxes.items():
            catalog[self.key_map[name]][-len(idx):] = array[idx]

        # set coordinate errors from the pre-computed array
        for name, array in coordErr.items():
            catalog[name][-len(idx):] = array[idx]

        catalog.writeFits(self.filenames[pixelId])

    def _setIds(self, inputData, catalog):
        """Fill the `id` field of catalog with a running index, filling the
        last values up to the length of ``inputData``.

        Fill with `self.config.id_name` if specified, otherwise use the
        global running counter value.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data that is being processed.
        catalog : `lsst.afw.table.SimpleCatalog`
            The output catalog to fill the ids.
        """
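        # The caller must hold COUNTER.get_lock() (see _doOnePixel), so that
        # id ranges assigned by different processes never overlap.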
        global COUNTER
        size = len(inputData)
        if self.config.id_name:
            catalog['id'][-size:] = inputData[self.config.id_name]
        else:
            idEnd = COUNTER.value + size
            catalog['id'][-size:] = np.arange(COUNTER.value, idEnd)
            COUNTER.value = idEnd

    def getCatalog(self, pixelId, schema, nNewElements):
        """Get a catalog from disk or create it if it doesn't exist.

        Parameters
        ----------
        pixelId : `int`
            Identifier for the catalog to retrieve.
        schema : `lsst.afw.table.Schema`
            Schema to use in catalog creation if it does not exist.
        nNewElements : `int`
            The number of new elements that will be added to the catalog,
            so space can be preallocated.

        Returns
        -------
        catalog : `lsst.afw.table.SimpleCatalog`
            The new or read-and-resized catalog specified by ``pixelId``.
        """
        # This is safe, because we lock on this file before getCatalog is called.
        if os.path.isfile(self.filenames[pixelId]):
            catalog = afwTable.SimpleCatalog.readFits(self.filenames[pixelId])
            catalog.resize(len(catalog) + nNewElements)
            return catalog.copy(deep=True)  # ensure contiguity, so that column-assignment works
        catalog = afwTable.SimpleCatalog(schema)
        catalog.resize(nNewElements)
        self.addRefCatMetadata(catalog)
        return catalog

    @staticmethod
    def computeCoord(row, ra_name, dec_name):
        """Create an ICRS coord. from a row of a catalog being ingested.

        Parameters
        ----------
        row : `numpy.ndarray`
            Row from catalog being ingested.
        ra_name : `str`
            Name of RA key in catalog being ingested.
        dec_name : `str`
            Name of Dec key in catalog being ingested.

        Returns
        -------
        coord : `lsst.geom.SpherePoint`
            ICRS coordinate.
        """
        return lsst.geom.SpherePoint(row[ra_name], row[dec_name], lsst.geom.degrees)

    def _getCoordErr(self, inputData):
        """Compute the ra/dec error fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute coordinate errors for.

        Returns
        -------
        coordErr : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the coord_raErr and coord_decErr
            fields in the output catalog (in radians).

        Notes
        -----
        This does not currently handle the ra/dec covariance field,
        ``coord_ra_dec_Cov``. That field may require extra work, as its units
        may be more complicated in external catalogs.
        """
        result = {}
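        # ``coord_err_unit`` is only set in ``__init__`` when
        # ``config.coord_err_unit`` is not None; otherwise no coordinate
        # errors are ingested.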
        if hasattr(self, "coord_err_unit"):
            result['coord_raErr'] = u.Quantity(inputData[self.config.ra_err_name],
                                               self.coord_err_unit).to_value(u.radian)
            result['coord_decErr'] = u.Quantity(inputData[self.config.dec_err_name],
                                                self.coord_err_unit).to_value(u.radian)
        return result

    def _setFlags(self, record, row):
        """Set flags in an output record.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : `numpy.ndarray`
            Row from catalog being ingested.
        """
        names = record.schema.getNames()
        for flag in self._flags:
            if flag in names:
                attr_name = 'is_{}_name'.format(flag)
                record.set(self.key_map[flag], bool(row[getattr(self.config, attr_name)]))

    def _getFluxes(self, inputData):
        """Compute the flux fields that will go into the output catalog.

        Parameters
        ----------
        inputData : `numpy.ndarray`
            The input data to compute fluxes for.

        Returns
        -------
        fluxes : `dict` [`str`, `numpy.ndarray`]
            The values that will go into the flux and fluxErr fields in the
            output catalog.
        """
        result = {}
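        # Interpret each magnitude column as AB magnitudes and convert to nJy.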
        for item in self.config.mag_column_list:
            result[item+'_flux'] = (inputData[item]*u.ABmag).to_value(u.nJy)
        if len(self.config.mag_err_column_map) > 0:
            for err_key in self.config.mag_err_column_map.keys():
                error_col_name = self.config.mag_err_column_map[err_key]
                # TODO: multiply by 1e9 here until we have a replacement (see DM-16903)
                # NOTE: copy the arrays because the numpy strides may not be useable by C++.
                fluxErr = fluxErrFromABMagErr(inputData[error_col_name].copy(),
                                              inputData[err_key].copy())*1e9
                result[err_key+'_fluxErr'] = fluxErr
        return result

    def _setProperMotion(self, record, row):
        """Set proper motion fields in a record of an indexed catalog.

        The proper motions are read from the specified columns,
        scaled appropriately, and installed in the appropriate
        columns of the output.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        if self.config.pm_ra_name is None:  # IngestIndexedReferenceConfig.validate ensures all or none
            return
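        # Radians per native proper-motion unit: pm_scale is expected to
        # convert the native units to milliarcsec/yr; dividing by 3600*1000
        # gives degrees, and np.radians converts degrees to radians.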
        radPerOriginal = np.radians(self.config.pm_scale)/(3600*1000)
        record.set(self.key_map["pm_ra"], row[self.config.pm_ra_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["pm_dec"], row[self.config.pm_dec_name]*radPerOriginal*lsst.geom.radians)
        record.set(self.key_map["epoch"], self._epochToMjdTai(row[self.config.epoch_name]))
        if self.config.pm_ra_err_name is not None:  # pm_dec_err_name also, by validation
            record.set(self.key_map["pm_raErr"], row[self.config.pm_ra_err_name]*radPerOriginal)
            record.set(self.key_map["pm_decErr"], row[self.config.pm_dec_err_name]*radPerOriginal)

    def _setParallax(self, record, row):
        """Set the parallax fields in a record of a refcat.
        """
        if self.config.parallax_name is None:
            return
        scale = self.config.parallax_scale*lsst.geom.milliarcseconds
        record.set(self.key_map['parallax'], row[self.config.parallax_name]*scale)
        record.set(self.key_map['parallaxErr'], row[self.config.parallax_err_name]*scale)

    def _epochToMjdTai(self, nativeEpoch):
        """Convert an epoch in native format to TAI MJD (a float).
        """
        return astropy.time.Time(nativeEpoch, format=self.config.epoch_format,
                                 scale=self.config.epoch_scale).tai.mjd

    def _setExtra(self, record, row):
        """Set extra data fields in a record of an indexed catalog.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        for extra_col in self.config.extra_col_names:
            value = row[extra_col]
            # If data read from a text file contains string-like entries,
            # numpy stores them as its own internal type, a numpy.str_
            # object. This seems to be a consequence of how numpy stores
            # string-like objects in fixed-column arrays. This checks
            # whether any of the values to be added to the catalog are numpy
            # string types, and if they are, casts them to a plain Python
            # string, which is what the C++-backed records expect.
            if isinstance(value, np.str_):
                value = str(value)
            record.set(self.key_map[extra_col], value)

    def _fillRecord(self, record, row):
        """Fill a record in an indexed catalog to be persisted.

        Parameters
        ----------
        record : `lsst.afw.table.SimpleRecord`
            Row from indexed catalog to modify.
        row : structured `numpy.array`
            Row from catalog being ingested.
        """
        record.setCoord(self.computeCoord(row, self.config.ra_name, self.config.dec_name))

        self._setFlags(record, row)
        self._setProperMotion(record, row)
        self._setParallax(record, row)
        self._setExtra(record, row)


class IngestGaiaManager(IngestIndexManager):
    """Special-case ingest manager to deal with Gaia fluxes.
    """
    def _getFluxes(self, inputData):
        result = {}

        def gaiaFluxToFlux(flux, zeroPoint):
            """Equations 5.19 and 5.30 from the Gaia calibration document
            define the conversion from Gaia electron/second fluxes to AB
            magnitudes.
            https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            """
            result = ((zeroPoint - 2.5 * np.log10(flux))*u.ABmag).to_value(u.nJy)
            # set 0 instrumental fluxes to 0 (instead of NaN/inf from the math)
            result[flux == 0] = 0
            return result

        # Some fluxes are 0, so log10(flux) can give warnings. We handle the
        # zeros explicitly, so the warnings are irrelevant.
        with np.errstate(invalid='ignore', divide='ignore'):
            # The constants below come from table 5.3 in this document:
            # https://gea.esac.esa.int/archive/documentation/GDR2/Data_processing/chap_cu5pho/sec_cu5pho_calibr/ssec_cu5pho_calibr_extern.html
            result['phot_g_mean_flux'] = gaiaFluxToFlux(inputData['phot_g_mean_flux'], 25.7934)
            result['phot_bp_mean_flux'] = gaiaFluxToFlux(inputData['phot_bp_mean_flux'], 25.3806)
            result['phot_rp_mean_flux'] = gaiaFluxToFlux(inputData['phot_rp_mean_flux'], 25.1161)
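
            # Gaia provides flux-over-error ratios, so the nJy flux error is
            # simply the converted flux divided by that ratio.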
            result['phot_g_mean_fluxErr'] = result['phot_g_mean_flux'] / inputData['phot_g_mean_flux_over_error']
            result['phot_bp_mean_fluxErr'] = result['phot_bp_mean_flux'] / inputData['phot_bp_mean_flux_over_error']
            result['phot_rp_mean_fluxErr'] = result['phot_rp_mean_flux'] / inputData['phot_rp_mean_flux_over_error']

        return result