Coverage for python/lsst/meas/algorithms/ingestIndexReferenceTask.py: 67%
139 statements
coverage.py v6.4.4, created at 2022-08-18 19:27 +0000
1# This file is part of meas_algorithms.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
23# TODO DM-31698: post-gen2 removal notes
24# `DatasetConfig`, `ConvertReferenceCatalogBase`, and `ConvertReferenceCatalogConfig`
25# should all be moved to `convertReferenceCatalog.py` once gen2 butler
26# has been removed.
28__all__ = ["DatasetConfig", "ConvertReferenceCatalogBase", "ConvertReferenceCatalogConfig"]
30import abc
31import os.path
33import astropy.units
35import lsst.pex.config as pexConfig
36import lsst.pipe.base as pipeBase
37import lsst.geom
38import lsst.sphgeom
39import lsst.afw.table as afwTable
40from lsst.daf.base import PropertyList
41from .indexerRegistry import IndexerRegistry
42from .readTextCatalogTask import ReadTextCatalogTask
43from .loadReferenceObjects import ReferenceObjectLoader
44from . import convertRefcatManager
46# The most recent Indexed Reference Catalog on-disk format version.
47LATEST_FORMAT_VERSION = 1
50def addRefCatMetadata(catalog):
51 """Add metadata to a new (not yet populated) reference catalog.
53 Parameters
54 ----------
55 catalog : `lsst.afw.table.SimpleCatalog`
56 Catalog to which metadata should be attached. Will be modified
57 in-place.
58 """
59 md = catalog.getMetadata()
60 if md is None:  # coverage: 60 ↛ 62 (line 60 didn't jump to line 62, because the condition on line 60 was never false)
61 md = PropertyList()
62 md.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
63 catalog.setMetadata(md)
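# Illustrative sketch, not part of the original module: how addRefCatMetadata
# is typically applied to a freshly created, empty catalog. The function name
# is hypothetical, and the getScalar call is assumed from the usual
# lsst.daf.base.PropertyList API.
def _exampleAddRefCatMetadata():
    schema = afwTable.SimpleTable.makeMinimalSchema()
    catalog = afwTable.SimpleCatalog(schema)
    addRefCatMetadata(catalog)
    # The attached metadata records the on-disk format version.
    return catalog.getMetadata().getScalar("REFCAT_FORMAT_VERSION")  # == LATEST_FORMAT_VERSION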
66class DatasetConfig(pexConfig.Config):
67 """The description of the on-disk storage format for the persisted
68 reference catalog.
69 """
70 format_version = pexConfig.Field(
71 dtype=int,
72 doc="Version number of the persisted on-disk storage format."
73 "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
74 "\nVersion 1 had nJy as flux units.",
75 default=0 # This needs to always be 0, so that unversioned catalogs are interpreted as version 0.
76 )
77 ref_dataset_name = pexConfig.Field(
78 dtype=str,
79 # TODO DM-31817: remove this default value.
80 default='cal_ref_cat',
81 doc="Name of this reference catalog to be used in the butler registry.",
82 )
83 indexer = IndexerRegistry.makeField(
84 default='HTM',
85 doc='Name of indexer algorithm to use. Default is HTM',
86 )
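# Illustrative sketch, not part of the original module: typical DatasetConfig
# values as they might be set in a conversion config override. The function
# name and the dataset name are hypothetical placeholders.
def _exampleDatasetConfig():
    config = DatasetConfig()
    config.ref_dataset_name = "my_ref_cat"         # hypothetical butler dataset name
    config.format_version = LATEST_FORMAT_VERSION  # newly written catalogs use the latest version
    config.indexer["HTM"].depth = 7                # matches the depth forced in setDefaults below
    return config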
89class ConvertReferenceCatalogConfig(pexConfig.Config):
90 dataset_config = pexConfig.ConfigField(
91 dtype=DatasetConfig,
92 doc="Configuration for reading the ingested data",
93 )
94 n_processes = pexConfig.Field(
95 dtype=int,
96 doc=("Number of python processes to use when ingesting."),
97 default=1
98 )
99 manager = pexConfig.ConfigurableField(
100 target=convertRefcatManager.ConvertRefcatManager,
101 doc="Multiprocessing manager to perform the actual conversion of values, file-by-file."
102 )
103 file_reader = pexConfig.ConfigurableField(
104 target=ReadTextCatalogTask,
105 doc='Task to use to read the files. Default is to expect text files.'
106 )
107 ra_name = pexConfig.Field(
108 dtype=str,
109 doc="Name of RA column (values in decimal degrees)",
110 )
111 dec_name = pexConfig.Field(
112 dtype=str,
113 doc="Name of Dec column (values in decimal degrees)",
114 )
115 ra_err_name = pexConfig.Field(
116 dtype=str,
117 doc="Name of RA error column",
118 optional=True,
119 )
120 dec_err_name = pexConfig.Field(
121 dtype=str,
122 doc="Name of Dec error column",
123 optional=True,
124 )
125 coord_err_unit = pexConfig.Field(
126 dtype=str,
127 doc="Unit of RA/Dec error fields (astropy.unit.Unit compatible)",
128 optional=True
129 )
130 mag_column_list = pexConfig.ListField(
131 dtype=str,
132 doc="The values in the reference catalog are assumed to be in AB magnitudes. "
133 "List of column names to use for photometric information. At least one entry is required."
134 )
135 mag_err_column_map = pexConfig.DictField(
136 keytype=str,
137 itemtype=str,
138 default={},
139 doc="A map of magnitude column name (key) to magnitude error column (value)."
140 )
141 is_photometric_name = pexConfig.Field(
142 dtype=str,
143 optional=True,
144 doc='Name of column stating if satisfactory for photometric calibration (optional).'
145 )
146 is_resolved_name = pexConfig.Field(
147 dtype=str,
148 optional=True,
149 doc='Name of column stating if the object is resolved (optional).'
150 )
151 is_variable_name = pexConfig.Field(
152 dtype=str,
153 optional=True,
154 doc='Name of column stating if the object is measured to be variable (optional).'
155 )
156 id_name = pexConfig.Field(
157 dtype=str,
158 optional=True,
159 doc='Name of column to use as an identifier (optional).'
160 )
161 pm_ra_name = pexConfig.Field(
162 dtype=str,
163 doc="Name of proper motion RA column",
164 optional=True,
165 )
166 pm_dec_name = pexConfig.Field(
167 dtype=str,
168 doc="Name of proper motion Dec column",
169 optional=True,
170 )
171 pm_ra_err_name = pexConfig.Field(
172 dtype=str,
173 doc="Name of proper motion RA error column",
174 optional=True,
175 )
176 pm_dec_err_name = pexConfig.Field(
177 dtype=str,
178 doc="Name of proper motion Dec error column",
179 optional=True,
180 )
181 pm_scale = pexConfig.Field(
182 dtype=float,
183 doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
184 default=1.0,
185 )
186 parallax_name = pexConfig.Field(
187 dtype=str,
188 doc="Name of parallax column",
189 optional=True,
190 )
191 parallax_err_name = pexConfig.Field(
192 dtype=str,
193 doc="Name of parallax error column",
194 optional=True,
195 )
196 parallax_scale = pexConfig.Field(
197 dtype=float,
198 doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
199 default=1.0,
200 )
201 epoch_name = pexConfig.Field(
202 dtype=str,
203 doc="Name of epoch column",
204 optional=True,
205 )
206 epoch_format = pexConfig.Field(
207 dtype=str,
208 doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
209 optional=True,
210 )
211 epoch_scale = pexConfig.Field(
212 dtype=str,
213 doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
214 optional=True,
215 )
216 extra_col_names = pexConfig.ListField(
217 dtype=str,
218 default=[],
219 doc='Extra columns to add to the reference catalog.'
220 )
222 def setDefaults(self):
223 # Newly ingested reference catalogs always have the latest format_version.
224 self.dataset_config.format_version = LATEST_FORMAT_VERSION
225 # gen3 refcats are all depth=7
226 self.dataset_config.indexer['HTM'].depth = 7
228 def validate(self):
229 pexConfig.Config.validate(self)
231 def assertAllOrNone(*names):
232 """Raise ValueError unless all the named fields are set or are
233 all None (or blank).
234 """
235 setNames = [name for name in names if bool(getattr(self, name))]
236 if len(setNames) in (len(names), 0):
237 return
238 prefix = "Both or neither" if len(names) == 2 else "All or none"
239 raise ValueError("{} of {} must be set, but only {} are set".format(
240 prefix, ", ".join(names), ", ".join(setNames)))
242 if not (self.ra_name and self.dec_name and self.mag_column_list):
243 raise ValueError(
244 "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
245 if self.mag_err_column_map and set(self.mag_column_list) != set(self.mag_err_column_map.keys()):
246 raise ValueError(
247 "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
248 sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))
249 assertAllOrNone("ra_err_name", "dec_err_name", "coord_err_unit")
250 if self.coord_err_unit is not None:
251 result = astropy.units.Unit(self.coord_err_unit, parse_strict='silent')
252 if isinstance(result, astropy.units.UnrecognizedUnit):
253 msg = f"{self.coord_err_unit} is not a valid astropy unit string."
254 raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg)
256 assertAllOrNone("epoch_name", "epoch_format", "epoch_scale")
257 assertAllOrNone("pm_ra_name", "pm_dec_name")
258 assertAllOrNone("pm_ra_err_name", "pm_dec_err_name")
259 assertAllOrNone("parallax_name", "parallax_err_name")
260 if self.pm_ra_err_name and not self.pm_ra_name:
261 raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
262 if (self.pm_ra_name or self.parallax_name) and not self.epoch_name:
263 raise ValueError(
264 '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')
267class ConvertReferenceCatalogBase(pipeBase.Task, abc.ABC):
268 """Base class for producing and loading indexed reference catalogs,
269 shared between gen2 and gen3.
271 This implements an indexing scheme based on hierarchical triangular
272 mesh (HTM). The term index really means breaking the catalog into
273 localized chunks called shards. In this case each shard contains
274 the entries from the catalog in a single HTM trixel.
276 For producing catalogs this task makes the following assumptions
277 about the input catalogs:
278 - RA, Dec are in decimal degrees.
279 - Epoch is available in a column, in a format supported by astropy.time.Time.
280 - There are no off-diagonal covariance terms, such as covariance
281 between RA and Dec, or between PM RA and PM Dec. Support for such
282 covariance would have to be added to to the config, including consideration
283 of the units in the input catalog.
284 """
285 canMultiprocess = False
286 ConfigClass = ConvertReferenceCatalogConfig
288 def __init__(self, *args, **kwargs):
289 super().__init__(*args, **kwargs)
290 self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
291 self.config.dataset_config.indexer.active)
292 self.makeSubtask('file_reader')
294 def run(self, inputFiles):
295 """Index a set of files comprising a reference catalog.
297 Outputs are persisted in the butler repository.
299 Parameters
300 ----------
301 inputFiles : `list`
302 A list of file paths to read.
303 """
304 self._preRun()
305 schema, key_map = self._saveMasterSchema(inputFiles[0])
306 # create an HTM we can interrogate about pixel ids
307 htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth())
308 filenames = self._getButlerFilenames(htm)
309 worker = self.config.manager.target(filenames,
310 self.config,
311 self.file_reader,
312 self.indexer,
313 schema,
314 key_map,
315 htm.universe()[0],
316 addRefCatMetadata,
317 self.log)
318 result = worker.run(inputFiles)
320 self._persistConfig()
321 self._postRun(result)
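    # Illustrative note, not part of the original module: the pixel-id range
    # used in run() comes from the pixelization itself. A minimal sketch,
    # assuming the standard lsst.sphgeom API:
    #
    #     htm = lsst.sphgeom.HtmPixelization(7)
    #     begin, end = htm.universe()[0]
    #     # Every depth-7 trixel id lies in [begin, end); the manager writes
    #     # one shard file per id in that range (for HTM this is
    #     # 8*4**depth .. 16*4**depth - 1).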
323 def _preRun(self):
324 """Any setup that has to be performed at the start of ``run``, but that
325 cannot be performed during ``__init__`` (e.g. making directories).
326 """
327 pass
329 def _postRun(self, result):
330 """Any tasks that have to happen at the end of ``run``.
332 Parameters
333 ----------
334 result
335 The result returned from ``worker.run()``.
336 """
337 pass
339 def _getButlerFilenames(self, htm):
340 """Get filenames from the butler for each output htm pixel.
342 Parameters
343 ----------
344 htm : `lsst.sphgeom.HtmPixelization`
345 The HTM pixelization scheme to be used to build filenames.
347 Returns
348 -------
349 filenames : `dict` [`int`, `str`]
350 Map of HTM pixel id to the filename that pixel will be written to.
351 """
352 filenames = {}
353 start, end = htm.universe()[0]
354 # path manipulation because butler.get() per pixel will take forever
355 path = self._getOnePixelFilename(start)
356 base = os.path.join(os.path.dirname(path), "%d"+os.path.splitext(path)[1])
357 for pixelId in range(start, end):
358 filenames[pixelId] = base % pixelId
360 return filenames
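    # Illustrative note, not part of the original module: the "%d" substitution
    # above turns one example path into a template for every pixel. With a
    # hypothetical path such as
    #
    #     _getOnePixelFilename(start) -> "/repo/refcats/cal_ref_cat/131072.fits"
    #
    # the template becomes "/repo/refcats/cal_ref_cat/%d.fits", and
    # filenames[pixelId] is that template with each pixel id substituted in.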
362 def makeSchema(self, dtype):
363 """Make the schema to use in constructing the persisted catalogs.
365 Parameters
366 ----------
367 dtype : `numpy.dtype`
368 Data type describing each entry in ``config.extra_col_names``
369 for the catalogs being ingested.
371 Returns
372 -------
373 schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
374 A tuple containing two items:
375 - The schema for the output source catalog.
376 - A map of catalog keys to use in filling the record.
377 """
378 # make a schema with the standard fields
379 schema = ReferenceObjectLoader.makeMinimalSchema(
380 filterNameList=self.config.mag_column_list,
381 addCentroid=False,
382 addIsPhotometric=bool(self.config.is_photometric_name),
383 addIsResolved=bool(self.config.is_resolved_name),
384 addIsVariable=bool(self.config.is_variable_name),
385 coordErrDim=2 if bool(self.config.ra_err_name) else 0,
386 addProperMotion=2 if bool(self.config.pm_ra_name) else 0,
387 properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0,
388 addParallax=bool(self.config.parallax_name),
389 )
390 keysToSkip = set(("id", "centroid_x", "centroid_y", "hasCentroid"))
391 key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames()
392 if fieldName not in keysToSkip}
394 def addField(name):
395 if dtype[name].kind == 'U':
396 # dealing with a string-like thing. Need to get type and size.
397 at_size = dtype[name].itemsize
398 return schema.addField(name, type=str, size=at_size)
399 else:
400 at_type = dtype[name].type
401 return schema.addField(name, at_type)
403 for col in self.config.extra_col_names:  # coverage: 403 ↛ 404 (line 403 didn't jump to line 404, because the loop on line 403 never started)
404 key_map[col] = addField(col)
405 return schema, key_map
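    # Illustrative note, not part of the original module: addField above keys
    # off numpy's per-field descriptors. For example, with
    # dtype = numpy.dtype([("name", "U10"), ("flux", "f8")]):
    #
    #     dtype["name"].kind      # 'U'  -> added as a sized str field
    #     dtype["name"].itemsize  # 40   (numpy unicode uses 4 bytes per character)
    #     dtype["flux"].type      # numpy.float64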
407 def _saveMasterSchema(self, filename):
408 """Generate and save the master catalog schema.
410 Parameters
411 ----------
412 filename : `str`
413 An input file to read to get the input dtype.
414 """
415 arr = self.file_reader.run(filename)
416 schema, key_map = self.makeSchema(arr.dtype)
418 catalog = afwTable.SimpleCatalog(schema)
419 addRefCatMetadata(catalog)
420 self._writeMasterSchema(catalog)
421 return schema, key_map
423 @abc.abstractmethod
424 def _getOnePixelFilename(self, start):
425 """Return one example filename to help construct the rest of the
426 per-HTM-pixel filenames.
428 Parameters
429 ----------
430 start : `int`
431 The first HTM index in this HTM pixelization.
433 Returns
434 -------
435 filename : `str`
436 Path to a single file that would be written to the output location.
437 """
438 pass
440 @abc.abstractmethod
441 def _persistConfig(self):
442 """Write the config that was used to generate the refcat.
443 """
444 pass
446 @abc.abstractmethod
447 def _writeMasterSchema(self, catalog):
448 """Butler put the master catalog schema.
450 Parameters
451 ----------
452 catalog : `lsst.afw.table.SimpleCatalog`
453 An empty catalog with a fully-defined schema that matches the
454 schema used in each of the HTM pixel files.
455 """
456 pass
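# Illustrative sketch, not part of the original module: the minimal surface a
# concrete subclass has to provide. The class name, output directory, and file
# layout below are hypothetical placeholders; per the TODO at the top of this
# file, the gen3 converter itself lives in convertReferenceCatalog.py.
class _ExampleConverter(ConvertReferenceCatalogBase):
    _outputDir = "/some/output/dir"  # hypothetical output location

    def _getOnePixelFilename(self, start):
        return os.path.join(self._outputDir, f"{start}.fits")

    def _persistConfig(self):
        # Config.save writes a pex_config override file.
        self.config.dataset_config.save(os.path.join(self._outputDir, "config.py"))

    def _writeMasterSchema(self, catalog):
        catalog.writeFits(os.path.join(self._outputDir, "master_schema.fits"))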