Coverage for python/lsst/meas/algorithms/ingestIndexReferenceTask.py: 42% (139 statements)

# This file is part of meas_algorithms.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# TODO DM-31698: post-gen2 removal notes
# `DatasetConfig`, `ConvertReferenceCatalogBase`, and `ConvertReferenceCatalogConfig`
# should all be moved to `convertReferenceCatalog.py` once gen2 butler
# has been removed.

__all__ = ["DatasetConfig", "ConvertReferenceCatalogBase", "ConvertReferenceCatalogConfig"]

import abc
import os.path

import astropy.units

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.geom
import lsst.sphgeom
import lsst.afw.table as afwTable
from lsst.daf.base import PropertyList
from .indexerRegistry import IndexerRegistry
from .readTextCatalogTask import ReadTextCatalogTask
from .loadReferenceObjects import ReferenceObjectLoader
from . import convertRefcatManager

# The most recent Indexed Reference Catalog on-disk format version.
LATEST_FORMAT_VERSION = 1


def addRefCatMetadata(catalog):
    """Add metadata to a new (not yet populated) reference catalog.

    Parameters
    ----------
    catalog : `lsst.afw.table.SimpleCatalog`
        Catalog to which metadata should be attached. Will be modified
        in-place.
    """
    md = catalog.getMetadata()
    if md is None:
        md = PropertyList()
    md.set("REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
    catalog.setMetadata(md)
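
# A minimal usage sketch (illustrative, not part of this module's API): the
# helper above is meant to be called on a still-empty catalog so that the
# format version is persisted with every shard. ``someSchema`` is an assumed,
# pre-built `lsst.afw.table.Schema`.
#
#     catalog = afwTable.SimpleCatalog(someSchema)
#     addRefCatMetadata(catalog)
#     # the catalog metadata now carries REFCAT_FORMAT_VERSION == LATEST_FORMAT_VERSION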


class DatasetConfig(pexConfig.Config):
    """The description of the on-disk storage format for the persisted
    reference catalog.
    """
    format_version = pexConfig.Field(
        dtype=int,
        doc="Version number of the persisted on-disk storage format."
            "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
            "\nVersion 1 had nJy as flux units.",
        default=0  # This needs to always be 0, so that unversioned catalogs are interpreted as version 0.
    )
    ref_dataset_name = pexConfig.Field(
        dtype=str,
        # TODO DM-31817: remove this default value.
        default='cal_ref_cat',
        doc="Name of this reference catalog to be used in the butler registry.",
    )
    indexer = IndexerRegistry.makeField(
        default='HTM',
        doc='Name of indexer algorithm to use. Default is HTM.',
    )
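
# Illustrative note, not an API of this module: because ``format_version``
# defaults to 0, a reader can treat an unversioned catalog as version 0
# (fluxes in Jy) and convert to the current nJy convention with plain unit
# arithmetic. ``dataset_config``, ``flux_jy`` are assumed names.
#
#     import astropy.units as u
#     if dataset_config.format_version == 0:
#         flux_njy = flux_jy * (1 * u.Jy).to_value(u.nJy)   # multiply by 1e9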


class ConvertReferenceCatalogConfig(pexConfig.Config):
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    n_processes = pexConfig.Field(
        dtype=int,
        doc="Number of python processes to use when ingesting.",
        default=1
    )
    manager = pexConfig.ConfigurableField(
        target=convertRefcatManager.ConvertRefcatManager,
        doc="Multiprocessing manager to perform the actual conversion of values, file-by-file."
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc='Task to use to read the files. Default is to expect text files.'
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column (values in decimal degrees)",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column (values in decimal degrees)",
    )
    ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA error column",
        optional=True,
    )
    dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec error column",
        optional=True,
    )
    coord_err_unit = pexConfig.Field(
        dtype=str,
        doc="Unit of RA/Dec error fields (astropy.unit.Unit compatible)",
        optional=True
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information. At least one entry is required."
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value)."
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if satisfactory for photometric calibration (optional).'
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is resolved (optional).'
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column stating if the object is measured to be variable (optional).'
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc='Name of column to use as an identifier (optional).'
    )
    pm_ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA column",
        optional=True,
    )
    pm_dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec column",
        optional=True,
    )
    pm_ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA error column",
        optional=True,
    )
    pm_dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec error column",
        optional=True,
    )
    pm_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
        default=1.0,
    )
    parallax_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax column",
        optional=True,
    )
    parallax_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax error column",
        optional=True,
    )
    parallax_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
        default=1.0,
    )
    epoch_name = pexConfig.Field(
        dtype=str,
        doc="Name of epoch column",
        optional=True,
    )
    epoch_format = pexConfig.Field(
        dtype=str,
        doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
        optional=True,
    )
    epoch_scale = pexConfig.Field(
        dtype=str,
        doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
        optional=True,
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        default=[],
        doc='Extra columns to add to the reference catalog.'
    )

    def setDefaults(self):
        # Newly ingested reference catalogs always have the latest format_version.
        self.dataset_config.format_version = LATEST_FORMAT_VERSION
        # gen3 refcats are all depth=7
        self.dataset_config.indexer['HTM'].depth = 7

    def validate(self):
        pexConfig.Config.validate(self)

        def assertAllOrNone(*names):
            """Raise ValueError unless all the named fields are set or all
            are none (or blank).
            """
            setNames = [name for name in names if bool(getattr(self, name))]
            if len(setNames) in (len(names), 0):
                return
            prefix = "Both or neither" if len(names) == 2 else "All or none"
            raise ValueError("{} of {} must be set, but only {} are set".format(
                prefix, ", ".join(names), ", ".join(setNames)))

        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
        if self.mag_err_column_map and set(self.mag_column_list) != set(self.mag_err_column_map.keys()):
            raise ValueError(
                "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
                    sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))
        assertAllOrNone("ra_err_name", "dec_err_name", "coord_err_unit")
        if self.coord_err_unit is not None:
            result = astropy.units.Unit(self.coord_err_unit, parse_strict='silent')
            if isinstance(result, astropy.units.UnrecognizedUnit):
                msg = f"{self.coord_err_unit} is not a valid astropy unit string."
                raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg)

        assertAllOrNone("epoch_name", "epoch_format", "epoch_scale")
        assertAllOrNone("pm_ra_name", "pm_dec_name")
        assertAllOrNone("pm_ra_err_name", "pm_dec_err_name")
        assertAllOrNone("parallax_name", "parallax_err_name")
        if self.pm_ra_err_name and not self.pm_ra_name:
            raise ValueError('"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
        if (self.pm_ra_name or self.parallax_name) and not self.epoch_name:
            raise ValueError(
                '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')
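
# Hypothetical configuration sketch (the column names below are examples, not
# defaults of this task): ``validate`` enforces the "all or none" groups
# checked above, so e.g. setting ``ra_err_name`` without ``dec_err_name`` and
# ``coord_err_unit`` would raise a ValueError.
#
#     config = ConvertReferenceCatalogConfig()
#     config.ra_name = "ra"
#     config.dec_name = "dec"
#     config.mag_column_list = ["g", "r"]
#     config.mag_err_column_map = {"g": "g_err", "r": "r_err"}
#     config.ra_err_name = "ra_err"
#     config.dec_err_name = "dec_err"
#     config.coord_err_unit = "arcsecond"
#     config.validate()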


class ConvertReferenceCatalogBase(pipeBase.Task, abc.ABC):
    """Base class for producing and loading indexed reference catalogs,
    shared between gen2 and gen3.

    This implements an indexing scheme based on hierarchical triangular
    mesh (HTM). The term index really means breaking the catalog into
    localized chunks called shards. In this case each shard contains
    the entries from the catalog in a single HTM trixel.

    For producing catalogs this task makes the following assumptions
    about the input catalogs:

    - RA, Dec are in decimal degrees.
    - Epoch is available in a column, in a format supported by astropy.time.Time.
    - There are no off-diagonal covariance terms, such as covariance
      between RA and Dec, or between PM RA and PM Dec. Support for such
      covariance would have to be added to the config, including consideration
      of the units in the input catalog.
    """
    canMultiprocess = False
    ConfigClass = ConvertReferenceCatalogConfig

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def run(self, inputFiles):
        """Index a set of files comprising a reference catalog.

        Outputs are persisted in the butler repository.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read.
        """
        self._preRun()
        schema, key_map = self._saveMasterSchema(inputFiles[0])
        # create an HTM we can interrogate about pixel ids
        htm = lsst.sphgeom.HtmPixelization(self.indexer.htm.get_depth())
        filenames = self._getButlerFilenames(htm)
        worker = self.config.manager.target(filenames,
                                            self.config,
                                            self.file_reader,
                                            self.indexer,
                                            schema,
                                            key_map,
                                            htm.universe()[0],
                                            addRefCatMetadata,
                                            self.log)
        result = worker.run(inputFiles)

        self._persistConfig()
        self._postRun(result)
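
    # Sketch of the HTM bookkeeping used in ``run`` above (depth 7 is what
    # ``ConvertReferenceCatalogConfig.setDefaults`` sets); this is plain
    # ``lsst.sphgeom`` usage, shown only for orientation.
    #
    #     htm = lsst.sphgeom.HtmPixelization(7)
    #     begin, end = htm.universe()[0]
    #     # every output shard is keyed by a pixel id in [begin, end)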

    def _preRun(self):
        """Any setup that has to be performed at the start of ``run``, but that
        cannot be performed during ``__init__`` (e.g. making directories).
        """
        pass

    def _postRun(self, result):
        """Any tasks that have to happen at the end of ``run``.

        Parameters
        ----------
        result
            The result returned from ``worker.run()``.
        """
        pass

    def _getButlerFilenames(self, htm):
        """Get filenames from the butler for each output htm pixel.

        Parameters
        ----------
        htm : `lsst.sphgeom.HtmPixelization`
            The HTM pixelization scheme to be used to build filenames.

        Returns
        -------
        filenames : `dict` [`int`, `str`]
            Map of HTM pixel id to the filename that pixel will be written to.
        """
        filenames = {}
        start, end = htm.universe()[0]
        # path manipulation because butler.get() per pixel will take forever
        path = self._getOnePixelFilename(start)
        base = os.path.join(os.path.dirname(path), "%d" + os.path.splitext(path)[1])
        for pixelId in range(start, end):
            filenames[pixelId] = base % pixelId

        return filenames
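
    # Illustrative path arithmetic for the template built above (the path is
    # hypothetical): one example per-pixel filename yields a "%d" template
    # that is filled in for every pixel id, avoiding a per-pixel butler lookup.
    #
    #     path = "/repo/refcats/cal_ref_cat/131072.fits"
    #     base = os.path.join(os.path.dirname(path), "%d" + os.path.splitext(path)[1])
    #     base % 131073   # -> "/repo/refcats/cal_ref_cat/131073.fits"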

    def makeSchema(self, dtype):
        """Make the schema to use in constructing the persisted catalogs.

        Parameters
        ----------
        dtype : `numpy.dtype`
            Data type describing each entry in ``config.extra_col_names``
            for the catalogs being ingested.

        Returns
        -------
        schemaAndKeyMap : `tuple` of (`lsst.afw.table.Schema`, `dict`)
            A tuple containing two items:
            - The schema for the output source catalog.
            - A map of catalog keys to use in filling the record.
        """
        # make a schema with the standard fields
        schema = ReferenceObjectLoader.makeMinimalSchema(
            filterNameList=self.config.mag_column_list,
            addCentroid=False,
            addIsPhotometric=bool(self.config.is_photometric_name),
            addIsResolved=bool(self.config.is_resolved_name),
            addIsVariable=bool(self.config.is_variable_name),
            coordErrDim=2 if bool(self.config.ra_err_name) else 0,
            addProperMotion=2 if bool(self.config.pm_ra_name) else 0,
            properMotionErrDim=2 if bool(self.config.pm_ra_err_name) else 0,
            addParallax=bool(self.config.parallax_name),
        )
        keysToSkip = set(("id", "centroid_x", "centroid_y", "hasCentroid"))
        key_map = {fieldName: schema[fieldName].asKey() for fieldName in schema.getOrderedNames()
                   if fieldName not in keysToSkip}

        def addField(name):
            if dtype[name].kind == 'U':
                # dealing with a string-like thing. Need to get type and size.
                at_size = dtype[name].itemsize
                return schema.addField(name, type=str, size=at_size)
            else:
                at_type = dtype[name].type
                return schema.addField(name, at_type)

        for col in self.config.extra_col_names:
            key_map[col] = addField(col)
        return schema, key_map
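
    # Sketch of the dtype handling in ``addField`` above, with a hypothetical
    # input layout: unicode columns ('U' kind) become fixed-size string fields
    # sized by ``itemsize``; numeric columns use their numpy scalar type.
    #
    #     import numpy as np
    #     dtype = np.dtype([("survey_flag", "U8"), ("depth", np.float64)])
    #     dtype["survey_flag"].kind   # 'U' -> string field of size itemsize
    #     dtype["depth"].type         # numpy.float64 -> numeric field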

    def _saveMasterSchema(self, filename):
        """Generate and save the master catalog schema.

        Parameters
        ----------
        filename : `str`
            An input file to read to get the input dtype.
        """
        arr = self.file_reader.run(filename)
        schema, key_map = self.makeSchema(arr.dtype)

        catalog = afwTable.SimpleCatalog(schema)
        addRefCatMetadata(catalog)
        self._writeMasterSchema(catalog)
        return schema, key_map

    @abc.abstractmethod
    def _getOnePixelFilename(self, start):
        """Return one example filename to help construct the rest of the
        per-htm pixel filenames.

        Parameters
        ----------
        start : `int`
            The first HTM index in this HTM pixelization.

        Returns
        -------
        filename : `str`
            Path to a single file that would be written to the output location.
        """
        pass

    @abc.abstractmethod
    def _persistConfig(self):
        """Write the config that was used to generate the refcat.
        """
        pass

    @abc.abstractmethod
    def _writeMasterSchema(self, catalog):
        """Butler put the master catalog schema.

        Parameters
        ----------
        catalog : `lsst.afw.table.SimpleCatalog`
            An empty catalog with a fully-defined schema that matches the
            schema used in each of the HTM pixel files.
        """
        pass
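
# Hypothetical minimal subclass sketch: a concrete converter only needs to
# provide the three abstract hooks above. All names and paths here
# (``MyConverter``, ``self.outputDir``) are illustrative, not this package's
# API; ``saveToStream`` and ``writeFits`` are assumed to be the usual
# pex_config / afw.table persistence calls.
#
#     class MyConverter(ConvertReferenceCatalogBase):
#         def _getOnePixelFilename(self, start):
#             return os.path.join(self.outputDir, f"{start}.fits")
#
#         def _persistConfig(self):
#             with open(os.path.join(self.outputDir, "config.py"), "w") as f:
#                 self.config.saveToStream(f)
#
#         def _writeMasterSchema(self, catalog):
#             catalog.writeFits(os.path.join(self.outputDir, "master_schema.fits"))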