"""Convert an external reference catalog into the hierarchical triangular
mesh (HTM) sharded LSST-style format, to be ingested into the butler.
"""
__all__ = ["ConvertReferenceCatalogTask", "ConvertReferenceCatalogConfig", "DatasetConfig"]
import argparse
import glob
import logging
import os
import pathlib

import astropy.table
import astropy.units

import lsst.afw.table
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.sphgeom
from lsst.daf.base import PropertyList

from .indexerRegistry import IndexerRegistry
from .readTextCatalogTask import ReadTextCatalogTask
from . import convertRefcatManager
from . import ReferenceObjectLoader

LATEST_FORMAT_VERSION = 1
52 """Add metadata to a new (not yet populated) reference catalog.
57 Catalog to which metadata should be attached. Will be modified
60 md = catalog.getMetadata()
63 md.set(
"REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
64 catalog.setMetadata(md)
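

# Illustrative sketch, not part of the original module: applying
# addRefCatMetadata to an empty catalog built from the standard minimal
# schema, then reading the version key back.
def _exampleAddRefCatMetadata():
    catalog = lsst.afw.table.SimpleCatalog(lsst.afw.table.SimpleTable.makeMinimalSchema())
    addRefCatMetadata(catalog)
    # The format version is now recorded in the catalog metadata.
    return catalog.getMetadata().getScalar("REFCAT_FORMAT_VERSION")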
68 """Description of the on-disk storage format for the converted reference
71 format_version = pexConfig.Field(
73 doc="Version number of the persisted on-disk storage format."
74 "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
75 "\nVersion 1 had nJy as flux units.",
78 ref_dataset_name = pexConfig.Field(
80 doc=
"Name of this reference catalog; this should match the name used during butler ingest.",
82 indexer = IndexerRegistry.makeField(
84 doc=
'Name of indexer algoritm to use. Default is HTM',
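

# Illustrative sketch, not part of the original module: filling in the
# dataset-level settings above. The catalog name and depth are placeholders.
def _exampleDatasetConfig():
    config = DatasetConfig()
    config.ref_dataset_name = "my_refcat"  # must match the butler dataset type name
    config.indexer.name = "HTM"
    config.indexer["HTM"].depth = 7  # shard the catalog at HTM level 7
    return config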


class ConvertReferenceCatalogConfig(pexConfig.Config):
    dataset_config = pexConfig.ConfigField(
        dtype=DatasetConfig,
        doc="Configuration for reading the ingested data",
    )
    n_processes = pexConfig.Field(
        dtype=int,
        doc="Number of python processes to use when ingesting.",
        default=1,
    )
    manager = pexConfig.ConfigurableField(
        target=convertRefcatManager.ConvertRefcatManager,
        doc="Multiprocessing manager to perform the actual conversion of values, file-by-file.",
    )
    file_reader = pexConfig.ConfigurableField(
        target=ReadTextCatalogTask,
        doc="Task to use to read the files. Default is to expect text files.",
    )
    ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA column (values in decimal degrees)",
    )
    dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec column (values in decimal degrees)",
    )
    ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of RA error column",
        optional=True,
    )
    dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of Dec error column",
        optional=True,
    )
    coord_err_unit = pexConfig.Field(
        dtype=str,
        doc="Unit of RA/Dec error fields (astropy.unit.Unit compatible)",
        optional=True,
    )
    mag_column_list = pexConfig.ListField(
        dtype=str,
        doc="The values in the reference catalog are assumed to be in AB magnitudes. "
            "List of column names to use for photometric information. At least one entry is required.",
    )
    mag_err_column_map = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="A map of magnitude column name (key) to magnitude error column (value).",
    )
    is_photometric_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc="Name of column stating if satisfactory for photometric calibration (optional).",
    )
    is_resolved_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc="Name of column stating if the object is resolved (optional).",
    )
    is_variable_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc="Name of column stating if the object is measured to be variable (optional).",
    )
    id_name = pexConfig.Field(
        dtype=str,
        optional=True,
        doc="Name of column to use as an identifier (optional).",
    )
    pm_ra_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA column",
        optional=True,
    )
    pm_dec_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec column",
        optional=True,
    )
    pm_ra_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion RA error column",
        optional=True,
    )
    pm_dec_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of proper motion Dec error column",
        optional=True,
    )
    pm_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
        default=1.0,
    )
    parallax_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax column",
        optional=True,
    )
    parallax_err_name = pexConfig.Field(
        dtype=str,
        doc="Name of parallax error column",
        optional=True,
    )
    parallax_scale = pexConfig.Field(
        dtype=float,
        doc="Scale factor by which to multiply parallax values to obtain units of milliarcsec",
        default=1.0,
    )
    epoch_name = pexConfig.Field(
        dtype=str,
        doc="Name of epoch column",
        optional=True,
    )
    epoch_format = pexConfig.Field(
        dtype=str,
        doc="Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
        optional=True,
    )
    epoch_scale = pexConfig.Field(
        dtype=str,
        doc="Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
        optional=True,
    )
    extra_col_names = pexConfig.ListField(
        dtype=str,
        doc="Extra columns to add to the reference catalog.",
        default=[],
    )

    def validate(self):
        super().validate()

        def assertAllOrNone(*names):
            """Raise ValueError unless all the named fields are set or are
            all none (or blank).
            """
            setNames = [name for name in names if bool(getattr(self, name))]
            if len(setNames) in (len(names), 0):
                return
            prefix = "Both or neither" if len(names) == 2 else "All or none"
            raise ValueError("{} of {} must be set, but only {} are set".format(
                prefix, ", ".join(names), ", ".join(setNames)))

        if not (self.ra_name and self.dec_name and self.mag_column_list):
            raise ValueError(
                "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
        if len(self.mag_err_column_map) > 0 and set(self.mag_column_list) != set(self.mag_err_column_map.keys()):
            raise ValueError(
                "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
                    sorted(self.mag_err_column_map.keys()), sorted(self.mag_column_list)))
        assertAllOrNone("ra_err_name", "dec_err_name", "coord_err_unit")
        if self.coord_err_unit is not None:
            result = astropy.units.Unit(self.coord_err_unit, parse_strict='silent')
            if isinstance(result, astropy.units.UnrecognizedUnit):
                msg = f"{self.coord_err_unit} is not a valid astropy unit string."
                raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg)

        assertAllOrNone("epoch_name", "epoch_format", "epoch_scale")
        assertAllOrNone("pm_ra_name", "pm_dec_name")
        assertAllOrNone("pm_ra_err_name", "pm_dec_err_name")
        assertAllOrNone("parallax_name", "parallax_err_name")
        if self.pm_ra_err_name and not self.pm_ra_name:
            raise ValueError(
                '"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
        if (self.pm_ra_name or self.parallax_name) and not self.epoch_name:
            raise ValueError(
                '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')
267 """Class for producing HTM-indexed reference catalogs from external
270 This implements an indexing scheme based on hierarchical triangular
271 mesh (HTM). The term index really means breaking the catalog into
272 localized chunks called shards. In this case each shard contains
273 the entries from the catalog
in a single HTM trixel
275 For producing catalogs this task makes the following assumptions
276 about the input catalogs:
278 - RA, Dec are
in decimal degrees.
279 - Epoch
is available
in a column,
in a format supported by astropy.time.Time.
280 - There are no off-diagonal covariance terms, such
as covariance
281 between RA
and Dec,
or between PM RA
and PM Dec. Support
for such
282 covariance would have to be added to to the config, including consideration
283 of the units
in the input catalog.
288 The path to write the output files to,
in a subdirectory defined by
289 ``DatasetConfig.ref_dataset_name``.
291 canMultiprocess = False
292 ConfigClass = ConvertReferenceCatalogConfig
293 _DefaultName =
'ConvertReferenceCatalogTask'

    def __init__(self, *, output_dir=None, **kwargs):
        super().__init__(**kwargs)
        if output_dir is None:
            raise RuntimeError("Must specify output_dir.")
        self.base_dir = output_dir
        self.output_dir = os.path.join(output_dir, self.config.dataset_config.ref_dataset_name)
        self.ingest_table_file = os.path.join(self.base_dir, "filename_to_htm.ecsv")
        self.indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
            self.config.dataset_config.indexer.active)
        self.makeSubtask('file_reader')

    def run(self, inputFiles):
        """Index a set of files comprising a reference catalog.

        Outputs are persisted in the butler repository.

        Parameters
        ----------
        inputFiles : `list`
            A list of file paths to read.
        """
        # Fail if the output directory already exists, so that an existing
        # conversion is never clobbered.
        pathlib.Path(self.output_dir).mkdir(exist_ok=False)

        schema, key_map = self._writeMasterSchema(inputFiles[0])
        # An HTM pixelization we can interrogate for shard (trixel) ids.
        htm = lsst.sphgeom.HtmPixelization(self.config.dataset_config.indexer.active.depth)
        filenames = self._getOutputFilenames(htm)
        worker = self.config.manager.target(filenames,
                                            self.config,
                                            self.file_reader,
                                            self.indexer,
                                            schema,
                                            key_map,
                                            htm.universe()[0],
                                            addRefCatMetadata,
                                            self.log)
        result = worker.run(inputFiles)

        self._writeConfig()
        self._writeIngestHelperFile(result)

    def _writeIngestHelperFile(self, result):
        """Write the astropy table containing the htm->filename relationship,
        used for the ``butler ingest-files`` command after this task completes.
        """
        dimension = f"htm{self.config.dataset_config.indexer.active.depth}"
        table = astropy.table.Table(names=("filename", dimension), dtype=('str', 'int'))
        for key in result:
            table.add_row((result[key], key))
        table.write(self.ingest_table_file)

    def _writeConfig(self):
        """Write the config that was used to generate the refcat."""
        filename = os.path.join(self.output_dir, "config.py")
        with open(filename, 'w') as file:
            self.config.dataset_config.saveToStream(file)

    def _getOutputFilenames(self, htm):
        """Get filenames from the butler for each output htm pixel.

        Parameters
        ----------
        htm : `lsst.sphgeom.HtmPixelization`
            The HTM pixelization scheme to be used to build filenames.

        Returns
        -------
        filenames : `dict` [`int`, `str`]
            Map of HTM pixel id to the filename that shard will be written to.
        """
        filenames = {}
        start, end = htm.universe()[0]
        path = os.path.join(self.output_dir, f"{self.indexer.htm}.fits")
        base = os.path.join(os.path.dirname(path), "%d" + os.path.splitext(path)[1])
        for pixelId in range(start, end):
            filenames[pixelId] = base % pixelId
        return filenames
377 """Make the schema to use in constructing the persisted catalogs.
381 dtype : `numpy.dtype`
382 Data type describing each entry in ``config.extra_col_names``
383 for the catalogs being ingested.
388 A tuple containing two items:
389 - The schema
for the output source catalog.
390 - A map of catalog keys to use
in filling the record
393 schema = ReferenceObjectLoader.makeMinimalSchema(
394 filterNameList=self.config.mag_column_list,
396 addIsPhotometric=bool(self.config.is_photometric_name),
397 addIsResolved=bool(self.config.is_resolved_name),
398 addIsVariable=bool(self.config.is_variable_name),
399 coordErrDim=2
if bool(self.config.ra_err_name)
else 0,
400 addProperMotion=2
if bool(self.config.pm_ra_name)
else 0,
401 properMotionErrDim=2
if bool(self.config.pm_ra_err_name)
else 0,
402 addParallax=bool(self.config.parallax_name),
404 keysToSkip = set((
"id",
"centroid_x",
"centroid_y",
"hasCentroid"))
405 key_map = {fieldName: schema[fieldName].asKey()
for fieldName
in schema.getOrderedNames()
406 if fieldName
not in keysToSkip}
409 if dtype[name].kind ==
'U':
411 at_size = dtype[name].itemsize
412 return schema.addField(name, type=str, size=at_size)
414 at_type = dtype[name].type
415 return schema.addField(name, at_type)
417 for col
in self.config.extra_col_names:
418 key_map[col] = addField(col)
419 return schema, key_map

    def _writeMasterSchema(self, inputfile):
        """Generate and save the master catalog schema.

        Parameters
        ----------
        inputfile : `str`
            An input file to read to get the input dtype.
        """
        arr = self.file_reader.run(inputfile)
        schema, key_map = self.makeSchema(arr.dtype)

        catalog = lsst.afw.table.SimpleCatalog(schema)
        addRefCatMetadata(catalog)
        outputfile = os.path.join(self.output_dir, "master_schema.fits")
        catalog.writeFits(outputfile)
        return schema, key_map

    def _reduce_kwargs(self):
        # Add output_dir to the kwargs used for pickling, so that the task
        # can be reconstructed in subprocesses.
        kwargs = super()._reduce_kwargs()
        kwargs['output_dir'] = self.base_dir
        return kwargs
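

# Illustrative sketch, not part of the original module: how a sky position
# maps to the HTM trixel (shard) id that _getOutputFilenames enumerates.
# Depth 7 and the coordinates are arbitrary examples.
def _exampleHtmShard():
    htm = lsst.sphgeom.HtmPixelization(7)
    position = lsst.sphgeom.UnitVector3d(lsst.sphgeom.LonLat.fromDegrees(30.0, -10.0))
    pixelId = htm.index(position)
    begin, end = htm.universe()[0]  # the valid trixel id range at this depth
    assert begin <= pixelId < end
    return pixelId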
446 """Construct an argument parser for the ``convertReferenceCatalog`` script.
450 argparser : `argparse.ArgumentParser`
451 The argument parser that defines the ``convertReferenceCatalog``
452 command-line interface.
454 parser = argparse.ArgumentParser(
456 formatter_class=argparse.RawDescriptionHelpFormatter,
457 epilog='More information is available at https://pipelines.lsst.io.'
459 parser.add_argument(
"outputDir",
460 help=
"Path to write the output shard files, configs, and `ingest-files` table to.")
461 parser.add_argument(
"configFile",
462 help=
"File containing the ConvertReferenceCatalogConfig fields.")
465 parser.add_argument(
"fileglob", nargs=
"+",
466 help=
"Quoted glob for the files to be read in and converted."
467 " Example (note required quotes to prevent shell expansion):"
468 ' "gaia_source/csv/GaiaSource*"')
473 """Run `ConvertReferenceCatalogTask` on the input arguments.
478 Path to write the output files to.
480 File specifying the ``ConvertReferenceCatalogConfig`` fields.
482 Quoted glob for the files to be read
in and converted.
485 logging.basicConfig(level=logging.INFO, format=
"{name} {levelname}: {message}", style=
"{")
487 config = ConvertReferenceCatalogTask.ConfigClass()
488 config.load(configFile)
490 files = glob.glob(fileglob)
492 with open(os.path.join(outputDir,
"convertReferenceCatalogConfig.py"),
"w")
as outfile:
493 converter.config.saveToStream(outfile)
494 msg = (
"Completed refcat conversion.\n\n"
495 "Ingest the resulting files with the following commands, substituting the path\n"
496 "to your butler repo for `REPO`, and the ticket number you are tracking this\n"
497 "ingest on for `DM-NNNNN`:\n"
498 f
"\n butler register-dataset-type REPO {config.dataset_config.ref_dataset_name} "
500 "\n butler ingest-files -t direct REPO gaia_dr2 refcats/DM-NNNNN "
501 f
"{converter.ingest_table_file}"
502 "\n butler collection-chain REPO --mode extend refcats refcats/DM-NNNNN")


def main():
    args = build_argparser().parse_args()
    if len(args.fileglob) > 1:
        raise RuntimeError("Final argument must be a quoted file glob, not a shell-expanded list of files.")
    run_convert(args.outputDir, args.configFile, args.fileglob[0])