23Convert an external reference catalog into the hierarchical triangular mesh
24(HTM) sharded LSST-style format, to be ingested into the butler.
# Public API of this module: the conversion task, its config, and the
# on-disk dataset description config.
__all__ = [
    "ConvertReferenceCatalogTask",
    "ConvertReferenceCatalogConfig",
    "DatasetConfig",
]
44from .indexerRegistry
import IndexerRegistry
45from .readTextCatalogTask
import ReadTextCatalogTask
46from .
import convertRefcatManager
50LATEST_FORMAT_VERSION = 2
54 """Add metadata to a new (not yet populated) reference catalog.
59 Catalog to which metadata should be attached. Will be modified
62 md = catalog.getMetadata()
65 md.set(
"REFCAT_FORMAT_VERSION", LATEST_FORMAT_VERSION)
66 catalog.setMetadata(md)
70 addIsPhotometric=False, addIsResolved=False,
71 addIsVariable=False, fullPositionInformation=False):
72 """Make a standard schema for reference object catalogs.
76 filterNameList : `list` of `str`
77 List of filter names. Used to create <filterName>_flux fields.
79 If True then add fields
"centroid" and "hasCentroid".
80 addIsPhotometric : `bool`
81 If
True then add field
"photometric".
82 addIsResolved : `bool`
83 If
True then add field
"resolved".
84 addIsVariable : `bool`
85 If
True then add field
"variable".
86 fullPositionInformation : `bool`
87 If
True then add epoch, proper motion,
and parallax, along
with the
88 full five-dimensional covariance between ra
and dec coordinates,
89 proper motion
in ra
and dec,
and parallax.
94 Schema
for reference catalog, an
99 lsst.afw.table.Point2DKey.addFields(
102 "centroid on an exposure, if relevant",
108 doc=
"is position known?",
110 for filterName
in filterNameList:
112 field=
"%s_flux" % (filterName,),
114 doc=
"flux in filter %s" % (filterName,),
117 for filterName
in filterNameList:
119 field=
"%s_fluxErr" % (filterName,),
121 doc=
"flux uncertainty in filter %s" % (filterName,),
128 doc=
"set if the object can be used for photometric calibration",
134 doc=
"set if the object is spatially resolved",
140 doc=
"set if the object has variable brightness",
142 lsst.afw.table.CovarianceMatrix2fKey.addFields(
146 units=[
"rad",
"rad"],
150 if fullPositionInformation:
154 doc=
"date of observation (TAI, MJD)",
160 doc=
"proper motion in the right ascension direction = dra/dt * cos(dec)",
166 doc=
"proper motion in the declination direction",
169 lsst.afw.table.CovarianceMatrix2fKey.addFields(
173 units=[
"rad/year",
"rad/year"],
179 doc=
"Set if proper motion or proper motion error is bad",
190 doc=
"uncertainty in parallax",
194 field=
"parallax_flag",
196 doc=
"Set if parallax or parallax error is bad",
199 fields = [
"coord_ra",
"coord_dec",
"pm_ra",
"pm_dec",
"parallax"]
200 units = [
"rad",
"rad",
"rad/year",
"rad/year",
"rad"]
201 for field, unit
in zip(itertools.combinations(fields, r=2), itertools.combinations(units, r=2)):
206 formatted_unit =
"rad^2"
207 if (
"year" in i_unit)
and (
"year" in j_unit):
208 formatted_unit +=
"/year^2"
209 elif (
"year" in i_unit)
or (
"year" in j_unit):
210 formatted_unit +=
"/year"
212 field=f
"{i_field}_{j_field}_Cov",
214 doc=f
"Covariance between {i_field} and {j_field}",
221 """Description of the on-disk storage format for the converted reference
224 format_version = pexConfig.Field(
226 doc="Version number of the persisted on-disk storage format."
227 "\nVersion 0 had Jy as flux units (default 0 for unversioned catalogs)."
228 "\nVersion 1 had nJy as flux units."
229 "\nVersion 2 had position-related covariances.",
232 ref_dataset_name = pexConfig.Field(
234 doc=
"Name of this reference catalog; this should match the name used during butler ingest.",
236 indexer = IndexerRegistry.makeField(
238 doc=
'Name of indexer algoritm to use. Default is HTM',
243 dataset_config = pexConfig.ConfigField(
245 doc=
"Configuration for reading the ingested data",
247 n_processes = pexConfig.Field(
249 doc=(
"Number of python processes to use when ingesting."),
252 manager = pexConfig.ConfigurableField(
254 doc=
"Multiprocessing manager to perform the actual conversion of values, file-by-file."
256 file_reader = pexConfig.ConfigurableField(
257 target=ReadTextCatalogTask,
258 doc=
'Task to use to read the files. Default is to expect text files.'
260 ra_name = pexConfig.Field(
262 doc=
"Name of RA column (values in decimal degrees)",
264 dec_name = pexConfig.Field(
266 doc=
"Name of Dec column (values in decimal degrees)",
268 ra_err_name = pexConfig.Field(
270 doc=
"Name of RA error column",
273 dec_err_name = pexConfig.Field(
275 doc=
"Name of Dec error column",
278 coord_err_unit = pexConfig.Field(
280 doc=
"Unit of RA/Dec error fields (astropy.unit.Unit compatible)",
283 mag_column_list = pexConfig.ListField(
285 doc=
"The values in the reference catalog are assumed to be in AB magnitudes. "
286 "List of column names to use for photometric information. At least one entry is required."
288 mag_err_column_map = pexConfig.DictField(
292 doc=
"A map of magnitude column name (key) to magnitude error column (value)."
294 is_photometric_name = pexConfig.Field(
297 doc=
'Name of column stating if satisfactory for photometric calibration (optional).'
299 is_resolved_name = pexConfig.Field(
302 doc=
'Name of column stating if the object is resolved (optional).'
304 is_variable_name = pexConfig.Field(
307 doc=
'Name of column stating if the object is measured to be variable (optional).'
309 id_name = pexConfig.Field(
312 doc=
'Name of column to use as an identifier (optional).'
314 pm_ra_name = pexConfig.Field(
316 doc=
"Name of proper motion RA column",
319 pm_dec_name = pexConfig.Field(
321 doc=
"Name of proper motion Dec column",
324 pm_ra_err_name = pexConfig.Field(
326 doc=
"Name of proper motion RA error column",
329 pm_dec_err_name = pexConfig.Field(
331 doc=
"Name of proper motion Dec error column",
334 pm_scale = pexConfig.Field(
336 doc=
"Scale factor by which to multiply proper motion values to obtain units of milliarcsec/year",
339 parallax_name = pexConfig.Field(
341 doc=
"Name of parallax column",
344 parallax_err_name = pexConfig.Field(
346 doc=
"Name of parallax error column",
349 parallax_scale = pexConfig.Field(
351 doc=
"Scale factor by which to multiply parallax values to obtain units of milliarcsec",
354 full_position_information = pexConfig.Field(
356 doc=
"Include epoch, proper motions, parallax, and covariances between sky coordinates, proper motion,"
357 " and parallax in the schema. If true, a custom ``ConvertRefcatManager`` class must exist to"
358 " compute the output covariances.",
361 epoch_name = pexConfig.Field(
363 doc=
"Name of epoch column",
366 epoch_format = pexConfig.Field(
368 doc=
"Format of epoch column: any value accepted by astropy.time.Time, e.g. 'iso' or 'unix'",
371 epoch_scale = pexConfig.Field(
373 doc=
"Scale of epoch column: any value accepted by astropy.time.Time, e.g. 'utc'",
376 extra_col_names = pexConfig.ListField(
379 doc=
'Extra columns to add to the reference catalog.'
391 def assertAllOrNone(*names):
392 """Raise ValueError unless all the named fields are set or are
395 setNames = [name for name
in names
if bool(getattr(self, name))]
396 if len(setNames)
in (len(names), 0):
398 prefix =
"Both or neither" if len(names) == 2
else "All or none"
399 raise ValueError(
"{} of {} must be set, but only {} are set".format(
400 prefix,
", ".join(names),
", ".join(setNames)))
404 "ra_name and dec_name and at least one entry in mag_column_list must be supplied.")
407 "mag_err_column_map specified, but keys do not match mag_column_list: {} != {}".format(
409 assertAllOrNone(
"ra_err_name",
"dec_err_name",
"coord_err_unit")
411 result = astropy.units.Unit(self.
coord_err_unit, parse_strict=
'silent')
412 if isinstance(result, astropy.units.UnrecognizedUnit):
413 msg = f
"{self.coord_err_unit} is not a valid astropy unit string."
414 raise pexConfig.FieldValidationError(ConvertReferenceCatalogConfig.coord_err_unit, self, msg)
416 assertAllOrNone(
"epoch_name",
"epoch_format",
"epoch_scale")
417 assertAllOrNone(
"pm_ra_name",
"pm_dec_name")
418 assertAllOrNone(
"pm_ra_err_name",
"pm_dec_err_name")
419 assertAllOrNone(
"parallax_name",
"parallax_err_name")
421 raise ValueError(
'"pm_ra/dec_name" must be specified if "pm_ra/dec_err_name" are specified')
424 '"epoch_name" must be specified if "pm_ra/dec_name" or "parallax_name" are specified')
430 assertAllOrNone(
"full_position_information",
431 "ra_err_name",
"dec_err_name",
"coord_err_unit",
432 "epoch_name",
"epoch_format",
"epoch_scale",
433 "pm_ra_name",
"pm_dec_name",
434 "pm_ra_err_name",
"pm_dec_err_name",
435 "parallax_name",
"parallax_err_name"
440 """Class for producing HTM-indexed reference catalogs from external
443 This implements an indexing scheme based on hierarchical triangular
444 mesh (HTM). The term index really means breaking the catalog into
445 localized chunks called shards. In this case each shard contains
446 the entries from the catalog
in a single HTM trixel
448 For producing catalogs this task makes the following assumptions
449 about the input catalogs:
451 - RA, Dec are
in decimal degrees.
452 - Epoch
is available
in a column,
in a format supported by astropy.time.Time.
453 - There are either no off-diagonal covariance terms,
or there are all the
454 five-dimensional covariance terms (between RA, Dec, proper motion,
and
455 parallax). In the latter case, a custom ``ConvertRefcatManager`` must
456 exist to handle the covariance terms.
461 The path to write the output files to,
in a subdirectory defined by
462 ``DatasetConfig.ref_dataset_name``.
464 canMultiprocess = False
465 ConfigClass = ConvertReferenceCatalogConfig
466 _DefaultName =
'ConvertReferenceCatalogTask'
470 if output_dir
is None:
471 raise RuntimeError(
"Must specify output_dir.")
473 self.
output_dir = os.path.join(output_dir, self.config.dataset_config.ref_dataset_name)
475 self.
indexer = IndexerRegistry[self.config.dataset_config.indexer.name](
476 self.config.dataset_config.indexer.active)
477 self.makeSubtask(
'file_reader')
479 def run(self, inputFiles):
480 """Index a set of files comprising a reference catalog.
482 Outputs are persisted in the butler repository.
487 A list of file paths to read.
491 pathlib.Path(self.
output_dir).mkdir(exist_ok=
False)
497 worker = self.config.manager.target(filenames,
506 result = worker.run(inputFiles)
512 """Write the astropy table containing the htm->filename relationship,
513 used for the ``butler ingest-files`` command after this task completes.
515 dimension = f"htm{self.config.dataset_config.indexer.active.depth}"
516 table = astropy.table.Table(names=(
"filename", dimension), dtype=(
'str',
'int'))
518 table.add_row((result[key], key))
522 """Write the config that was used to generate the refcat."""
523 filename = os.path.join(self.
output_dir,
"config.py")
524 with open(filename,
'w')
as file:
525 self.config.dataset_config.saveToStream(file)
528 """Get filenames from the butler for each output htm pixel.
533 The HTM pixelization scheme to be used to build filenames.
537 filenames : `list [str]`
538 List of filenames to write each HTM pixel to.
541 start, end = htm.universe()[0]
542 path = os.path.join(self.output_dir, f"{self.indexer.htm}.fits")
543 base = os.path.join(os.path.dirname(path),
"%d"+os.path.splitext(path)[1])
544 for pixelId
in range(start, end):
545 filenames[pixelId] = base % pixelId
550 """Make the schema to use in constructing the persisted catalogs.
554 dtype : `numpy.dtype`
555 Data type describing each entry in ``config.extra_col_names``
556 for the catalogs being ingested.
561 A tuple containing two items:
562 - The schema
for the output source catalog.
563 - A map of catalog keys to use
in filling the record
567 filterNameList=self.config.mag_column_list,
569 addIsPhotometric=bool(self.config.is_photometric_name),
570 addIsResolved=bool(self.config.is_resolved_name),
571 addIsVariable=bool(self.config.is_variable_name),
572 fullPositionInformation=self.config.full_position_information,
574 keysToSkip = set((
"id",
"centroid_x",
"centroid_y",
"hasCentroid"))
575 key_map = {fieldName: schema[fieldName].asKey()
for fieldName
in schema.getOrderedNames()
576 if fieldName
not in keysToSkip}
579 if dtype[name].kind ==
'U':
581 at_size = dtype[name].itemsize
582 return schema.addField(name, type=str, size=at_size)
584 at_type = dtype[name].type
585 return schema.addField(name, at_type)
587 for col
in self.config.extra_col_names:
588 key_map[col] = addField(col)
589 return schema, key_map
592 """Generate and save the master catalog schema.
597 An input file to read to get the input dtype.
599 arr = self.file_reader.run(inputfile)
604 outputfile = os.path.join(self.output_dir, "master_schema.fits")
605 catalog.writeFits(outputfile)
606 return schema, key_map
611 kwargs[
'output_dir'] = self.
base_dir
616 """Construct an argument parser for the ``convertReferenceCatalog`` script.
620 argparser : `argparse.ArgumentParser`
621 The argument parser that defines the ``convertReferenceCatalog``
622 command-line interface.
624 parser = argparse.ArgumentParser(
626 formatter_class=argparse.RawDescriptionHelpFormatter,
627 epilog='More information is available at https://pipelines.lsst.io.'
629 parser.add_argument(
"outputDir",
630 help=
"Path to write the output shard files, configs, and `ingest-files` table to.")
631 parser.add_argument(
"configFile",
632 help=
"File containing the ConvertReferenceCatalogConfig fields.")
635 parser.add_argument(
"fileglob", nargs=
"+",
636 help=
"Quoted glob for the files to be read in and converted."
637 " Example (note required quotes to prevent shell expansion):"
638 ' "gaia_source/csv/GaiaSource*"')
643 """Run `ConvertReferenceCatalogTask` on the input arguments.
648 Path to write the output files to.
650 File specifying the ``ConvertReferenceCatalogConfig`` fields.
652 Quoted glob for the files to be read
in and converted.
655 logging.basicConfig(level=logging.INFO, format=
"{name} {levelname}: {message}", style=
"{")
657 config = ConvertReferenceCatalogTask.ConfigClass()
658 config.load(configFile)
660 files = glob.glob(fileglob)
662 with open(os.path.join(outputDir,
"convertReferenceCatalogConfig.py"),
"w")
as outfile:
663 converter.config.saveToStream(outfile)
664 msg = (
"Completed refcat conversion.\n\n"
665 "Ingest the resulting files with the following commands, substituting the path\n"
666 "to your butler repo for `REPO`, and the ticket number you are tracking this\n"
667 "ingest on for `DM-NNNNN`:\n"
668 f
"\n butler register-dataset-type REPO {config.dataset_config.ref_dataset_name} "
670 "\n butler ingest-files -t direct REPO gaia_dr2 refcats/DM-NNNNN "
671 f
"{converter.ingest_table_file}"
672 "\n butler collection-chain REPO --mode extend refcats refcats/DM-NNNNN")
678 if len(args.fileglob) > 1:
679 raise RuntimeError(
"Final argument must be a quoted file glob, not a shell-expanded list of files.")
681 run_convert(args.outputDir, args.configFile, args.fileglob[0])
static Schema makeMinimalSchema()
pexConfig mag_err_column_map
pexConfig full_position_information
pexConfig mag_column_list
def _getOutputFilenames(self, htm)
def _writeMasterSchema(self, inputfile)
def makeSchema(self, dtype)
def run(self, inputFiles)
def __init__(self, *output_dir=None, **kwargs)
def _writeIngestHelperFile(self, result)
def addRefCatMetadata(catalog)
def run_convert(outputDir, configFile, fileglob)
def _makeSchema(filterNameList, *addCentroid=False, addIsPhotometric=False, addIsResolved=False, addIsVariable=False, fullPositionInformation=False)