lsst.meas.algorithms gbe01a4569f+ddc32327cb
convertReferenceCatalog.py
Go to the documentation of this file.
1# This file is part of meas_algorithms.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
21
22"""
23Convert an external reference catalog into the hierarchical triangular mesh
24(HTM) sharded LSST-style format, to be ingested into the butler.
25"""
26
27__all__ = ["ConvertReferenceCatalogTask"]
28
29import argparse
30import glob
31import os
32import pathlib
33import logging
34
35import astropy
36
37from . import ConvertReferenceCatalogBase
38
39
class ConvertReferenceCatalogTask(ConvertReferenceCatalogBase):
    """Class for producing HTM-indexed reference catalogs from external
    catalog data.

    Parameters
    ----------
    output_dir : `str`
        The path to write the output files to, in a subdirectory defined by
        ``DatasetConfig.ref_dataset_name``.
    **kwargs
        Passed unchanged to the base class constructor.

    Raises
    ------
    RuntimeError
        Raised if ``output_dir`` is not specified.
    """
    _DefaultName = 'ConvertReferenceCatalogTask'

    def __init__(self, *, output_dir=None, **kwargs):
        super().__init__(**kwargs)
        if output_dir is None:
            raise RuntimeError("Must specify output_dir.")
        # Keep the directory exactly as given: `_reduce_kwargs` hands it back
        # so that the task can be re-created with the same argument.
        self.base_dir = output_dir
        # The converted files go in a subdirectory named after the refcat.
        self.output_dir = os.path.join(output_dir, self.config.dataset_config.ref_dataset_name)
        # The filename<->htm table lives next to (not inside) that subdirectory.
        self.ingest_table_file = os.path.join(self.base_dir, "filename_to_htm.ecsv")

    def _preRun(self):
        """Create the output subdirectory, refusing to reuse an existing one.

        Raises
        ------
        FileExistsError
            Raised if ``self.output_dir`` already exists.
        """
        # Create the output path, if it doesn't exist; fail if the path exists:
        # we don't want to accidentally append to existing files.
        pathlib.Path(self.output_dir).mkdir(exist_ok=False)

    def _postRun(self, result):
        """Write the table relating each output filename to its htm index.

        Parameters
        ----------
        result : mapping
            Iterated as ``key -> filename``; each ``key`` is stored in the
            integer ``htm<depth>`` column and each value in the ``filename``
            column.
        """
        # `import astropy` alone does not guarantee that the `astropy.table`
        # subpackage has been loaded, so import it explicitly before use.
        import astropy.table

        # Write the astropy table containing the htm->filename relationship
        dimension = f"htm{self.config.dataset_config.indexer.active.depth}"
        table = astropy.table.Table(names=("filename", dimension), dtype=('str', 'int'))
        for key in result:
            table.add_row((result[key], key))
        table.write(self.ingest_table_file)

    def _persistConfig(self):
        """Save the dataset configuration as ``config.py`` in the output
        subdirectory.
        """
        filename = os.path.join(self.output_dir, "config.py")
        with open(filename, 'w') as file:
            self.config.dataset_config.saveToStream(file)

    def _getOnePixelFilename(self, start):
        """Return the path of a ``.fits`` file in the output subdirectory.

        Parameters
        ----------
        start
            Not used by this implementation.

        Returns
        -------
        filename : `str`
            Path built from ``self.output_dir`` and ``self.indexer.htm``.
        """
        # NOTE(review): `self.indexer` is not set in this class; presumably it
        # is provided by the base class -- confirm.
        return os.path.join(self.output_dir, f"{self.indexer.htm}.fits")

    def _writeMasterSchema(self, catalog):
        """Write ``catalog`` to ``master_schema.fits`` in the output
        subdirectory.

        Parameters
        ----------
        catalog
            Object providing a ``writeFits(filename)`` method.
        """
        filename = os.path.join(self.output_dir, "master_schema.fits")
        catalog.writeFits(filename)

    def _reduce_kwargs(self):
        """Return the base class keyword arguments plus ``output_dir``.

        Returns
        -------
        kwargs : `dict`
            The base class result with ``output_dir`` set to the directory
            originally passed to the constructor.
        """
        # Need to be able to pickle this class to use the multiprocess manager.
        kwargs = super()._reduce_kwargs()
        kwargs['output_dir'] = self.base_dir
        return kwargs
90
91
def build_argparser():
    """Construct an argument parser for the ``convertReferenceCatalog`` script.

    The parser takes three positional arguments: ``outputDir``,
    ``configFile`` and ``fileglob``. ``fileglob`` is collected as a list so
    that the caller can detect a glob that the shell expanded into several
    paths.

    Returns
    -------
    argparser : `argparse.ArgumentParser`
        The argument parser that defines the ``convertReferenceCatalog``
        command-line interface.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='More information is available at https://pipelines.lsst.io.'
    )
    parser.add_argument("outputDir",
                        help="Path to write the output shard files, configs, and `ingest-files` table to.")
    parser.add_argument("configFile",
                        help="File containing the ConvertReferenceCatalogConfig fields.")
    # Use a "+"-list here, so we can produce a more useful error if the user
    # uses an unquoted glob that gets shell expanded.
    parser.add_argument("fileglob", nargs="+",
                        help="Quoted glob for the files to be read in and converted."
                        " Example (note required quotes to prevent shell expansion):"
                        ' "gaia_source/csv/GaiaSource*"')
    return parser
117
118
def run_convert(outputDir, configFile, fileglob):
    """Run `ConvertReferenceCatalogTask` on the input arguments.

    After the conversion, the full task configuration is saved to
    ``convertReferenceCatalogConfig.py`` in ``outputDir`` and the butler
    commands needed to ingest the converted files are printed.

    Parameters
    ----------
    outputDir : `str`
        Path to write the output files to.
    configFile : `str`
        File specifying the ``ConvertReferenceCatalogConfig`` fields.
    fileglob : `str`
        Quoted glob for the files to be read in and converted.

    Raises
    ------
    RuntimeError
        Raised if ``fileglob`` does not match any files.
    """
    # We have to initialize the logger manually when running from the commandline.
    logging.basicConfig(level=logging.INFO, format="{name} {levelname}: {message}", style="{")

    config = ConvertReferenceCatalogTask.ConfigClass()
    config.load(configFile)
    config.validate()
    converter = ConvertReferenceCatalogTask(output_dir=outputDir, config=config)
    files = glob.glob(fileglob)
    if not files:
        # Fail before anything is written, instead of "converting" nothing
        # and then reporting success.
        raise RuntimeError(f"No files matched the glob: {fileglob!r}")
    converter.run(files)
    with open(os.path.join(outputDir, "convertReferenceCatalogConfig.py"), "w") as outfile:
        converter.config.saveToStream(outfile)

    # Build the suggested commands from the configuration that was actually
    # used, so the dataset type name and htm dimension match the table that
    # the task wrote.
    name = config.dataset_config.ref_dataset_name
    dimension = f"htm{config.dataset_config.indexer.active.depth}"
    msg = ("Completed refcat conversion."
           " Ingest the resulting files with the following commands,"
           " substituting the path to your butler repo for REPO:"
           f"\n butler register-dataset-type REPO {name} "
           f"SimpleCatalog {dimension}"
           f"\n butler ingest-files -t direct REPO {name} refcats {converter.ingest_table_file}")
    print(msg)
149
150
def main():
    """Parse the command line and run the reference catalog conversion.

    Raises
    ------
    RuntimeError
        Raised if more than one ``fileglob`` value was received, i.e. the
        glob was expanded by the shell instead of being quoted.
    """
    parsed = build_argparser().parse_args()
    patterns = parsed.fileglob
    # The parser collects ``fileglob`` into a list, so a shell-expanded glob
    # shows up here as more than one entry.
    if len(patterns) > 1:
        raise RuntimeError("Final argument must be a quoted file glob, not a shell-expanded list of files.")
    run_convert(parsed.outputDir, parsed.configFile, patterns[0])
def run_convert(outputDir, configFile, fileglob)