# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("ParquetFormatter",)

import json
import re
import collections.abc
import itertools
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
)

import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

from lsst.daf.butler.core.utils import iterable
from lsst.daf.butler import Formatter


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
    """

    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        if len(indexes) == 1:
            self.columns = pd.Index(name for name in self.file.metadata.schema.names
                                    if not name.startswith("__"))
        else:
            raw_columns = list(self._splitColumnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)
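
    # Illustrative aside, not part of the original module: the b"pandas"
    # metadata block written by PyArrow is (roughly, as an assumption about
    # the library's on-disk layout) a JSON document of the form
    #
    #     {"index_columns": [...], "column_indexes": [...],
    #      "columns": [...], "pandas_version": "..."}
    #
    # so the length of "column_indexes" above distinguishes a flat column
    # index (one entry) from a MultiIndex (one entry per level).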

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Tuple[str, ...]]:
        """Split a string that represents a multi-index column.

        PyArrow maps Pandas' multi-index column names (which are tuples in
        Python) to flat strings on disk. This routine exists to
        reconstruct the original tuple.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
        """
        pattern = re.compile(r"\({}\)".format(', '.join(["'(.*)'"] * n)))
        for name in names:
            m = pattern.search(name)
            if m is not None:
                yield m.groups()
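
    # A minimal sketch of what _splitColumnNames recovers (the column names
    # here are hypothetical): with two index levels, PyArrow flattens the
    # tuple ('flux', 'g') to the literal string "('flux', 'g')", so
    #
    #     list(_ParquetLoader._splitColumnNames(2, ["('flux', 'g')"]))
    #
    # yields [('flux', 'g')], while names that do not match the pattern
    # (e.g. "__index_level_0__") are skipped.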

    def _standardizeColumnParameter(self, columns: Dict[str, Union[str, List[str]]]) -> Iterator[str]:
        """Transform a dictionary that selects on multi-index levels into
        the flattened column-name strings understood by PyArrow.

        Parameters
        ----------
        columns : `dict`
            Dictionary whose keys are multi-index level names and whose
            values are the value or values (as a list) to select for that
            level.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
        """
        if not isinstance(columns, collections.abc.Mapping):
            raise ValueError("columns parameter for multi-index data frame must be a dictionary.")
        if not set(self.indexLevelNames).issuperset(columns.keys()):
            raise ValueError(f"Cannot use dict with keys {set(columns.keys())} "
                             f"to select columns from {self.indexLevelNames}.")
        factors = [iterable(columns.get(level, self.columns.levels[i]))
                   for i, level in enumerate(self.indexLevelNames)]
        for requested in itertools.product(*factors):
            for i, value in enumerate(requested):
                if value not in self.columns.levels[i]:
                    raise ValueError(f"Unrecognized value {value!r} for index {self.indexLevelNames[i]!r}.")
            yield str(requested)
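
    # For illustration (level and column names are hypothetical): for a file
    # whose MultiIndex levels are ("band", "column"), the parameter
    #
    #     {"band": ["g", "r"], "column": "flux"}
    #
    # expands via itertools.product to the stringified tuples "('g', 'flux')"
    # and "('r', 'flux')", matching the flattened names PyArrow wrote to disk;
    # a level omitted from the dict defaults to all of its values.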

    def read(self, columns: Optional[Union[str, List[str], Dict[str, Union[str, List[str]]]]] = None
             ) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded. See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame.
        """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            columns = list(self._standardizeColumnParameter(columns))
        else:
            # Wrap a bare string so it selects one column rather than
            # iterating over its characters.
            columns = list(iterable(columns))
            for column in columns:
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas()
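
    # A usage sketch for this helper (path and column names are invented):
    #
    #     loader = _ParquetLoader("/tmp/example.parq")
    #     loader.columns                         # pd.Index or pd.MultiIndex
    #     loader.read(columns=["flux", "band"])  # flat column index
    #     loader.read(columns={"band": "g"})     # multi-index columns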


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame):
    """Write a `pandas.DataFrame` instance as a Parquet file.
    """
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path, compression='none')
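
# Round-trip sketch (illustrative only; the DataFrame is invented):
#
#     df = pd.DataFrame({"flux": [1.0, 2.0]})
#     _writeParquet("/tmp/example.parq", df)
#     _ParquetLoader("/tmp/example.parq").read()  # restores df, index and all
#
# pa.Table.from_pandas embeds the pandas metadata that _ParquetLoader later
# uses to rebuild the original index and column structure.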


class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from Parquet
    files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
    """
    extension = ".parq"

    def read(self, component: Optional[str] = None) -> object:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == 'columns':
            return loader.columns

        if not self.fileDescriptor.parameters:
            return loader.read()

        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> str:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)
        return location.pathInStore
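
# How this formatter is typically exercised via the Butler (a sketch: the
# dataset type, data ID, and parameter values are hypothetical, and it assumes
# the DataFrame StorageClass is configured to use ParquetFormatter):
#
#     butler.put(df, "sourceTable", dataId)              # -> write()
#     butler.get("sourceTable", dataId,
#                parameters={"columns": ["flux"]})       # -> read()
#     butler.get("sourceTable.columns", dataId)          # 'columns' component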