# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ParquetFormatter",)

import json
import re
import collections.abc
import itertools
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Union,
)

import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

from lsst.daf.butler.core.utils import iterable
from lsst.daf.butler import Formatter


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
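
    Examples
    --------
    A minimal usage sketch; the path shown is hypothetical::

        loader = _ParquetLoader("/path/to/file.parq")
        df = loader.read()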

57 """ 

58 

59 def __init__(self, path: str): 

60 self.file = pq.ParquetFile(path) 

61 self.md = json.loads(self.file.metadata.metadata[b"pandas"]) 

62 indexes = self.md["column_indexes"] 

63 if len(indexes) == 1: 

64 self.columns = pd.Index(name for name in self.file.metadata.schema.names 

65 if not name.startswith("__")) 

66 else: 

67 raw_columns = list(self._splitColumnnNames(len(indexes), self.file.metadata.schema.names)) 

68 self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes]) 

69 self.indexLevelNames = tuple(self.columns.names) 

70 

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Sequence[str]]:
        """Split a string that represents a multi-index column.

        PyArrow maps Pandas' multi-index column names (which are tuples in
        Python) to flat strings on disk. This routine exists to
        reconstruct the original tuple.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
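
        Examples
        --------
        A minimal doctest sketch; the level values are hypothetical, but
        the flattened string matches the form PyArrow writes for a
        two-level column index:

        >>> list(_ParquetLoader._splitColumnNames(2, ["('g', 'psfFlux')"]))
        [('g', 'psfFlux')]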

91 """ 

92 pattern = re.compile(r"\({}\)".format(', '.join(["'(.*)'"] * n))) 

93 for name in names: 

94 m = re.search(pattern, name) 

95 if m is not None: 

96 yield m.groups() 

97 

    def _standardizeColumnParameter(self, columns: Dict[str, Union[str, List[str]]]) -> Iterator[str]:
        """Transform a dictionary that indexes into a multi-index column
        into the flat column-name strings directly understandable by
        PyArrow.

        Parameters
        ----------
        columns : `dict`
            Dictionary whose keys are string multi-index level names
            and whose values are the value or values (as a list) for that
            level.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
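
        Examples
        --------
        A minimal doctest sketch that bypasses ``__init__`` purely for
        illustration; the level and column names are hypothetical:

        >>> loader = object.__new__(_ParquetLoader)
        >>> loader.columns = pd.MultiIndex.from_tuples(
        ...     [("g", "psfFlux"), ("r", "psfFlux")],
        ...     names=["band", "column"])
        >>> loader.indexLevelNames = ("band", "column")
        >>> list(loader._standardizeColumnParameter({"band": "g"}))
        ["('g', 'psfFlux')"]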

113 """ 

114 if not isinstance(columns, collections.abc.Mapping): 114 ↛ 115line 114 didn't jump to line 115, because the condition on line 114 was never true

115 raise ValueError("columns parameter for multi-index data frame must be a dictionary.") 

116 if not set(self.indexLevelNames).issuperset(columns.keys()): 116 ↛ 117line 116 didn't jump to line 117, because the condition on line 116 was never true

117 raise ValueError(f"Cannot use dict with keys {set(columns.keys())} " 

118 f"to select columns from {self.indexLevelNames}.") 

119 assert isinstance(self.columns, pd.MultiIndex) 

120 factors = [iterable(columns.get(level, self.columns.levels[i])) 

121 for i, level in enumerate(self.indexLevelNames)] 

122 for requested in itertools.product(*factors): 

123 for i, value in enumerate(requested): 

124 if value not in self.columns.levels[i]: 124 ↛ 125line 124 didn't jump to line 125, because the condition on line 124 was never true

125 raise ValueError(f"Unrecognized value {value!r} for index {self.indexLevelNames[i]!r}.") 

126 yield str(requested) 

127 

    def read(self, columns: Optional[Union[str, List[str], Dict[str, Union[str, List[str]]]]] = None
             ) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded. See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame containing the requested columns.
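
        Examples
        --------
        A sketch of the supported ``columns`` forms; the path and column
        names here are hypothetical::

            loader = _ParquetLoader("/path/to/file.parq")
            full = loader.read()                          # whole table
            some = loader.read(columns=["a", "b"])        # flat column index
            multi = loader.read(columns={"level": "x"})   # multi-index levels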

143 """ 

144 if columns is None: 

145 return self.file.read(use_pandas_metadata=True).to_pandas() 

146 elif isinstance(self.columns, pd.MultiIndex): 

147 assert isinstance(columns, dict) 

148 columns = list(self._standardizeColumnParameter(columns)) 

149 else: 

150 for column in columns: 

151 if column not in self.columns: 

152 raise ValueError(f"Unrecognized column name {column!r}.") 

153 return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas() 


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame) -> None:
    """Write a `pandas.DataFrame` instance as a Parquet file.

158 """ 

159 table = pa.Table.from_pandas(inMemoryDataset) 

160 pq.write_table(table, path, compression='none') 

161 

162 

class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from Parquet
    files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
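
    Examples
    --------
    A hedged sketch of typical access through a `~lsst.daf.butler.Butler`;
    the dataset type name, data ID, and column selection are hypothetical::

        df = butler.get("table", dataId, parameters={"columns": ["a", "b"]})
        columns = butler.get("table.columns", dataId)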

169 """ 

170 extension = ".parq" 

171 

172 def read(self, component: Optional[str] = None) -> Any: 

173 # Docstring inherited from Formatter.read. 

174 loader = _ParquetLoader(self.fileDescriptor.location.path) 

175 if component == 'columns': 

176 return loader.columns 

177 

178 if not self.fileDescriptor.parameters: 

179 return loader.read() 

180 

181 return loader.read(**self.fileDescriptor.parameters) 

182 

183 def write(self, inMemoryDataset: Any) -> None: 

184 # Docstring inherited from Formatter.write. 

185 location = self.makeUpdatedLocation(self.fileDescriptor.location) 

186 _writeParquet(location.path, inMemoryDataset)