# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ParquetFormatter",)

import json
import re
import collections.abc
import itertools
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
)

import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

from lsst.daf.butler.core.utils import iterable
from lsst.daf.butler import Formatter


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
    """

    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        if len(indexes) == 1:
            self.columns = pd.Index(name for name in self.file.metadata.schema.names
                                    if not name.startswith("__"))
        else:
            raw_columns = list(self._splitColumnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)
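    # A sketch of the metadata consulted above (illustrative; most fields
    # elided, level names hypothetical): pyarrow.Table.from_pandas() embeds
    # a JSON blob under the b"pandas" key of the Parquet schema metadata,
    # whose "column_indexes" entry has one element per level of the
    # DataFrame's column index, e.g.
    #
    #     {"column_indexes": [{"name": "band", ...}, {"name": "column", ...}],
    #      ...}
    #
    # A single element therefore means a plain column index; more than one
    # means a MultiIndex whose level names come from the "name" fields.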

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Tuple[str, ...]]:
        """Split a string that represents a multi-index column.

        PyArrow maps Pandas' multi-index column names (which are tuples in
        Python) to flat strings on disk. This routine exists to
        reconstruct the original tuple.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
        """
        pattern = re.compile(r"\({}\)".format(', '.join(["'(.*)'"] * n)))
        for name in names:
            m = pattern.search(name)
            if m is not None:
                yield m.groups()
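    # Illustrative example of the flattening this undoes (hypothetical
    # column names): a two-level MultiIndex column ("dipole", "g") is stored
    # on disk under the flat name "('dipole', 'g')". With n=2 the pattern
    # built above is r"\('(.*)', '(.*)'\)", so that name yields the tuple
    # ("dipole", "g"), while non-matching names such as "__index_level_0__"
    # are skipped.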

    def _standardizeColumnParameter(self, columns: Dict[str, Union[str, List[str]]]) -> Iterator[str]:
        """Transform a dictionary that indexes into a multi-index column
        into the flat strings directly understandable by PyArrow.

        Parameters
        ----------
        columns : `dict`
            Dictionary whose keys are string multi-index level names
            and whose values are the value or values (as a list) for that
            level.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
        """
        if not isinstance(columns, collections.abc.Mapping):
            raise ValueError("columns parameter for multi-index data frame must be a dictionary.")
        if not set(self.indexLevelNames).issuperset(columns.keys()):
            raise ValueError(f"Cannot use dict with keys {set(columns.keys())} "
                             f"to select columns from {self.indexLevelNames}.")
        factors = [iterable(columns.get(level, self.columns.levels[i]))
                   for i, level in enumerate(self.indexLevelNames)]
        for requested in itertools.product(*factors):
            for i, value in enumerate(requested):
                if value not in self.columns.levels[i]:
                    raise ValueError(f"Unrecognized value {value!r} for index {self.indexLevelNames[i]!r}.")
            yield str(requested)
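    # Worked example (hypothetical level names and values): with
    # indexLevelNames == ("band", "column") and levels ("g", "r") and
    # ("ra", "dec"), passing {"band": "g"} leaves the "column" level
    # unconstrained, so the generator yields "('g', 'ra')" and
    # "('g', 'dec')": the flat names under which PyArrow stored those
    # columns.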

    def read(self, columns: Optional[Union[str, List[str], Dict[str, Union[str, List[str]]]]] = None
             ) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded. See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame.
        """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            columns = list(self._standardizeColumnParameter(columns))
        else:
            # Wrap a bare string so a single column name is not iterated
            # character by character.
            columns = list(iterable(columns))
            for column in columns:
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas()
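    # The accepted forms of ``columns``, assuming the hypothetical schemas
    # from the comments above:
    #
    #     loader.read()                          # whole file
    #     loader.read(columns="ra")              # single column, plain index
    #     loader.read(columns=["ra", "dec"])     # several columns, plain index
    #     loader.read(columns={"band": ["g"]})   # MultiIndex, selected by level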


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame) -> None:
    """Write a `pandas.DataFrame` instance as a Parquet file.

    Parameters
    ----------
    path : `str`
        Full path to which to write the file.
    inMemoryDataset : `pandas.DataFrame`
        DataFrame to write.
    """
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path, compression='none')


class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from Parquet
    files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
    """
    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == 'columns':
            return loader.columns

        if not self.fileDescriptor.parameters:
            return loader.read()

        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> str:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)
        return location.pathInStore
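
# Illustrative round trip (not part of the original module). ParquetFormatter
# is normally driven by the butler through a FileDescriptor, so this sketch
# exercises the module-level helpers directly; the column names and file name
# are hypothetical.
if __name__ == "__main__":
    import os
    import tempfile

    df = pd.DataFrame({"ra": [10.0, 20.0], "dec": [-5.0, 5.0]})
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "example.parq")
        _writeParquet(path, df)
        loader = _ParquetLoader(path)
        # Reading with no arguments restores the full DataFrame; a list
        # selects a subset of columns by name.
        assert list(loader.read(columns=["ra"]).columns) == ["ra"]
        print(loader.read())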