Coverage for python/lsst/daf/butler/formatters/parquet.py: 95%

70 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ParquetFormatter",)

import collections.abc
import itertools
import json
import re
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Union,
)

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from lsst.daf.butler import Formatter
from lsst.daf.butler.core.utils import iterable


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
    """
    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        if len(indexes) <= 1:
            self.columns = pd.Index(
                name for name in self.file.metadata.schema.names if not name.startswith("__")
            )
        else:
            raw_columns = list(self._splitColumnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Sequence[str]]:
        """Split flattened strings that represent multi-index column names.

        PyArrow maps Pandas' multi-index column names (which are tuples in
        Python) to flat strings on disk.  This routine reconstructs the
        original tuples.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
        """
        pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))
        for name in names:
            m = pattern.search(name)
            if m is not None:
                yield m.groups()
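    # A minimal sketch of the flattening round trip (hypothetical column
    # names, not taken from any real dataset): PyArrow stores the pandas
    # multi-index column ("flux", "g") on disk as the flat string
    # "('flux', 'g')", which the pattern above undoes.
    #
    #     >>> list(_ParquetLoader._splitColumnNames(2, ["('flux', 'g')", "__index__"]))
    #     [('flux', 'g')]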

    def _standardizeColumnParameter(
        self, columns: Union[List[str], List[tuple], Dict[str, Union[str, List[str]]]]
    ) -> Iterator[str]:
        """Transform a dictionary or list of multi-index column selections
        into the flat string names understood by PyArrow.

        Parameters
        ----------
        columns : `dict` or `list` of `tuple`
            Dictionary whose keys are string multi-index level names
            and whose values are the value or values (as a list) for that
            level, or a list of multi-index column name tuples.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
        """
        if isinstance(columns, list):
            for requested in columns:
                if not isinstance(requested, tuple):
                    raise ValueError("columns parameter for multi-index data frame "
                                     "must be either a dictionary or list of tuples.")
                yield str(requested)
        else:
            if not isinstance(columns, collections.abc.Mapping):  # coverage: never true
                raise ValueError("columns parameter for multi-index data frame "
                                 "must be either a dictionary or list of tuples.")
            if not set(self.indexLevelNames).issuperset(columns.keys()):  # coverage: never true
                raise ValueError(f"Cannot use dict with keys {set(columns.keys())} "
                                 f"to select columns from {self.indexLevelNames}.")
            factors = [iterable(columns.get(level, self.columns.levels[i]))
                       for i, level in enumerate(self.indexLevelNames)]
            for requested in itertools.product(*factors):
                for i, value in enumerate(requested):
                    if value not in self.columns.levels[i]:  # coverage: never true
                        raise ValueError(f"Unrecognized value {value!r} "
                                         f"for index {self.indexLevelNames[i]!r}.")
                yield str(requested)
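    # A sketch of the dict expansion above (hypothetical level names and
    # values): with index levels ("band", "stat"), band in {"g", "r"} and
    # stat in {"flux", "err"}, the selection {"band": "g"} expands through
    # itertools.product to the flat names "('g', 'flux')" and "('g', 'err')".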

    def read(self, columns: Optional[Union[str, List[str], List[tuple],
                                           Dict[str, Union[str, List[str]]]]] = None) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded.  See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            The requested data as a Pandas DataFrame.
        """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            assert isinstance(columns, (dict, list))
            columns = list(self._standardizeColumnParameter(columns))
        else:
            if isinstance(columns, str):
                # A single column name; avoid iterating over its characters.
                columns = [columns]
            for column in columns:
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas()


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame) -> None:
    """Write a `pandas.DataFrame` instance as a Parquet file."""
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path, compression="none")
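# The loader above relies on the JSON metadata block that
# pa.Table.from_pandas attaches under the b"pandas" key; a quick way to
# inspect it (hypothetical path, a sketch rather than part of this module):
#
#     md = json.loads(pq.ParquetFile("/tmp/example.parq").metadata.metadata[b"pandas"])
#     print(md["column_indexes"])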

172 

class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from
    Parquet files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == "columns":
            return loader.columns

        if not self.fileDescriptor.parameters:
            return loader.read()

        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)

193 

194 def write(self, inMemoryDataset: Any) -> None: 

195 # Docstring inherited from Formatter.write. 

196 location = self.makeUpdatedLocation(self.fileDescriptor.location) 

197 _writeParquet(location.path, inMemoryDataset)