Coverage for python/lsst/daf/butler/formatters/parquet.py: 95%

70 statements  

coverage.py v6.4.4, created at 2022-08-19 12:04 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ParquetFormatter",)

import collections.abc
import itertools
import json
import re
from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Union

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from lsst.daf.butler import Formatter
from lsst.utils.iteration import ensure_iterable


class _ParquetLoader:
    """Helper class for loading Parquet files into `pandas.DataFrame`
    instances.

    Parameters
    ----------
    path : `str`
        Full path to the file to be loaded.
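
    Examples
    --------
    Illustrative sketch only (the path and column names are
    hypothetical)::

        loader = _ParquetLoader("/path/to/table.parq")
        df = loader.read(columns=["a", "b"])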

    """

    def __init__(self, path: str):
        self.file = pq.ParquetFile(path)
        self.md = json.loads(self.file.metadata.metadata[b"pandas"])
        indexes = self.md["column_indexes"]
        if len(indexes) <= 1:
            self.columns = pd.Index(
                name for name in self.file.metadata.schema.names if not name.startswith("__")
            )
        else:
            raw_columns = list(self._splitColumnNames(len(indexes), self.file.metadata.schema.names))
            self.columns = pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
        self.indexLevelNames = tuple(self.columns.names)

    @staticmethod
    def _splitColumnNames(n: int, names: Iterable[str]) -> Iterator[Sequence[str]]:
        """Split flat strings that represent multi-index column names.

        PyArrow maps Pandas' multi-index column names (which are tuples
        in Python) to flat strings on disk. This routine exists to
        reconstruct the original tuples.

        Parameters
        ----------
        n : `int`
            Number of levels in the `pd.MultiIndex` that is being
            reconstructed.
        names : `~collections.abc.Iterable` of `str`
            Strings to be split.

        Yields
        ------
        tuple : `tuple` of `str`
            A multi-index column name tuple.
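
        Examples
        --------
        Illustrative only (the column names here are made up): a flat
        name written for a two-level index is split back into its tuple,
        and names that do not match the pattern are skipped.

        >>> list(_ParquetLoader._splitColumnNames(2, ["('filter', 'g')", "__index__"]))
        [('filter', 'g')]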

        """
        pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))
        for name in names:
            m = re.search(pattern, name)
            if m is not None:
                yield m.groups()

    def _standardizeColumnParameter(
        self, columns: Union[List[str], List[tuple], Dict[str, Union[str, List[str]]]]
    ) -> Iterator[str]:
        """Transform a list of tuples or a dictionary that describes
        multi-index columns into the flat strings understood by PyArrow.

        Parameters
        ----------
        columns : `list` or `dict`
            List of multi-index column name tuples, or a dictionary whose
            keys are string multi-index level names and whose values are
            the value or values (as a list) for that level.

        Yields
        ------
        name : `str`
            Stringified tuple representing a multi-index column name.
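
        Examples
        --------
        Illustrative only, assuming a two-level index with levels named
        ``filter`` and ``column``: passing
        ``columns={"filter": ["g", "r"], "column": "psfFlux"}`` would
        yield the flat names ``"('g', 'psfFlux')"`` and
        ``"('r', 'psfFlux')"``.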

        """
        if isinstance(columns, list):
            for requested in columns:
                if not isinstance(requested, tuple):
                    raise ValueError(
                        "columns parameter for multi-index data frame "
                        "must be either a dictionary or list of tuples."
                    )
                yield str(requested)
        else:
            # coverage: this condition was never true in the test suite
            if not isinstance(columns, collections.abc.Mapping):
                raise ValueError(
                    "columns parameter for multi-index data frame "
                    "must be either a dictionary or list of tuples."
                )
            # coverage: this condition was never true in the test suite
            if not set(self.indexLevelNames).issuperset(columns.keys()):
                raise ValueError(
                    f"Cannot use dict with keys {set(columns.keys())} "
                    f"to select columns from {self.indexLevelNames}."
                )
            factors = [
                ensure_iterable(columns.get(level, self.columns.levels[i]))
                for i, level in enumerate(self.indexLevelNames)
            ]
            for requested in itertools.product(*factors):
                for i, value in enumerate(requested):
                    # coverage: this condition was never true in the test suite
                    if value not in self.columns.levels[i]:
                        raise ValueError(
                            f"Unrecognized value {value!r} for index {self.indexLevelNames[i]!r}."
                        )
                yield str(requested)

    def read(
        self, columns: Optional[Union[str, List[str], List[tuple], Dict[str, Union[str, List[str]]]]] = None
    ) -> pd.DataFrame:
        """Read some or all of the Parquet file into a `pandas.DataFrame`
        instance.

        Parameters
        ----------
        columns : `dict`, `list`, or `str`, optional
            A description of the columns to be loaded. See
            :ref:`lsst.daf.butler-concrete_storage_classes_dataframe`.

        Returns
        -------
        df : `pandas.DataFrame`
            A Pandas DataFrame.
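
        Examples
        --------
        Illustrative only: with a plain column index, pass names
        directly, e.g. ``loader.read(columns=["a", "b"])``; with a
        multi-index, pass a dictionary of level values, e.g.
        ``loader.read(columns={"filter": ["g", "r"]})``.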

        """
        if columns is None:
            return self.file.read(use_pandas_metadata=True).to_pandas()
        elif isinstance(self.columns, pd.MultiIndex):
            assert isinstance(columns, (dict, list))
            columns = list(self._standardizeColumnParameter(columns))
        else:
            for column in columns:
                if column not in self.columns:
                    raise ValueError(f"Unrecognized column name {column!r}.")
        return self.file.read(columns=columns, use_pandas_metadata=True).to_pandas()


def _writeParquet(path: str, inMemoryDataset: pd.DataFrame) -> None:
    """Write a `pandas.DataFrame` instance as a Parquet file."""
    table = pa.Table.from_pandas(inMemoryDataset)
    pq.write_table(table, path)


class ParquetFormatter(Formatter):
    """Interface for reading and writing Pandas DataFrames to and from
    Parquet files.

    This formatter is for the
    :ref:`lsst.daf.butler-concrete_storage_classes_dataframe` StorageClass.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        loader = _ParquetLoader(self.fileDescriptor.location.path)
        if component == "columns":
            return loader.columns

        if not self.fileDescriptor.parameters:
            return loader.read()

        return loader.read(**self.fileDescriptor.parameters)

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        location = self.makeUpdatedLocation(self.fileDescriptor.location)
        _writeParquet(location.path, inMemoryDataset)

197 _writeParquet(location.path, inMemoryDataset)