Coverage for tests/test_parquet.py: 27%

109 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-09-30 02:19 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Tests for ParquetFormatter. 

23 

24Tests in this module are disabled unless pandas and pyarrow are importable. 

25""" 

26 

27import os 

28import unittest 

29 

30try: 

31 import numpy as np 

32 import pandas as pd 

33except ImportError: 

34 pd = None 

35 

36try: 

37 import pyarrow.parquet 

38except ImportError: 

39 pyarrow = None 

40 

41from lsst.daf.butler import Butler, Config, DatasetType, StorageClassConfig, StorageClassFactory 

42from lsst.daf.butler.delegates.dataframe import DataFrameDelegate 

43from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir 

44 

45TESTDIR = os.path.abspath(os.path.dirname(__file__)) 

46 

47 

48def _makeSingleIndexDataFrame(): 

49 """Make a single index data frame for testing. 

50 

51 Returns 

52 ------- 

53 dataFrame : `~pandas.DataFrame` 

54 The test dataframe. 

55 allColumns : `list` [`str`] 

56 List of all the columns (including index columns). 

57 """ 

58 nrow = 5 

59 data = np.zeros(nrow, dtype=[("index", "i4"), ("a", "f8"), ("b", "f8"), ("c", "f8"), ("ddd", "f8")]) 

60 data["index"][:] = np.arange(nrow) 

61 data["a"] = np.random.randn(nrow) 

62 data["b"] = np.random.randn(nrow) 

63 data["c"] = np.random.randn(nrow) 

64 data["ddd"] = np.random.randn(nrow) 

65 df = pd.DataFrame(data) 

66 df = df.set_index("index") 

67 allColumns = df.columns.append(pd.Index(df.index.names)) 

68 

69 return df, allColumns 

70 

71 

72def _makeMultiIndexDataFrame(): 

73 """Make a multi-index data frame for testing. 

74 

75 Returns 

76 ------- 

77 dataFrame : `~pandas.DataFrame` 

78 The test dataframe. 

79 """ 

80 columns = pd.MultiIndex.from_tuples( 

81 [ 

82 ("g", "a"), 

83 ("g", "b"), 

84 ("g", "c"), 

85 ("r", "a"), 

86 ("r", "b"), 

87 ("r", "c"), 

88 ], 

89 names=["filter", "column"], 

90 ) 

91 df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns) 

92 

93 return df 

94 

95 

@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatter without pandas.")
@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using local file datastore."""

    # Butler repository configuration used by setUp.
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # A dimensionless dataset type avoids any need to insert
        # dimension records or define data IDs for these tests.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the per-test butler root."""
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        original, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(original, self.datasetType, dataId={})
        # Round-trip the full dataframe.
        roundTripped = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(original.equals(roundTripped))
        # The "columns" component reports every column, index included.
        readColumns = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(readColumns))
        # Column subsets requested through the "columns" parameter,
        # exercising list, scalar, and index-inclusive forms.
        pair = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(original.loc[:, ["a", "c"]].equals(pair))
        scalar = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(original.loc[:, ["a"]].equals(scalar))
        withIndex = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(original.loc[:, ["a"]].equals(withIndex))
        longName = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(original.loc[:, ["ddd"]].equals(longName))
        # Requesting an unknown column must raise ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        original = _makeMultiIndexDataFrame()

        self.butler.put(original, self.datasetType, dataId={})
        # Round-trip the full dataframe.
        roundTripped = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(original.equals(roundTripped))
        # The "columns" component must match the multi-index exactly.
        readColumns = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(original.columns.equals(readColumns))
        # Column subsets via dict-of-levels and list-of-tuples forms.
        byFilter = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(original.loc[:, ["g"]].equals(byFilter))
        byBoth = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(original.loc[:, [("r", "a")]].equals(byBoth))
        column_list = [("g", "a"), ("r", "c")]
        byTuples = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(original.loc[:, column_list].equals(byTuples))
        # Requesting an unknown column must raise ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

164 

165 

@unittest.skipUnless(pd is not None, "Cannot test parquet InMemoryDatastore without pandas.")
class InMemoryParquetFormatterTestCase(ParquetFormatterTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate"""

    # Override the base-class repository with the in-memory datastore config.
    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        frame = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # With no parameters the delegate returns the dataframe unchanged.
        result = delegate.handleParameters(inMemoryDataset=frame)
        self.assertTrue(frame.equals(result))
        # The "columns" component mirrors the dataframe's column index.
        component = delegate.getComponent(composite=frame, componentName="columns")
        self.assertTrue(frame.columns.equals(component))

        # Multi-index column selections are not supported in memory;
        # both dict forms must raise with an explanatory message.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=frame, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=frame, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testBadInput(self):
        delegate = DataFrameDelegate("DataFrame")

        # Anything that is not a DataFrame must be rejected.
        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

    def testStorageClass(self):
        frame, _ = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        # The lookup must resolve to "DataFrame" whether or not type
        # comparison is requested; same sequence as the two original stanzas.
        for compare_types in (False, True):
            storageClass = factory.findStorageClass(type(frame), compare_types=compare_types)
            # Force the name lookup to do name matching.
            storageClass._pytype = None
            self.assertEqual(storageClass.name, "DataFrame")

215 

216 

# Fix: the coverage-report branch annotation ("217 ↛ 218line 217 didn't
# jump to line 218...") was fused onto this line, making it invalid
# Python; restore the plain script entry point.
if __name__ == "__main__":
    unittest.main()