Coverage for tests/test_parquet.py: 34%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

55 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Tests for ParquetFormatter. 

23 

24Tests in this module are disabled unless pandas and pyarrow are importable. 

25""" 

26 

27import os 

28import unittest 

29 

30try: 

31 import numpy as np 

32 import pandas as pd 

33 import pyarrow.parquet 

34except ImportError: 

35 pyarrow = None 

36 

37from lsst.daf.butler import Butler, DatasetType 

38from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir 

39 

40 

# Absolute path of the directory holding this test file; used as the parent
# for the temporary butler repository roots created in setUp.
TESTDIR = os.path.abspath(os.path.dirname(__file__))

42 

43 

@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using local file datastore."""

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root)
        self.butler = Butler(self.root, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the temporary butler root created in setUp."""
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        """Round-trip a DataFrame whose columns are a plain single-level
        Index, including component and parameterized reads.
        """
        frame_in = pd.DataFrame(
            np.random.randn(5, 3),
            index=np.arange(5, dtype=int),
            columns=pd.Index(["a", "b", "c"]),
        )
        self.butler.put(frame_in, self.datasetType, dataId={})
        # Read the whole DataFrame.
        frame_out = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(frame_in.equals(frame_out))
        # Read just the column descriptions.
        columns_out = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(frame_in.columns.equals(columns_out))
        # Read just some columns a few different ways.
        subset_from_list = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}
        )
        self.assertTrue(frame_in.loc[:, ["a", "c"]].equals(subset_from_list))
        subset_from_scalar = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": "a"}
        )
        self.assertTrue(frame_in.loc[:, ["a"]].equals(subset_from_scalar))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a DataFrame whose columns are a two-level
        (filter, column) MultiIndex, including parameterized reads
        keyed by level name and by explicit tuples.
        """
        multi_columns = pd.MultiIndex.from_tuples(
            [(band, name) for band in ("g", "r") for name in ("a", "b", "c")],
            names=["filter", "column"],
        )
        frame_in = pd.DataFrame(
            np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=multi_columns
        )
        self.butler.put(frame_in, self.datasetType, dataId={})
        # Read the whole DataFrame.
        frame_out = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(frame_in.equals(frame_out))
        # Read just the column descriptions.
        columns_out = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(frame_in.columns.equals(columns_out))
        # Read just some columns a few different ways.
        by_filter = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}}
        )
        self.assertTrue(frame_in.loc[:, ["g"]].equals(by_filter))
        by_both_levels = self.butler.get(
            self.datasetType,
            dataId={},
            parameters={"columns": {"filter": ["r"], "column": "a"}},
        )
        self.assertTrue(frame_in.loc[:, [("r", "a")]].equals(by_both_levels))
        wanted = [("g", "a"), ("r", "c")]
        by_tuples = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": wanted}
        )
        self.assertTrue(frame_in.loc[:, wanted].equals(by_tuples))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

115 

116 

# Run the test suite when this file is executed directly.
# (Reconstructed: the coverage-report annotation that had been fused into
# this line made it syntactically invalid.)
if __name__ == "__main__":
    unittest.main()