Coverage for tests/test_parquet.py: 32%

55 statements  

« prev     ^ index     » next       coverage.py v6.4.1, created at 2022-07-03 01:08 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Tests for ParquetFormatter. 

23 

24Tests in this module are disabled unless pandas and pyarrow are importable. 

25""" 

26 

27import os 

28import unittest 

29 

try:
    import numpy as np
    import pandas as pd
    import pyarrow.parquet
except ImportError:
    # Sentinel consulted by the skipUnless decorator on the test case below:
    # if any optional dependency (numpy, pandas, or pyarrow) is missing,
    # pyarrow is None and every test in this module is skipped.
    pyarrow = None

36 

37from lsst.daf.butler import Butler, DatasetType 

38from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir 

39 

# Absolute path of the directory containing this test file; used as the
# parent directory for the per-test temporary butler roots.
TESTDIR = os.path.abspath(os.path.dirname(__file__))

41 

42 

@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using local file datastore."""

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root)
        self.butler = Butler(self.root, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)
        # Seeded generator so the random test data -- and therefore any
        # failure -- is reproducible from run to run.
        self.rng = np.random.default_rng(12345)

    def tearDown(self):
        """Remove the temporary butler root created in setUp."""
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        """Round-trip a DataFrame with a plain single-level column index."""
        columns1 = pd.Index(["a", "b", "c"])
        df1 = pd.DataFrame(
            self.rng.standard_normal((5, 3)), index=np.arange(5, dtype=int), columns=columns1
        )
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways: a list of names ...
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        # ... and a single bare name (still yields a DataFrame, not a Series).
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a DataFrame with a two-level (filter, column) MultiIndex."""
        columns1 = pd.MultiIndex.from_tuples(
            [
                ("g", "a"),
                ("g", "b"),
                ("g", "c"),
                ("r", "a"),
                ("r", "b"),
                ("r", "c"),
            ],
            names=["filter", "column"],
        )
        df1 = pd.DataFrame(
            self.rng.standard_normal((5, 6)), index=np.arange(5, dtype=int), columns=columns1
        )
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways: a dict selecting one
        # value of the first level ...
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        # ... a dict constraining both levels ...
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        # ... and an explicit list of full column tuples.
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

114 

115 

# Allow running this test module directly (e.g. ``python test_parquet.py``).
if __name__ == "__main__":
    unittest.main()