Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

# This file is part of daf_butler. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (http://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

 

"""Tests for ParquetFormatter. 

 

Tests in this module are disabled unless pandas and pyarrow are importable. 

""" 

 

import os 

import unittest 

import tempfile 

import shutil 

 

try:
    # Optional dependencies: the whole test case below is skipped (via
    # skipUnless) when any of these cannot be imported.
    import numpy as np
    import pandas as pd
    import pyarrow.parquet
except ImportError:
    # Sentinel checked by the skipUnless decorator.  np and pd may be left
    # undefined here; that is safe because every test that uses them is
    # skipped in this situation.
    pyarrow = None

 

from lsst.daf.butler import Butler, DatasetType 

 

 

# Directory containing this test module; temporary butler repositories are
# created underneath it for each test.
TESTDIR = os.path.abspath(os.path.dirname(__file__))

 

 

@unittest.skipUnless(pyarrow is not None, "Cannot test ParquetFormatter without pyarrow.")
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using PosixDatastore.

    Each test writes a pandas DataFrame through the butler and reads it (or
    a projection of it) back, checking round-trip fidelity and the
    column-selection parameters.
    """

    def setUp(self):
        """Create a new butler root for each test."""
        # Seed the RNG so the random test frames — and therefore any
        # failures — are reproducible from run to run.
        np.random.seed(12345)
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        Butler.makeRepo(self.root)
        self.butler = Butler(self.root, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType("data", dimensions=(), storageClass="DataFrame",
                                       universe=self.butler.registry.dimensions)
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        """Remove the per-test butler repository.

        ignore_errors guards against a partially-written repo left behind by
        a failed test.
        """
        if os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)

    def testSingleIndexDataFrame(self):
        """Round-trip a DataFrame with a plain (single-level) column index."""
        columns1 = pd.Index(["a", "b", "c"])
        df1 = pd.DataFrame(np.random.randn(5, 3), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(f"{self.datasetType.name}.columns", dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a DataFrame with a two-level (MultiIndex) column index."""
        columns1 = pd.MultiIndex.from_tuples(
            [
                ("g", "a"),
                ("g", "b"),
                ("g", "c"),
                ("r", "a"),
                ("r", "b"),
                ("r", "c"),
            ],
            names=["filter", "column"],
        )
        df1 = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(f"{self.datasetType.name}.columns", dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways: selection by level
        # name, with scalar or list values for each level.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={},
                              parameters={"columns": {"filter": ["r"], "column": "a"}})
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

 

 

# Script entry point: run this module's tests directly.
# (The coverage-report annotation that was fused into this line in the
# scraped source has been removed — it made the file syntactically invalid.)
if __name__ == "__main__":
    unittest.main()