Coverage for tests/test_deduplication.py: 15%

61 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-06 10:42 +0000

1# This file is part of daf_relation. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24import unittest 

25 

26from lsst.daf.relation import ( 

27 ColumnExpression, 

28 Deduplication, 

29 EngineError, 

30 SortTerm, 

31 UnaryOperationRelation, 

32 iteration, 

33 tests, 

34) 

35 

36 

class DeduplicationTestCase(tests.RelationTestCase):
    """Tests for the Deduplication operation and relations based on it."""

    def setUp(self) -> None:
        """Create the column tags, engine, and leaf relation shared by tests.

        The leaf lives in an iteration engine named ``"preferred"``, has the
        single column ``a``, and holds three rows, two of which are identical.
        """
        self.a = tests.ColumnTag("a")
        self.b = tests.ColumnTag("b", is_key=False)
        self.engine = iteration.Engine(name="preferred")
        self.leaf = self.engine.make_leaf(
            {self.a}, payload=iteration.RowSequence([{self.a: 1}, {self.a: 0}, {self.a: 1}]), name="leaf"
        )

    def test_attributes(self) -> None:
        """Check that all UnaryOperation and Relation attributes have the
        expected values.
        """
        relation = self.leaf.without_duplicates()
        # Plain ``assert`` rather than ``assertIsInstance``; presumably kept so
        # static type checkers narrow the type for the attribute access below.
        assert isinstance(relation, UnaryOperationRelation)
        self.assertEqual(relation.columns, {self.a})
        self.assertEqual(relation.engine, self.engine)
        # At least one row is expected to remain; the upper bound is the same
        # as the leaf's.
        self.assertEqual(relation.min_rows, 1)
        self.assertEqual(relation.max_rows, self.leaf.max_rows)
        operation = relation.operation
        assert isinstance(operation, Deduplication)
        self.assertEqual(operation.columns_required, set())
        self.assertTrue(operation.is_empty_invariant)
        self.assertFalse(operation.is_count_invariant)
        self.assertFalse(operation.is_order_dependent)
        self.assertFalse(operation.is_count_dependent)

    def test_min_max_rows(self) -> None:
        """Test min and max rows for edge-case deduplications."""
        # No columns, non-empty input: exactly one row is expected.
        no_columns = self.leaf.with_only_columns(set()).without_duplicates()
        self.assertEqual(no_columns.min_rows, 1)
        self.assertEqual(no_columns.max_rows, 1)
        # Empty input: zero rows are expected, with or without columns.
        leaf0 = self.engine.make_leaf({self.a}, payload=iteration.RowSequence([]), name="leaf")
        empty = leaf0.without_duplicates()
        self.assertEqual(empty.min_rows, 0)
        self.assertEqual(empty.max_rows, 0)
        empty_no_columns = leaf0.with_only_columns([]).without_duplicates()
        self.assertEqual(empty_no_columns.min_rows, 0)
        self.assertEqual(empty_no_columns.max_rows, 0)

    def test_backtracking_apply(self) -> None:
        """Test apply logic that involves reordering operations in the existing
        tree to perform the new operation in a preferred engine.
        """
        new_engine = iteration.Engine(name="downstream")
        expression = ColumnExpression.reference(self.a)
        predicate = expression.lt(ColumnExpression.literal(20))
        # Apply a bunch of operations in a new engine that a Deduplication
        # should commute with.
        target = (
            self.leaf.transferred_to(new_engine)
            .with_calculated_column(self.b, expression)
            .with_rows_satisfying(predicate)
            .sorted([SortTerm(ColumnExpression.reference(self.a))])
        )
        # Apply a new Deduplication with backtracking and see that it appears
        # before the transfer to the new engine, with the calculated column,
        # row filter, and sort reapplied downstream of it.
        relation = target.without_duplicates(preferred_engine=self.engine, require_preferred_engine=True)
        self.assert_relations_equal(
            relation,
            (
                self.leaf.without_duplicates()
                .transferred_to(new_engine)
                .with_calculated_column(self.b, expression)
                .with_rows_satisfying(predicate)
                .sorted([SortTerm(ColumnExpression.reference(self.a))])
            ),
        )

    def test_no_backtracking(self) -> None:
        """Test apply logic that handles preferred engines without reordering
        operations in the existing tree.
        """
        new_engine = iteration.Engine(name="downstream")
        # Construct a relation tree we can't reorder when inserting a
        # Deduplication, because there is a locked Materialization in the way.
        target = self.leaf.transferred_to(new_engine).materialized("lock")
        # Preferred engine is ignored if we can't backtrack and don't enable
        # anything else.
        self.assert_relations_equal(
            target.without_duplicates(preferred_engine=self.engine),
            target.without_duplicates(),
        )
        # We can force this to be an error.
        with self.assertRaises(EngineError):
            target.without_duplicates(preferred_engine=self.engine, require_preferred_engine=True)
        # We can also automatically transfer (back) to the preferred engine.
        self.assert_relations_equal(
            target.without_duplicates(preferred_engine=self.engine, transfer=True),
            target.transferred_to(self.engine).without_duplicates(),
        )
        # Now try a few other ways of making backtrack fail.  In each case the
        # relation is built *outside* the assertRaises context, so that only
        # the Deduplication itself can satisfy the expected EngineError; an
        # error while constructing the fixture must fail the test instead.
        #
        # Deduplication does not commute with Projection.
        projected = (
            self.engine.make_leaf(
                {self.a, self.b},
                payload=iteration.RowSequence([{self.a: 0, self.b: 0}, {self.a: 0, self.b: 1}]),
                name="leaf",
            )
            .transferred_to(new_engine)
            .with_only_columns({self.a})
        )
        with self.assertRaises(EngineError):
            projected.without_duplicates(preferred_engine=self.engine, require_preferred_engine=True)
        # Deduplication does not commute with Slice.
        sliced = self.leaf.transferred_to(new_engine)[:1]
        with self.assertRaises(EngineError):
            sliced.without_duplicates(preferred_engine=self.engine, require_preferred_engine=True)
        # Deduplication cannot be inserted past Chains or Joins
        # (at least not without more information than we have, like whether
        # Chain branches are disjoint or leaf relations start out with unique
        # rows).
        chained = self.leaf.transferred_to(new_engine).chain(
            new_engine.make_leaf(
                {self.a},
                payload=iteration.RowSequence([{self.a: 0}]),
                name="chain_leaf",
            )
        )
        with self.assertRaises(EngineError):
            chained.without_duplicates(preferred_engine=self.engine, require_preferred_engine=True)
        joined = self.leaf.transferred_to(new_engine).join(
            new_engine.make_leaf(
                {self.a},
                payload=iteration.RowSequence([{self.a: 0}]),
                name="join_leaf",
            )
        )
        with self.assertRaises(EngineError):
            joined.without_duplicates(preferred_engine=self.engine, require_preferred_engine=True)

    def test_iteration(self) -> None:
        """Test Deduplication execution in the iteration engine."""
        relation = self.leaf.without_duplicates()
        # The leaf rows are a=1, a=0, a=1; the expected result keeps one row
        # per distinct value, in order of first appearance.
        self.assertEqual(
            list(self.engine.execute(relation)),
            [{self.a: 1}, {self.a: 0}],
        )

    def test_str(self) -> None:
        """Test str(Deduplication) and
        str(UnaryOperationRelation[Deduplication]).
        """
        relation = self.leaf.without_duplicates()
        self.assertEqual(str(relation), "deduplicate(leaf)")

180 

181 

if __name__ == "__main__":
    # Run this module's test cases when the file is executed as a script.
    unittest.main()