@@ -39,56 +39,74 @@ def df():
39
39
)
40
40
return ctx .create_dataframe ([[batch ]])
41
41
42
+
42
43
@pytest.fixture
def df_aggregate_100():
    """Return a DataFrame over the standard 100-row aggregate test CSV."""
    ctx = SessionContext()
    csv_path = "./testing/data/csv/aggregate_test_100.csv"
    ctx.register_csv("aggregate_test_data", csv_path)
    return ctx.table("aggregate_test_data")
47
48
48
49
49
@pytest.mark.parametrize(
    "agg_expr, calc_expected",
    [
        (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))),
        (
            f.corr(column("a"), column("b")),
            lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1]),
        ),
        (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])),
        # Sample (co)variance -> ddof=1
        # Population (co)variance -> ddof=0
        (
            f.covar(column("a"), column("b")),
            lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1]),
        ),
        (
            f.covar_pop(column("a"), column("c")),
            lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1]),
        ),
        (
            f.covar_samp(column("b"), column("c")),
            lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1]),
        ),
        # f.grouping(col_a), # No physical plan implemented yet
        (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))),
        (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))),
        (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))),
        (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))),
        (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))),
        # Sample stdev -> ddof=1
        # Population stdev -> ddof=0
        (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))),
        (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))),
        (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))),
        (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))),
        (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))),
        (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))),
    ],
)
def test_aggregation_stats(df, agg_expr, calc_expected):
    """Check each statistical aggregate against its NumPy reference result.

    Each parametrized case pairs a DataFusion aggregate expression with a
    callable computing the expected value from the raw column arrays of the
    `df` fixture's single record batch.
    """
    agg_df = df.aggregate([], [agg_expr])
    result = agg_df.collect()[0]
    # Unpack the four source columns so the expected-value lambda can use them.
    values_a, values_b, values_c, values_d = df.collect()[0]
    expected = calc_expected(values_a, values_b, values_c, values_d)
    np.testing.assert_array_almost_equal(result.column(0), expected)
80
95
81
96
82
- @pytest .mark .parametrize ("agg_expr, expected" , [
83
- (f .approx_distinct (column ("b" )), pa .array ([2 ], type = pa .uint64 ())),
84
- (f .approx_median (column ("b" )), pa .array ([4 ])),
85
- (f .approx_percentile_cont (column ("b" ), lit (0.5 )), pa .array ([4 ])),
86
- (
87
- f .approx_percentile_cont_with_weight (column ("b" ), lit (0.6 ), lit (0.5 )),
88
- pa .array ([6 ], type = pa .float64 ())
89
- ),
90
- (f .array_agg (column ("b" )), pa .array ([[4 , 4 , 6 ]])),
91
- ])
97
+ @pytest .mark .parametrize (
98
+ "agg_expr, expected" ,
99
+ [
100
+ (f .approx_distinct (column ("b" )), pa .array ([2 ], type = pa .uint64 ())),
101
+ (f .approx_median (column ("b" )), pa .array ([4 ])),
102
+ (f .approx_percentile_cont (column ("b" ), lit (0.5 )), pa .array ([4 ])),
103
+ (
104
+ f .approx_percentile_cont_with_weight (column ("b" ), lit (0.6 ), lit (0.5 )),
105
+ pa .array ([6 ], type = pa .float64 ()),
106
+ ),
107
+ (f .array_agg (column ("b" )), pa .array ([[4 , 4 , 6 ]])),
108
+ ],
109
+ )
92
110
def test_aggregation (df , agg_expr , expected ):
93
111
agg_df = df .aggregate ([], [agg_expr ])
94
112
result = agg_df .collect ()[0 ]
@@ -98,20 +116,21 @@ def test_aggregation(df, agg_expr, expected):
98
116
def test_aggregate_100(df_aggregate_100):
    """Grouped approx_percentile_cont over the 100-row fixture table.

    Mirrors the upstream sqllogictest expectations:
    https://github.com/apache/datafusion/blob/bddb6415a50746d2803dd908d19c3758952d74f9/datafusion/sqllogictest/test_files/aggregate.slt#L1490-L1498
    """
    percentile_c3 = f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias(
        "c3"
    )
    result = (
        df_aggregate_100.aggregate([column("c1")], [percentile_c3])
        .sort(column("c1").sort(ascending=True))
        .collect()
    )

    # A single record batch is expected, sorted by the group key.
    assert len(result) == 1
    batch = result[0]
    assert batch.column("c1") == pa.array(["a", "b", "c", "d", "e"])
    assert batch.column("c3") == pa.array([73, 68, 122, 124, 115])
114
132
133
+
115
134
def test_bit_add_or_xor (df ):
116
135
df = df .aggregate (
117
136
[],
0 commit comments