Run ruff format in CI (#837) · PhVHoang/datafusion-python@766e2ed · GitHub

Commit 766e2ed

Run ruff format in CI (apache#837)

* Run ruff format in CI
* Add --check parameter
* Apply ruff format
1 parent 22c70ef commit 766e2ed

7 files changed, +251 -153 lines changed


.github/workflows/build.yml

Lines changed: 3 additions & 1 deletion
@@ -38,7 +38,9 @@ jobs:
           pip install ruff
       # Update output format to enable automatic inline annotations.
       - name: Run Ruff
-        run: ruff check --output-format=github python/
+        run: |
+          ruff check --output-format=github python/
+          ruff format --check python/

   generate-license:
     runs-on: ubuntu-latest
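
The reworked step fails the job if either command reports a problem: ruff check flags lint violations (the GitHub output format turns them into inline annotations), while ruff format --check only verifies formatting and rewrites nothing. The same two checks can be reproduced locally before pushing; the following is a minimal Python sketch (not part of this commit), assuming ruff is installed in the active environment and the script is run from the repository root.

import subprocess

# Run the same two commands as the CI step; check=True stops at the first
# non-zero exit code, mirroring a failing CI job.
for args in (
    ["ruff", "check", "--output-format=github", "python/"],
    ["ruff", "format", "--check", "python/"],
):
    subprocess.run(args, check=True)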

python/datafusion/functions.py

Lines changed: 7 additions & 2 deletions
@@ -1479,12 +1479,17 @@ def approx_percentile_cont(
     """Returns the value that is approximately at a given percentile of ``expr``."""
     if num_centroids is None:
         return Expr(
-            f.approx_percentile_cont(expression.expr, percentile.expr, distinct=distinct, num_centroids=None)
+            f.approx_percentile_cont(
+                expression.expr, percentile.expr, distinct=distinct, num_centroids=None
+            )
         )

     return Expr(
         f.approx_percentile_cont(
-            expression.expr, percentile.expr, distinct=distinct, num_centroids=num_centroids.expr
+            expression.expr,
+            percentile.expr,
+            distinct=distinct,
+            num_centroids=num_centroids.expr,
         )
     )
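
The wrapper's behavior is unchanged; ruff only re-wraps the long calls. For orientation, here is a short usage sketch of the public API, modeled on the test_aggregation.py cases in this same commit; the in-memory batch below is illustrative sample data, not taken from the repository.

import pyarrow as pa
from datafusion import SessionContext, column, lit
from datafusion import functions as f

ctx = SessionContext()
# Tiny single-column table "b" (illustrative values).
batch = pa.RecordBatch.from_arrays([pa.array([4, 4, 6])], names=["b"])
df = ctx.create_dataframe([[batch]])

# Approximate 50th percentile of "b"; num_centroids is left at its default.
result = df.aggregate([], [f.approx_percentile_cont(column("b"), lit(0.5))]).collect()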

python/datafusion/tests/test_aggregation.py

Lines changed: 62 additions & 43 deletions
@@ -39,56 +39,74 @@ def df():
     )
     return ctx.create_dataframe([[batch]])

+
 @pytest.fixture
 def df_aggregate_100():
     ctx = SessionContext()
     ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv")
     return ctx.table("aggregate_test_data")


-@pytest.mark.parametrize("agg_expr, calc_expected", [
-    (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))),
-    (f.corr(column("a"), column("b")), lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1])),
-    (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])),
-    # Sample (co)variance -> ddof=1
-    # Population (co)variance -> ddof=0
-    (f.covar(column("a"), column("b")), lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1])),
-    (f.covar_pop(column("a"), column("c")), lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1])),
-    (f.covar_samp(column("b"), column("c")), lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1])),
-    # f.grouping(col_a), # No physical plan implemented yet
-    (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))),
-    (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))),
-    (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))),
-    (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))),
-    (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))),
-    # Sample stdev -> ddof=1
-    # Population stdev -> ddof=0
-    (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))),
-    (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))),
-    (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))),
-    (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))),
-    (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))),
-    (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))),
-])
+@pytest.mark.parametrize(
+    "agg_expr, calc_expected",
+    [
+        (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))),
+        (
+            f.corr(column("a"), column("b")),
+            lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1]),
+        ),
+        (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])),
+        # Sample (co)variance -> ddof=1
+        # Population (co)variance -> ddof=0
+        (
+            f.covar(column("a"), column("b")),
+            lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1]),
+        ),
+        (
+            f.covar_pop(column("a"), column("c")),
+            lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1]),
+        ),
+        (
+            f.covar_samp(column("b"), column("c")),
+            lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1]),
+        ),
+        # f.grouping(col_a), # No physical plan implemented yet
+        (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))),
+        (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))),
+        (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))),
+        (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))),
+        (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))),
+        # Sample stdev -> ddof=1
+        # Population stdev -> ddof=0
+        (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))),
+        (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))),
+        (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))),
+        (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))),
+        (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))),
+        (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))),
+    ],
+)
 def test_aggregation_stats(df, agg_expr, calc_expected):
-
     agg_df = df.aggregate([], [agg_expr])
     result = agg_df.collect()[0]
     values_a, values_b, values_c, values_d = df.collect()[0]
     expected = calc_expected(values_a, values_b, values_c, values_d)
     np.testing.assert_array_almost_equal(result.column(0), expected)


-@pytest.mark.parametrize("agg_expr, expected", [
-    (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())),
-    (f.approx_median(column("b")), pa.array([4])),
-    (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])),
-    (
-        f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)),
-        pa.array([6], type=pa.float64())
-    ),
-    (f.array_agg(column("b")), pa.array([[4, 4, 6]])),
-])
+@pytest.mark.parametrize(
+    "agg_expr, expected",
+    [
+        (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())),
+        (f.approx_median(column("b")), pa.array([4])),
+        (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])),
+        (
+            f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)),
+            pa.array([6], type=pa.float64()),
+        ),
+        (f.array_agg(column("b")), pa.array([[4, 4, 6]])),
+    ],
+)
 def test_aggregation(df, agg_expr, expected):
     agg_df = df.aggregate([], [agg_expr])
     result = agg_df.collect()[0]
@@ -98,20 +116,21 @@ def test_aggregation(df, agg_expr, expected):
 def test_aggregate_100(df_aggregate_100):
     # https://github.com/apache/datafusion/blob/bddb6415a50746d2803dd908d19c3758952d74f9/datafusion/sqllogictest/test_files/aggregate.slt#L1490-L1498

-    result = df_aggregate_100.aggregate(
-        [
-            column("c1")
-        ],
-        [
-            f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3")
-        ]
-    ).sort(column("c1").sort(ascending=True)).collect()
+    result = (
+        df_aggregate_100.aggregate(
+            [column("c1")],
+            [f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3")],
+        )
+        .sort(column("c1").sort(ascending=True))
+        .collect()
+    )

     assert len(result) == 1
     result = result[0]
     assert result.column("c1") == pa.array(["a", "b", "c", "d", "e"])
     assert result.column("c3") == pa.array([73, 68, 122, 124, 115])

+
 def test_bit_add_or_xor(df):
     df = df.aggregate(
         [],
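
The ddof comments preserved by the formatter document the convention behind these expectations: sample statistics map to numpy's ddof=1 (divide by n - 1) and population statistics to ddof=0 (divide by n). A quick standalone numpy illustration of the distinction, with arbitrary values rather than the test data:

import numpy as np

a = np.array([4.0, 5.0, 6.0])
# Squared deviations from the mean sum to 2.0 for these values, so:
assert np.isclose(np.var(a, ddof=0), 2.0 / 3.0)  # population variance: divide by n
assert np.isclose(np.var(a, ddof=1), 1.0)        # sample variance: divide by n - 1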

python/datafusion/tests/test_dataframe.py

Lines changed: 50 additions & 40 deletions
@@ -279,57 +279,67 @@ def test_distinct():


 data_test_window_functions = [
-    ("row", f.window("row_number", [], order_by=[f.order_by(column("c"))]), [2, 1, 3]),
-    ("rank", f.window("rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2]),
-    ("dense_rank", f.window("dense_rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2] ),
-    ("percent_rank", f.window("percent_rank", [], order_by=[f.order_by(column("c"))]), [0.5, 0, 0.5]),
-    ("cume_dist", f.window("cume_dist", [], order_by=[f.order_by(column("b"))]), [0.3333333333333333, 0.6666666666666666, 1.0]),
-    ("ntile", f.window("ntile", [literal(2)], order_by=[f.order_by(column("c"))]), [1, 1, 2]),
-    ("next", f.window("lead", [column("b")], order_by=[f.order_by(column("b"))]), [5, 6, None]),
-    ("previous", f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]), [None, 4, 5]),
-    pytest.param(
-        "first_value",
-        f.window(
-            "first_value",
-            [column("a")],
-            order_by=[f.order_by(column("b"))]
-        ),
-        [1, 1, 1],
-    ),
-    pytest.param(
-        "last_value",
-        f.window("last_value", [column("b")], order_by=[f.order_by(column("b"))]),
-        [4, 5, 6],
-    ),
-    pytest.param(
-        "2nd_value",
-        f.window(
-            "nth_value",
-            [column("b"), literal(2)],
-            order_by=[f.order_by(column("b"))],
-        ),
-        [None, 5, 5],
-    ),
+    ("row", f.window("row_number", [], order_by=[f.order_by(column("c"))]), [2, 1, 3]),
+    ("rank", f.window("rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2]),
+    (
+        "dense_rank",
+        f.window("dense_rank", [], order_by=[f.order_by(column("c"))]),
+        [2, 1, 2],
+    ),
+    (
+        "percent_rank",
+        f.window("percent_rank", [], order_by=[f.order_by(column("c"))]),
+        [0.5, 0, 0.5],
+    ),
+    (
+        "cume_dist",
+        f.window("cume_dist", [], order_by=[f.order_by(column("b"))]),
+        [0.3333333333333333, 0.6666666666666666, 1.0],
+    ),
+    (
+        "ntile",
+        f.window("ntile", [literal(2)], order_by=[f.order_by(column("c"))]),
+        [1, 1, 2],
+    ),
+    (
+        "next",
+        f.window("lead", [column("b")], order_by=[f.order_by(column("b"))]),
+        [5, 6, None],
+    ),
+    (
+        "previous",
+        f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]),
+        [None, 4, 5],
+    ),
+    pytest.param(
+        "first_value",
+        f.window("first_value", [column("a")], order_by=[f.order_by(column("b"))]),
+        [1, 1, 1],
+    ),
+    pytest.param(
+        "last_value",
+        f.window("last_value", [column("b")], order_by=[f.order_by(column("b"))]),
+        [4, 5, 6],
+    ),
+    pytest.param(
+        "2nd_value",
+        f.window(
+            "nth_value",
+            [column("b"), literal(2)],
+            order_by=[f.order_by(column("b"))],
+        ),
+        [None, 5, 5],
+    ),
 ]


 @pytest.mark.parametrize("name,expr,result", data_test_window_functions)
 def test_window_functions(df, name, expr, result):
-    df = df.select(
-        column("a"),
-        column("b"),
-        column("c"),
-        f.alias(expr, name)
-    )
+    df = df.select(column("a"), column("b"), column("c"), f.alias(expr, name))

     table = pa.Table.from_batches(df.collect())

-    expected = {
-        "a": [1, 2, 3],
-        "b": [4, 5, 6],
-        "c": [8, 5, 8],
-        name: result
-    }
+    expected = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8], name: result}

     assert table.sort_by("a").to_pydict() == expected

python/datafusion/tests/test_expr.py

Lines changed: 8 additions & 6 deletions
@@ -146,24 +146,26 @@ def test_expr_to_variant():
     from datafusion import SessionContext
     from datafusion.expr import Filter

-
     def traverse_logical_plan(plan):
         cur_node = plan.to_variant()
         if isinstance(cur_node, Filter):
             return cur_node.predicate().to_variant()
-        if hasattr(plan, 'inputs'):
+        if hasattr(plan, "inputs"):
             for input_plan in plan.inputs():
                 res = traverse_logical_plan(input_plan)
                 if res is not None:
                     return res

     ctx = SessionContext()
-    data = {'id': [1, 2, 3], 'name': ['Alice', 'Bob', 'Charlie']}
-    ctx.from_pydict(data, name='table1')
+    data = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
+    ctx.from_pydict(data, name="table1")
     query = "SELECT * FROM table1 t1 WHERE t1.name IN ('dfa', 'ad', 'dfre', 'vsa')"
     logical_plan = ctx.sql(query).optimized_logical_plan()
     variant = traverse_logical_plan(logical_plan)
     assert variant is not None
-    assert variant.expr().to_variant().qualified_name() == 'table1.name'
-    assert str(variant.list()) == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]'
+    assert variant.expr().to_variant().qualified_name() == "table1.name"
+    assert (
+        str(variant.list())
+        == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]'
+    )
     assert not variant.negated()

0 commit comments
