feat: add `head`, `tail` methods (#915) · vectorlink-ai/datafusion-python@f59dd08 · GitHub
[go: up one dir, main page]

Skip to content

Commit f59dd08

Browse files
authored
feat: add head, tail methods (apache#915)
* feat: add head, tail methods * chore: add default head/tail
1 parent 70c099a commit f59dd08

File tree

2 files changed

+47
-0
lines changed

2 files changed

+47
-0
lines changed

python/datafusion/dataframe.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,31 @@ def limit(self, count: int, offset: int = 0) -> DataFrame:
292292
"""
293293
return DataFrame(self.df.limit(count, offset))
294294

295+
def head(self, n: int = 5) -> DataFrame:
    """Return a new :py:class:`DataFrame` holding the leading rows.

    This applies the underlying ``limit`` with an offset of zero.

    Args:
        n: Number of rows to take from the head of the DataFrame.

    Returns:
        DataFrame after limiting.
    """
    limited = self.df.limit(n, 0)
    return DataFrame(limited)
305+
306+
def tail(self, n: int = 5) -> DataFrame:
    """Return a new :py:class:`DataFrame` holding the trailing rows.

    Be aware that this can be expensive: the total number of rows has to be
    known to compute the starting offset, so ``count()`` is called on this
    DataFrame before the limit is applied.

    Args:
        n: Number of rows to take from the tail of the DataFrame.

    Returns:
        DataFrame after limiting.
    """
    total_rows = self.count()
    # Clamp at zero so that requesting more rows than exist never produces a
    # negative offset.
    start = max(0, total_rows - n)
    return DataFrame(self.df.limit(n, start))
319+
295320
def collect(self) -> list[pa.RecordBatch]:
296321
"""Execute this :py:class:`DataFrame` and collect results into memory.
297322

python/tests/test_dataframe.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,28 @@ def test_limit_with_offset(df):
201201
assert len(result.column(1)) == 1
202202

203203

204+
def test_head(df):
    """head(1) keeps only the first row of every column."""
    # execute and take the first (and only) batch
    first_batch = df.head(1).collect()[0]

    expected_columns = ([1], [4], [8])
    for index, values in enumerate(expected_columns):
        assert first_batch.column(index) == pa.array(values)
213+
214+
215+
def test_tail(df):
    """tail(1) keeps only the last row of every column."""
    # execute and take the first (and only) batch
    last_batch = df.tail(1).collect()[0]

    expected_columns = ([3], [6], [8])
    for index, values in enumerate(expected_columns):
        assert last_batch.column(index) == pa.array(values)
224+
225+
204226
def test_with_column(df):
205227
df = df.with_column("c", column("a") + column("b"))
206228

0 commit comments

Comments
 (0)
0